File: /usr/src/linux/mm/filemap.c

1     /*
2      *	linux/mm/filemap.c
3      *
4      * Copyright (C) 1994-1999  Linus Torvalds
5      */
6     
7     /*
8      * This file handles the generic file mmap semantics used by
9      * most "normal" filesystems (but you don't /have/ to use this:
10      * the NFS filesystem used to do this differently, for example)
11      */
12     #include <linux/module.h>
13     #include <linux/slab.h>
14     #include <linux/shm.h>
15     #include <linux/mman.h>
16     #include <linux/locks.h>
17     #include <linux/pagemap.h>
18     #include <linux/swap.h>
19     #include <linux/smp_lock.h>
20     #include <linux/blkdev.h>
21     #include <linux/file.h>
22     #include <linux/swapctl.h>
23     #include <linux/init.h>
24     #include <linux/mm.h>
25     #include <linux/iobuf.h>
26     
27     #include <asm/pgalloc.h>
28     #include <asm/uaccess.h>
29     #include <asm/mman.h>
30     
31     #include <linux/highmem.h>
32     
33     /*
34      * Shared mappings implemented 30.11.1994. It's not fully working yet,
35      * though.
36      *
37      * Shared mappings now work. 15.8.1995  Bruno.
38      *
39      * finished 'unifying' the page and buffer cache and SMP-threaded the
40      * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41      *
42      * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
43      */
44     
       /* Count of pages in the page cache; adjusted as pages enter and leave the page hash. */
45     atomic_t page_cache_size = ATOMIC_INIT(0);
       /* Page hash parameters; presumably consumed by page_hash() - defined outside this file, TODO confirm. */
46     unsigned int page_hash_bits;
47     struct page **page_hash_table;
48     
       /* Spinlock taken in this file around page-hash and per-mapping page-list updates. */
49     spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
50     /*
51      * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
52      *       the pagemap_lru_lock held.
53      */
54     spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
55     
       /* Number of pages in one cluster: two to the power of page_cluster. */
56     #define CLUSTER_PAGES		(1 << page_cluster)
       /* Round page offset x down to a multiple of CLUSTER_PAGES. */
57     #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
58     
59     static void add_page_to_hash_queue(struct page * page, struct page **p)
60     {
61     	struct page *next = *p;
62     
63     	*p = page;
64     	page->next_hash = next;
65     	page->pprev_hash = p;
66     	if (next)
67     		next->pprev_hash = &page->next_hash;
68     	if (page->buffers)
69     		PAGE_BUG(page);
70     	atomic_inc(&page_cache_size);
71     }
72     
73     static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
74     {
75     	struct list_head *head = &mapping->clean_pages;
76     
77     	mapping->nrpages++;
78     	list_add(&page->list, head);
79     	page->mapping = mapping;
80     }
81     
82     static inline void remove_page_from_inode_queue(struct page * page)
83     {
84     	struct address_space * mapping = page->mapping;
85     
86     	mapping->nrpages--;
87     	list_del(&page->list);
88     	page->mapping = NULL;
89     }
90     
91     static inline void remove_page_from_hash_queue(struct page * page)
92     {
93     	struct page *next = page->next_hash;
94     	struct page **pprev = page->pprev_hash;
95     
96     	if (next)
97     		next->pprev_hash = pprev;
98     	*pprev = next;
99     	page->pprev_hash = NULL;
100     	atomic_dec(&page_cache_size);
101     }
102     
/*
 * Unlink a page from the page cache: it is removed from its mapping's
 * page list and from the page hash.  The page itself is not released
 * here.  A page that is still dirty triggers BUG().
 *
 * Caller has to make sure the page is locked and that nobody else uses
 * it - or that usage is safe.  No lock is taken here; the callers in
 * this file hold pagecache_lock.
 */
void __remove_inode_page(struct page *page)
{
	if (PageDirty(page))
		BUG();

	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
}
114     
115     void remove_inode_page(struct page *page)
116     {
117     	if (!PageLocked(page))
118     		PAGE_BUG(page);
119     
120     	spin_lock(&pagecache_lock);
121     	__remove_inode_page(page);
122     	spin_unlock(&pagecache_lock);
123     }
124     
125     static inline int sync_page(struct page *page)
126     {
127     	struct address_space *mapping = page->mapping;
128     
129     	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
130     		return mapping->a_ops->sync_page(page);
131     	return 0;
132     }
133     
134     /*
135      * Add a page to the dirty page list.
136      */
137     void __set_page_dirty(struct page *page)
138     {
139     	struct address_space *mapping = page->mapping;
140     
141     	spin_lock(&pagecache_lock);
142     	list_del(&page->list);
143     	list_add(&page->list, &mapping->dirty_pages);
144     	spin_unlock(&pagecache_lock);
145     
146     	if (mapping->host)
147     		mark_inode_dirty_pages(mapping->host);
148     }
149     
150     /**
151      * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
152      * @inode: the inode which pages we want to invalidate
153      *
154      * This function only removes the unlocked pages, if you want to
155      * remove all the pages of one inode, you must call truncate_inode_pages.
156      */
157     
158     void invalidate_inode_pages(struct inode * inode)
159     {
160     	struct list_head *head, *curr;
161     	struct page * page;
162     
163     	head = &inode->i_mapping->clean_pages;
164     
165     	spin_lock(&pagecache_lock);
166     	spin_lock(&pagemap_lru_lock);
167     	curr = head->next;
168     
169     	while (curr != head) {
170     		page = list_entry(curr, struct page, list);
171     		curr = curr->next;
172     
173     		/* We cannot invalidate something in use.. */
174     		if (page_count(page) != 1)
175     			continue;
176     
177     		/* ..or dirty.. */
178     		if (PageDirty(page))
179     			continue;
180     
181     		/* ..or locked */
182     		if (TryLockPage(page))
183     			continue;
184     
185     		__lru_cache_del(page);
186     		__remove_inode_page(page);
187     		UnlockPage(page);
188     		page_cache_release(page);
189     	}
190     
191     	spin_unlock(&pagemap_lru_lock);
192     	spin_unlock(&pagecache_lock);
193     }
194     
195     static inline void truncate_partial_page(struct page *page, unsigned partial)
196     {
197     	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
198     				
199     	if (page->buffers)
200     		block_flushpage(page, partial);
201     
202     }
203     
204     static void truncate_complete_page(struct page *page)
205     {
206     	/* Leave it on the LRU if it gets converted into anonymous buffers */
207     	if (!page->buffers || block_flushpage(page, 0))
208     		lru_cache_del(page);
209     
210     	/*
211     	 * We remove the page from the page cache _after_ we have
212     	 * destroyed all buffer-cache references to it. Otherwise some
213     	 * other process might think this inode page is not in the
214     	 * page cache and creates a buffer-cache alias to it causing
215     	 * all sorts of fun problems ...  
216     	 */
217     	ClearPageDirty(page);
218     	ClearPageUptodate(page);
219     	remove_inode_page(page);
220     	page_cache_release(page);
221     }
222     
223     static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
224     static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
225     {
226     	struct list_head *curr;
227     	struct page * page;
228     	int unlocked = 0;
229     
230      restart:
231     	curr = head->prev;
232     	while (curr != head) {
233     		unsigned long offset;
234     
235     		page = list_entry(curr, struct page, list);
236     		offset = page->index;
237     
238     		/* Is one of the pages to truncate? */
239     		if ((offset >= start) || (*partial && (offset + 1) == start)) {
240     			int failed;
241     
242     			page_cache_get(page);
243     			failed = TryLockPage(page);
244     
245     			list_del(head);
246     			if (!failed)
247     				/* Restart after this page */
248     				list_add_tail(head, curr);
249     			else
250     				/* Restart on this page */
251     				list_add(head, curr);
252     
253     			spin_unlock(&pagecache_lock);
254     			unlocked = 1;
255     
256      			if (!failed) {
257     				if (*partial && (offset + 1) == start) {
258     					truncate_partial_page(page, *partial);
259     					*partial = 0;
260     				} else 
261     					truncate_complete_page(page);
262     
263     				UnlockPage(page);
264     			} else
265      				wait_on_page(page);
266     
267     			page_cache_release(page);
268     
269     			if (current->need_resched) {
270     				__set_current_state(TASK_RUNNING);
271     				schedule();
272     			}
273     
274     			spin_lock(&pagecache_lock);
275     			goto restart;
276     		}
277     		curr = curr->prev;
278     	}
279     	return unlocked;
280     }
281     
282     
283     /**
284      * truncate_inode_pages - truncate *all* the pages from an offset
285      * @mapping: mapping to truncate
286      * @lstart: offset from with to truncate
287      *
288      * Truncate the page cache at a set offset, removing the pages
289      * that are beyond that offset (and zeroing out partial pages).
290      * If any page is locked we wait for it to become unlocked.
291      */
292     void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 
293     {
294     	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
295     	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
296     	int unlocked;
297     
298     	spin_lock(&pagecache_lock);
299     	do {
300     		unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
301     		unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
302     		unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
303     	} while (unlocked);
304     	/* Traversed all three lists without dropping the lock */
305     	spin_unlock(&pagecache_lock);
306     }
307     
308     static inline int invalidate_this_page2(struct page * page,
309     					struct list_head * curr,
310     					struct list_head * head)
311     {
312     	int unlocked = 1;
313     
314     	/*
315     	 * The page is locked and we hold the pagecache_lock as well
316     	 * so both page_count(page) and page->buffers stays constant here.
317     	 */
318     	if (page_count(page) == 1 + !!page->buffers) {
319     		/* Restart after this page */
320     		list_del(head);
321     		list_add_tail(head, curr);
322     
323     		page_cache_get(page);
324     		spin_unlock(&pagecache_lock);
325     		truncate_complete_page(page);
326     	} else {
327     		if (page->buffers) {
328     			/* Restart after this page */
329     			list_del(head);
330     			list_add_tail(head, curr);
331     
332     			page_cache_get(page);
333     			spin_unlock(&pagecache_lock);
334     			block_invalidate_page(page);
335     		} else
336     			unlocked = 0;
337     
338     		ClearPageDirty(page);
339     		ClearPageUptodate(page);
340     	}
341     
342     	return unlocked;
343     }
344     
345     static int FASTCALL(invalidate_list_pages2(struct list_head *));
346     static int invalidate_list_pages2(struct list_head *head)
347     {
348     	struct list_head *curr;
349     	struct page * page;
350     	int unlocked = 0;
351     
352      restart:
353     	curr = head->prev;
354     	while (curr != head) {
355     		page = list_entry(curr, struct page, list);
356     
357     		if (!TryLockPage(page)) {
358     			int __unlocked;
359     
360     			__unlocked = invalidate_this_page2(page, curr, head);
361     			UnlockPage(page);
362     			unlocked |= __unlocked;
363     			if (!__unlocked) {
364     				curr = curr->prev;
365     				continue;
366     			}
367     		} else {
368     			/* Restart on this page */
369     			list_del(head);
370     			list_add(head, curr);
371     
372     			page_cache_get(page);
373     			spin_unlock(&pagecache_lock);
374     			unlocked = 1;
375     			wait_on_page(page);
376     		}
377     
378     		page_cache_release(page);
379     		if (current->need_resched) {
380     			__set_current_state(TASK_RUNNING);
381     			schedule();
382     		}
383     
384     		spin_lock(&pagecache_lock);
385     		goto restart;
386     	}
387     	return unlocked;
388     }
389     
390     /**
391      * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
392      * free the pages because they're mapped.
393      * @mapping: the address_space which pages we want to invalidate
394      */
395     void invalidate_inode_pages2(struct address_space * mapping)
396     {
397     	int unlocked;
398     
399     	spin_lock(&pagecache_lock);
400     	do {
401     		unlocked = invalidate_list_pages2(&mapping->clean_pages);
402     		unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
403     		unlocked |= invalidate_list_pages2(&mapping->locked_pages);
404     	} while (unlocked);
405     	spin_unlock(&pagecache_lock);
406     }
407     
408     static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
409     {
410     	goto inside;
411     
412     	for (;;) {
413     		page = page->next_hash;
414     inside:
415     		if (!page)
416     			goto not_found;
417     		if (page->mapping != mapping)
418     			continue;
419     		if (page->index == offset)
420     			break;
421     	}
422     
423     not_found:
424     	return page;
425     }
426     
427     /*
428      * By the time this is called, the page is locked and
429      * we don't have to worry about any races any more.
430      *
431      * Start the IO..
432      */
433     static int writeout_one_page(struct page *page)
434     {
435     	struct buffer_head *bh, *head = page->buffers;
436     
437     	bh = head;
438     	do {
439     		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
440     			continue;
441     
442     		bh->b_flushtime = jiffies;
443     		ll_rw_block(WRITE, 1, &bh);	
444     	} while ((bh = bh->b_this_page) != head);
445     	return 0;
446     }
447     
448     int waitfor_one_page(struct page *page)
449     {
450     	int error = 0;
451     	struct buffer_head *bh, *head = page->buffers;
452     
453     	bh = head;
454     	do {
455     		wait_on_buffer(bh);
456     		if (buffer_req(bh) && !buffer_uptodate(bh))
457     			error = -EIO;
458     	} while ((bh = bh->b_this_page) != head);
459     	return error;
460     }
461     
462     static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
463     {
464     	struct list_head *curr;
465     	struct page *page;
466     	int retval = 0;
467     
468     	spin_lock(&pagecache_lock);
469     	curr = head->next;
470     	while (curr != head) {
471     		page = list_entry(curr, struct page, list);
472     		curr = curr->next;
473     		if (!page->buffers)
474     			continue;
475     		if (page->index >= end)
476     			continue;
477     		if (page->index < start)
478     			continue;
479     
480     		page_cache_get(page);
481     		spin_unlock(&pagecache_lock);
482     		lock_page(page);
483     
484     		/* The buffers could have been free'd while we waited for the page lock */
485     		if (page->buffers)
486     			retval |= fn(page);
487     
488     		UnlockPage(page);
489     		spin_lock(&pagecache_lock);
490     		curr = page->list.next;
491     		page_cache_release(page);
492     	}
493     	spin_unlock(&pagecache_lock);
494     
495     	return retval;
496     }
497     
498     /*
499      * Two-stage data sync: first start the IO, then go back and
500      * collect the information..
501      */
502     int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
503     {
504     	int retval;
505     
506     	/* writeout dirty buffers on pages from both clean and dirty lists */
507     	retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
508     	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
509     	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
510     
511     	/* now wait for locked buffers on pages from both clean and dirty lists */
512     	retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
513     	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
514     	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
515     
516     	return retval;
517     }
518     
519     /**
520      *      filemap_fdatasync - walk the list of dirty pages of the given address space
521      *     	and writepage() all of them.
522      * 
523      *      @mapping: address space structure to write
524      *
525      */
526     void filemap_fdatasync(struct address_space * mapping)
527     {
528     	int (*writepage)(struct page *) = mapping->a_ops->writepage;
529     
530     	spin_lock(&pagecache_lock);
531     
532             while (!list_empty(&mapping->dirty_pages)) {
533     		struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);
534     
535     		list_del(&page->list);
536     		list_add(&page->list, &mapping->locked_pages);
537     
538     		if (!PageDirty(page))
539     			continue;
540     
541     		page_cache_get(page);
542     		spin_unlock(&pagecache_lock);
543     
544     		lock_page(page);
545     
546     		if (PageDirty(page)) {
547     			ClearPageDirty(page);
548     			writepage(page);
549     		} else
550     			UnlockPage(page);
551     
552     		page_cache_release(page);
553     		spin_lock(&pagecache_lock);
554     	}
555     	spin_unlock(&pagecache_lock);
556     }
557     
558     /**
559      *      filemap_fdatawait - walk the list of locked pages of the given address space
560      *     	and wait for all of them.
561      * 
562      *      @mapping: address space structure to wait for
563      *
564      */
565     void filemap_fdatawait(struct address_space * mapping)
566     {
567     	spin_lock(&pagecache_lock);
568     
569             while (!list_empty(&mapping->locked_pages)) {
570     		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
571     
572     		list_del(&page->list);
573     		list_add(&page->list, &mapping->clean_pages);
574     
575     		if (!PageLocked(page))
576     			continue;
577     
578     		page_cache_get(page);
579     		spin_unlock(&pagecache_lock);
580     
581     		___wait_on_page(page);
582     
583     		page_cache_release(page);
584     		spin_lock(&pagecache_lock);
585     	}
586     	spin_unlock(&pagecache_lock);
587     }
588     
589     /*
590      * Add a page to the inode page cache.
591      *
592      * The caller must have locked the page and 
593      * set all the page flags correctly..
594      */
595     void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
596     {
597     	if (!PageLocked(page))
598     		BUG();
599     
600     	page->index = index;
601     	page_cache_get(page);
602     	spin_lock(&pagecache_lock);
603     	add_page_to_inode_queue(mapping, page);
604     	add_page_to_hash_queue(page, page_hash(mapping, index));
605     	lru_cache_add(page);
606     	spin_unlock(&pagecache_lock);
607     }
608     
609     /*
610      * This adds a page to the page cache, starting out as locked,
611      * owned by us, but unreferenced, not uptodate and with no errors.
612      */
613     static inline void __add_to_page_cache(struct page * page,
614     	struct address_space *mapping, unsigned long offset,
615     	struct page **hash)
616     {
617     	unsigned long flags;
618     
619     	if (PageLocked(page))
620     		BUG();
621     
622     	flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
623     	page->flags = flags | (1 << PG_locked);
624     	page_cache_get(page);
625     	page->index = offset;
626     	add_page_to_inode_queue(mapping, page);
627     	add_page_to_hash_queue(page, hash);
628     	lru_cache_add(page);
629     }
630     
631     void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
632     {
633     	spin_lock(&pagecache_lock);
634     	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
635     	spin_unlock(&pagecache_lock);
636     }
637     
638     static int add_to_page_cache_unique(struct page * page,
639     	struct address_space *mapping, unsigned long offset,
640     	struct page **hash)
641     {
642     	int err;
643     	struct page *alias;
644     
645     	spin_lock(&pagecache_lock);
646     	alias = __find_page_nolock(mapping, offset, *hash);
647     
648     	err = 1;
649     	if (!alias) {
650     		__add_to_page_cache(page,mapping,offset,hash);
651     		err = 0;
652     	}
653     
654     	spin_unlock(&pagecache_lock);
655     	return err;
656     }
657     
658     /*
659      * This adds the requested page to the page cache if it isn't already there,
660      * and schedules an I/O to read in its contents from disk.
661      */
662     static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
663     static int page_cache_read(struct file * file, unsigned long offset)
664     {
665     	struct inode *inode = file->f_dentry->d_inode;
666     	struct address_space *mapping = inode->i_mapping;
667     	struct page **hash = page_hash(mapping, offset);
668     	struct page *page; 
669     
670     	spin_lock(&pagecache_lock);
671     	page = __find_page_nolock(mapping, offset, *hash);
672     	spin_unlock(&pagecache_lock);
673     	if (page)
674     		return 0;
675     
676     	page = page_cache_alloc(mapping);
677     	if (!page)
678     		return -ENOMEM;
679     
680     	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
681     		int error = mapping->a_ops->readpage(file, page);
682     		page_cache_release(page);
683     		return error;
684     	}
685     	/*
686     	 * We arrive here in the unlikely event that someone 
687     	 * raced with us and added our page to the cache first.
688     	 */
689     	page_cache_release(page);
690     	return 0;
691     }
692     
693     /*
694      * Read in an entire cluster at once.  A cluster is usually a 64k-
695      * aligned block that includes the page requested in "offset."
696      */
697     static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
698     					     unsigned long filesize));
699     static int read_cluster_nonblocking(struct file * file, unsigned long offset,
700     	unsigned long filesize)
701     {
702     	unsigned long pages = CLUSTER_PAGES;
703     
704     	offset = CLUSTER_OFFSET(offset);
705     	while ((pages-- > 0) && (offset < filesize)) {
706     		int error = page_cache_read(file, offset);
707     		if (error < 0)
708     			return error;
709     		offset ++;
710     	}
711     
712     	return 0;
713     }
714     
715     /* 
716      * Wait for a page to get unlocked.
717      *
718      * This must be called with the caller "holding" the page,
719      * ie with increased "page->count" so that the page won't
720      * go away during the wait..
721      */
722     void ___wait_on_page(struct page *page)
723     {
724     	struct task_struct *tsk = current;
725     	DECLARE_WAITQUEUE(wait, tsk);
726     
727     	add_wait_queue(&page->wait, &wait);
728     	do {
729     		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
730     		if (!PageLocked(page))
731     			break;
732     		sync_page(page);
733     		schedule();
734     	} while (PageLocked(page));
735     	tsk->state = TASK_RUNNING;
736     	remove_wait_queue(&page->wait, &wait);
737     }
738     
739     /*
740      * Get a lock on the page, assuming we need to sleep
741      * to get it..
742      */
743     static void __lock_page(struct page *page)
744     {
745     	struct task_struct *tsk = current;
746     	DECLARE_WAITQUEUE(wait, tsk);
747     
748     	add_wait_queue_exclusive(&page->wait, &wait);
749     	for (;;) {
750     		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
751     		if (PageLocked(page)) {
752     			sync_page(page);
753     			schedule();
754     		}
755     		if (!TryLockPage(page))
756     			break;
757     	}
758     	tsk->state = TASK_RUNNING;
759     	remove_wait_queue(&page->wait, &wait);
760     }
761     	
762     
/*
 * Take the page lock.  The uncontended case is a single TryLockPage();
 * only when that reports the page as already locked do we fall back
 * to the sleeping path in __lock_page().
 */
void lock_page(struct page *page)
{
	if (!TryLockPage(page))
		return;

	__lock_page(page);
}
772     
773     /*
774      * a rather lightweight function, finding and getting a reference to a
775      * hashed page atomically.
776      */
777     struct page * __find_get_page(struct address_space *mapping,
778     			      unsigned long offset, struct page **hash)
779     {
780     	struct page *page;
781     
782     	/*
783     	 * We scan the hash list read-only. Addition to and removal from
784     	 * the hash-list needs a held write-lock.
785     	 */
786     	spin_lock(&pagecache_lock);
787     	page = __find_page_nolock(mapping, offset, *hash);
788     	if (page)
789     		page_cache_get(page);
790     	spin_unlock(&pagecache_lock);
791     	return page;
792     }
793     
794     /*
795      * Same as the above, but lock the page too, verifying that
796      * it's still valid once we own it.
797      */
798     struct page * __find_lock_page (struct address_space *mapping,
799     				unsigned long offset, struct page **hash)
800     {
801     	struct page *page;
802     
803     	/*
804     	 * We scan the hash list read-only. Addition to and removal from
805     	 * the hash-list needs a held write-lock.
806     	 */
807     repeat:
808     	spin_lock(&pagecache_lock);
809     	page = __find_page_nolock(mapping, offset, *hash);
810     	if (page) {
811     		page_cache_get(page);
812     		spin_unlock(&pagecache_lock);
813     
814     		lock_page(page);
815     
816     		/* Is the page still hashed? Ok, good.. */
817     		if (page->mapping == mapping && page->index == offset)
818     			return page;
819     
820     		/* Nope: we raced. Release and try again.. */
821     		UnlockPage(page);
822     		page_cache_release(page);
823     		goto repeat;
824     	}
825     	spin_unlock(&pagecache_lock);
826     	return NULL;
827     }
828     
829     #if 0
830     #define PROFILE_READAHEAD
831     #define DEBUG_READAHEAD
832     #endif
833     
834     /*
835      * Read-ahead profiling information
836      * --------------------------------
837      * Every PROFILE_MAXREADCOUNT, the following information is written 
838      * to the syslog:
839      *   Percentage of asynchronous read-ahead.
840      *   Average of read-ahead fields context value.
841      * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
842      * to the syslog.
843      */
844     
#ifdef PROFILE_READAHEAD

/* Number of read-ahead calls accumulated before the averages are printed. */
#define PROFILE_MAXREADCOUNT 1000

/* Running totals since the last report. */
static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

/*
 * Account one read-ahead call.
 *
 * @async: non-zero if the read-ahead was asynchronous
 * @filp:  file whose read-ahead context fields are added to the totals
 *
 * Once more than PROFILE_MAXREADCOUNT calls have been accumulated, the
 * averages are printed and the totals are reset, with interrupts
 * disabled.  The threshold is tested again after cli() because the
 * totals may have been reset in the meantime.
 *
 * All counters are unsigned long and are printed with %lu.  The
 * snapshot values are cast explicitly so that each argument matches its
 * conversion whatever the declared type of the struct file fields
 * (declared outside this file - NOTE(review): confirm their types).
 */
static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%lu, len=%lu, win=%lu, async=%lu%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%lu, len=%lu, win=%lu, raend=%Ld\n",
			(unsigned long) filp->f_ramax,
			(unsigned long) filp->f_ralen,
			(unsigned long) filp->f_rawin,
			(long long) filp->f_raend);
#endif

		total_reada	= 0;
		total_async	= 0;
		total_ramax	= 0;
		total_ralen	= 0;
		total_rawin	= 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */
895     
896     /*
897      * Read-ahead context:
898      * -------------------
899      * The read ahead context fields of the "struct file" are the following:
900      * - f_raend : index of the first page after the last page we tried to
901      *	       read ahead.
902      * - f_ramax : current read-ahead maximum size.
903      * - f_ralen : length of the current IO read block we tried to read-ahead.
904      * - f_rawin : length of the current read-ahead window.
905      *		if last read-ahead was synchronous then
906      *			f_rawin = f_ralen
907      *		otherwise (was asynchronous)
908      *			f_rawin = previous value of f_ralen + f_ralen
909      *
910      * Read-ahead limits:
911      * ------------------
912      * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
913      * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
914      *
915      * Synchronous read-ahead benefits:
916      * --------------------------------
917      * Using a reasonable IO transfer length for peripheral devices increases
918      * system performance.
919      * Reasonable means, in this context, not too large but not too small.
920      * The actual maximum value is:
921      *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
922      *      and 32K if defined (4K page size assumed).
923      *
924      * Asynchronous read-ahead benefits:
925      * ---------------------------------
926      * Overlapping the next read request with user process execution increases
927      * system performance.
928      *
929      * Read-ahead risks:
930      * -----------------
931      * We have to guess which further data are needed by the user process.
932      * If these data are often not really needed, it's bad for system 
933      * performances.
934      * However, we know that files are often accessed sequentially by 
935      * application programs and it seems that it is possible to have some good 
936      * strategy in that guessing.
937      * We only try to read-ahead files that seems to be read sequentially.
938      *
939      * Asynchronous read-ahead risks:
940      * ------------------------------
941      * In order to maximize overlapping, we must start some asynchronous read 
942      * request from the device, as soon as possible.
943      * We must be very careful about:
944      * - The number of effective pending IO read requests.
945      *   ONE seems to be the only reasonable value.
946      * - The total memory pool usage for the file access stream.
947      *   This maximum memory usage is implicitly 2 IO read chunks:
948      *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
949      *   64k if defined (4K page size assumed).
950      */
951     
952     static inline int get_max_readahead(struct inode * inode)
953     {
954     	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
955     		return MAX_READAHEAD;
956     	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
957     }
958     
959     static inline unsigned long calc_end_index(struct inode * inode)
960     {
961     	unsigned long end_index;
962     
963     	if (!S_ISBLK(inode->i_mode))
964     		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
965     	else
966     		end_index = buffered_blk_size(inode->i_rdev) >> (PAGE_CACHE_SHIFT - BLOCK_SIZE_BITS);
967     
968     	return end_index;
969     }
970     
971     static inline loff_t calc_rsize(struct inode * inode)
972     {
973     	loff_t rsize;
974     
975     	if (!S_ISBLK(inode->i_mode))
976     		rsize = inode->i_size;
977     	else
978     		rsize = (loff_t) buffered_blk_size(inode->i_rdev) << BLOCK_SIZE_BITS;
979     
980     	return rsize;
981     }
982     
        /*
         * Issue read-ahead for the file behind @filp around @page and update the
         * per-file read-ahead state (f_raend, f_ralen, f_rawin, f_ramax).
         *
         * @reada_ok: non-zero allows the asynchronous branch to run.  It is set
         *            to 2 locally when that branch decides to read ahead; the
         *            value is only consumed by the PROFILE_READAHEAD hook.
         * @filp:     open file whose read-ahead context is read and updated.
         * @inode:    inode used for the end-of-data index and the per-device
         *            read-ahead cap.
         * @page:     page being read; whether it is locked selects the branch,
         *            and page->index is the current position.
         *
         * Pages are requested with page_cache_read() starting one past the
         * computed raend.  The loop stops at end_index, after max_ahead attempts,
         * or on the first negative return.
         */
983     static void generic_file_readahead(int reada_ok,
984     	struct file * filp, struct inode * inode,
985     	struct page * page)
986     {
987     	unsigned long end_index;
988     	unsigned long index = page->index;
989     	unsigned long max_ahead, ahead;
990     	unsigned long raend;
991     	int max_readahead = get_max_readahead(inode);	/* local cap; shadows the max_readahead[] table */
992     
993     	end_index = calc_end_index(inode);
994     
995     	raend = filp->f_raend;
996     	max_ahead = 0;
997     
998     /*
999      * The current page is locked.
1000      * If the current position is inside the previous read IO request, do not
1001      * try to reread previously read ahead pages.
1002      * Otherwise decide whether or not to read ahead some pages synchronously.
1003      * If we are not going to read ahead, set the read ahead context for this 
1004      * page only.
1005      */
1006     	if (PageLocked(page)) {
1007     		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1008     			raend = index;
1009     			if (raend < end_index)
1010     				max_ahead = filp->f_ramax;
1011     			filp->f_rawin = 0;
1012     			filp->f_ralen = 1;
1013     			if (!max_ahead) {
1014     				filp->f_raend  = index + filp->f_ralen;
1015     				filp->f_rawin += filp->f_ralen;
1016     			}
1017     		}
1018     	}
1019     /*
1020      * The current page is not locked.
1021      * If we were reading ahead and,
1022      * if the current max read ahead size is not zero and,
1023      * if the current position is inside the last read-ahead IO request,
1024      *   it is the moment to try to read ahead asynchronously.
1025      * We will later force unplug device in order to force asynchronous read IO.
1026      */
1027     	else if (reada_ok && filp->f_ramax && raend >= 1 &&
1028     		 index <= raend && index + filp->f_ralen >= raend) {
1029     /*
1030      * Add ONE page to max_ahead in order to try to have about the same IO max size
1031      * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1032      * Compute the position of the last page we have tried to read in order to 
1033      * begin to read ahead just at the next page.
1034      */
1035     		raend -= 1;
1036     		if (raend < end_index)
1037     			max_ahead = filp->f_ramax + 1;
1038     
1039     		if (max_ahead) {
1040     			filp->f_rawin = filp->f_ralen;
1041     			filp->f_ralen = 0;
1042     			reada_ok      = 2;
1043     		}
1044     	}
1045     /*
1046      * Try to read ahead pages.
1047      * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
1048      * scheduler, will work enough for us to avoid too bad actual IO requests.
1049      *
1050      * Note that "ahead" is incremented before the bounds check and before the
         * read is attempted, so it counts attempts, including the one that
         * stopped the loop.
         */
1050     	ahead = 0;
1051     	while (ahead < max_ahead) {
1052     		ahead ++;
1053     		if ((raend + ahead) >= end_index)
1054     			break;
1055     		if (page_cache_read(filp, raend + ahead) < 0)
1056     			break;
1057     	}
1058     /*
1059      * If we tried to read ahead some pages,
1060      * If we tried to read ahead asynchronously,
1061      *   Try to force unplug of the device in order to start an asynchronous
1062      *   read IO request.
1063      * Update the read-ahead context.
1064      * Store the length of the current read-ahead window.
1065      * Double the current max read ahead size.
1066      *   That heuristic avoids doing large IO for files that are not really
1067      *   accessed sequentially.
1068      */
1069     	if (ahead) {
1070     		filp->f_ralen += ahead;
1071     		filp->f_rawin += filp->f_ralen;
1072     		filp->f_raend = raend + ahead + 1;
1073     
1074     		filp->f_ramax += filp->f_ramax;
1075     
1076     		if (filp->f_ramax > max_readahead)
1077     			filp->f_ramax = max_readahead;
1078     
1079     #ifdef PROFILE_READAHEAD
1080     		profile_readahead((reada_ok == 2), filp);
1081     #endif
1082     	}
1083     
1084     	return;
1085     }
1086     
/*
 * Record an access to @page.
 *
 * A page that is not yet on the active queue but already carries the
 * referenced bit is activated and has that bit dropped.  Any other page
 * simply gets the referenced bit set, so that a later access can
 * promote it.
 */
void mark_page_accessed(struct page *page)
{
	/* First sighting, or already active: just remember the reference. */
	if (PageActive(page) || !PageReferenced(page)) {
		SetPageReferenced(page);
		return;
	}

	/* Inactive and seen before: promote it and start counting afresh. */
	activate_page(page);
	ClearPageReferenced(page);
}
1106     
1107     /*
1108      * This is a generic file read routine, and uses the
1109      * mapping->a_ops->readpage() function for the actual low-level
1110      * stuff.
1111      *
1112      * This is really ugly. But the goto's actually try to clarify some
1113      * of the logic when it comes to error handling etc.
          *
          * @filp:  file being read; its read-ahead fields (f_raend, f_ralen,
          *         f_ramax, f_rawin) and f_reada are updated here.
          * @ppos:  in/out byte position; written back with the position reached.
          * @desc:  read descriptor; desc->count is consulted to size the
          *         read-ahead and to decide whether to continue, and
          *         desc->error receives -ENOMEM or the read error on failure.
          * @actor: called once per up-to-date page with (desc, page, offset, nr)
          *         and returns how many bytes it consumed; a short return ends
          *         the loop.
          *
          * Nothing is returned; results are reported through @desc and @ppos.
1114      */
1115     void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1116     {
1117     	struct inode *inode = filp->f_dentry->d_inode;
1118     	struct address_space *mapping = inode->i_mapping;
1119     	unsigned long index, offset;
1120     	struct page *cached_page;
1121     	int reada_ok;
1122     	int error;
1123     	int max_readahead = get_max_readahead(inode);	/* local cap; shadows the max_readahead[] table */
1124     
1125     	cached_page = NULL;
1126     	index = *ppos >> PAGE_CACHE_SHIFT;	/* page index of the start position */
1127     	offset = *ppos & ~PAGE_CACHE_MASK;	/* byte offset inside that page */
1128     
1129     /*
1130      * If the current position is outside the previous read-ahead window, 
1131      * we reset the current read-ahead context and set read ahead max to zero
1132      * (will be set to just needed value later),
1133      * otherwise, we assume that the file accesses are sequential enough to
1134      * continue read-ahead.
1135      */
1136     	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1137     		reada_ok = 0;
1138     		filp->f_raend = 0;
1139     		filp->f_ralen = 0;
1140     		filp->f_ramax = 0;
1141     		filp->f_rawin = 0;
1142     	} else {
1143     		reada_ok = 1;
1144     	}
1145     /*
1146      * Adjust the current value of read-ahead max.
1147      * If the read operation stays in the first half page, force no readahead.
1148      * Otherwise try to increase read ahead max just enough to do the read request.
1149      * Then, at least MIN_READAHEAD if read ahead is ok,
1150      * and at most the per-device maximum in all cases.
1151      */
1152     	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1153     		filp->f_ramax = 0;
1154     	} else {
1155     		unsigned long needed;
1156     
1157     		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1158     
1159     		if (filp->f_ramax < needed)
1160     			filp->f_ramax = needed;
1161     
1162     		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1163     				filp->f_ramax = MIN_READAHEAD;
1164     		if (filp->f_ramax > max_readahead)
1165     			filp->f_ramax = max_readahead;
1166     	}
1167     
1168     	for (;;) {
1169     		struct page *page, **hash;
1170     		unsigned long end_index, nr, ret;
1171     
         		/* Re-evaluated on every pass: the size may change while we read. */
1172     		end_index = calc_end_index(inode);
1173     			
1174     		if (index > end_index)
1175     			break;
1176     		nr = PAGE_CACHE_SIZE;
1177     		if (index == end_index) {
         			/* Last page: only the bytes below the end of data are valid. */
1178     			nr = calc_rsize(inode) & ~PAGE_CACHE_MASK;
1179     			if (nr <= offset)
1180     				break;
1181     		}
1182     
1183     		nr = nr - offset;	/* bytes available in this page from offset on */
1184     
1185     		/*
1186     		 * Try to find the data in the page cache..
1187     		 */
1188     		hash = page_hash(mapping, index);
1189     
1190     		spin_lock(&pagecache_lock);
1191     		page = __find_page_nolock(mapping, index, *hash);
1192     		if (!page)
1193     			goto no_cached_page;
1194     found_page:
         		/* Reached with pagecache_lock held; take our reference before dropping it. */
1195     		page_cache_get(page);
1196     		spin_unlock(&pagecache_lock);
1197     
1198     		if (!Page_Uptodate(page))
1199     			goto page_not_up_to_date;
1200     		generic_file_readahead(reada_ok, filp, inode, page);
1201     page_ok:
1202     		/* If users can be writing to this page using arbitrary
1203     		 * virtual addresses, take care about potential aliasing
1204     		 * before reading the page on the kernel side.
1205     		 */
1206     		if (mapping->i_mmap_shared != NULL)
1207     			flush_dcache_page(page);
1208     
1209     		/*
1210     		 * Ok, we have the page, and it's up-to-date, so
1211     		 * now we can copy it to user space...
1212     		 *
1213     		 * The actor routine returns how many bytes were actually used..
1214     		 * NOTE! This may not be the same as how much of a user buffer
1215     		 * we filled up (we may be padding etc), so we can only update
1216     		 * "pos" here (the actor routine has to update the user buffer
1217     		 * pointers and the remaining count).
1218     		 */
1219     		ret = actor(desc, page, offset, nr);
1220     		offset += ret;
1221     		index += offset >> PAGE_CACHE_SHIFT;	/* advance to the next page when offset crossed the page end */
1222     		offset &= ~PAGE_CACHE_MASK;
1223     
1224     		mark_page_accessed(page);
1225     		page_cache_release(page);
         		/* Keep going only if the actor took everything offered and still wants more. */
1226     		if (ret == nr && desc->count)
1227     			continue;
1228     		break;
1229     
1230     /*
1231      * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1232      */
1233     page_not_up_to_date:
1234     		generic_file_readahead(reada_ok, filp, inode, page);
1235     
1236     		if (Page_Uptodate(page))
1237     			goto page_ok;
1238     
1239     		/* Get exclusive access to the page ... */
1240     		lock_page(page);
1241     
1242     		/* Did it get unhashed before we got the lock? */
1243     		if (!page->mapping) {
1244     			UnlockPage(page);
1245     			page_cache_release(page);
1246     			continue;	/* look the same index up again */
1247     		}
1248     
1249     		/* Did somebody else fill it already? */
1250     		if (Page_Uptodate(page)) {
1251     			UnlockPage(page);
1252     			goto page_ok;
1253     		}
1254     
1255     readpage:
1256     		/* ... and start the actual read. The read will unlock the page. */
1257     		error = mapping->a_ops->readpage(filp, page);
1258     
1259     		if (!error) {
1260     			if (Page_Uptodate(page))
1261     				goto page_ok;
1262     
1263     			/* Again, try some read-ahead while waiting for the page to finish.. */
1264     			generic_file_readahead(reada_ok, filp, inode, page);
1265     			wait_on_page(page);
1266     			if (Page_Uptodate(page))
1267     				goto page_ok;
         			/* readpage reported success but the page never became up to date. */
1268     			error = -EIO;
1269     		}
1270     
1271     		/* UHHUH! A synchronous read error occurred. Report it */
1272     		desc->error = error;
1273     		page_cache_release(page);
1274     		break;
1275     
1276     no_cached_page:
1277     		/*
1278     		 * Ok, it wasn't cached, so we need to create a new
1279     		 * page..
1280     		 *
1281     		 * We get here with the page cache lock held.
1282     		 */
1283     		if (!cached_page) {
         			/* The lock is dropped around the allocation and retaken below. */
1284     			spin_unlock(&pagecache_lock);
1285     			cached_page = page_cache_alloc(mapping);
1286     			if (!cached_page) {
1287     				desc->error = -ENOMEM;
1288     				break;
1289     			}
1290     
1291     			/*
1292     			 * Somebody may have added the page while we
1293     			 * dropped the page cache lock. Check for that.
1294     			 */
1295     			spin_lock(&pagecache_lock);
1296     			page = __find_page_nolock(mapping, index, *hash);
1297     			if (page)
1298     				goto found_page;	/* cached_page is kept for a later miss or freed at exit */
1299     		}
1300     
1301     		/*
1302     		 * Ok, add the new page to the hash-queues...
1303     		 */
1304     		page = cached_page;
1305     		__add_to_page_cache(page, mapping, index, hash);
1306     		spin_unlock(&pagecache_lock);
1307     		cached_page = NULL;
1308     
1309     		goto readpage;
1310     	}
1311     
         	/* Publish the position reached and release a spare page that was never inserted. */
1312     	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1313     	filp->f_reada = 1;
1314     	if (cached_page)
1315     		page_cache_release(cached_page);
1316     	UPDATE_ATIME(inode);
1317     }
1318     
         /*
          * Transfer @count bytes between the user buffer @buf and the file at
          * byte position @offset through mapping->a_ops->direct_IO().
          *
          * @rw:     passed to map_user_kiobuf() and direct_IO(); when it is READ
          *          the pages written into are marked dirty.
          * @filp:   file to operate on; its preallocated f_iobuf is used when
          *          bit 0 of f_iobuf_lock can be taken, otherwise a temporary
          *          kiovec is allocated.
          * @offset: must be a multiple of the block size, as must @count.
          *
          * Cached data is flushed first (filemap_fdatasync,
          * fsync_inode_data_buffers, filemap_fdatawait).  The transfer is then
          * done in chunks of at most KIO_MAX_ATOMIC_IO << 10 bytes.
          *
          * Returns the number of bytes transferred if any progress was made,
          * otherwise 0 or a negative error: -EINVAL for misaligned arguments or
          * a missing direct_IO operation, or whatever alloc_kiovec(),
          * fsync_inode_data_buffers(), map_user_kiobuf() or direct_IO() returned.
          */
1319     static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1320     {
1321     	ssize_t retval;
1322     	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
1323     	struct kiobuf * iobuf;
1324     	struct inode * inode = filp->f_dentry->d_inode;
1325     	struct address_space * mapping = inode->i_mapping;
1326     
1327     	new_iobuf = 0;
1328     	iobuf = filp->f_iobuf;
1329     	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1330     		/*
1331     		 * A parallel read/write is using the preallocated iobuf
1332     		 * so just run slow and allocate a new one.
1333     		 */
1334     		retval = alloc_kiovec(1, &iobuf);
1335     		if (retval)
1336     			goto out;	/* we own neither the lock bit nor an iobuf: nothing to undo */
1337     		new_iobuf = 1;
1338     	}
1339     
1340     	if (!S_ISBLK(inode->i_mode)) {
1341     		blocksize = inode->i_sb->s_blocksize;
1342     		blocksize_bits = inode->i_sb->s_blocksize_bits;
1343     	} else {
1344     		blocksize = BUFFERED_BLOCKSIZE;
1345     		blocksize_bits = BUFFERED_BLOCKSIZE_BITS;
1346     	}
1347     	blocksize_mask = blocksize - 1;
1348     	chunk_size = KIO_MAX_ATOMIC_IO << 10;	/* presumably KIO_MAX_ATOMIC_IO is in KB - TODO confirm */
1349     
1350     	retval = -EINVAL;
1351     	if ((offset & blocksize_mask) || (count & blocksize_mask))
1352     		goto out_free;
1353     	if (!mapping->a_ops->direct_IO)
1354     		goto out_free;
1355     
1356     	/*
1357     	 * Flush to disk exclusively the _data_, metadata must remain
1358     	 * completely asynchronous or performance will go to /dev/null.
1359     	 */
1360     	filemap_fdatasync(mapping);
1361     	retval = fsync_inode_data_buffers(inode);
1362     	filemap_fdatawait(mapping);
1363     	if (retval < 0)
1364     		goto out_free;
1365     
1366     	progress = retval = 0;
1367     	while (count > 0) {
1368     		iosize = count;
1369     		if (iosize > chunk_size)
1370     			iosize = chunk_size;
1371     
1372     		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1373     		if (retval)
1374     			break;
1375     
1376     		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1377     
1378     		if (rw == READ && retval > 0)
1379     			mark_dirty_kiobuf(iobuf, retval);
1380     		
1381     		if (retval >= 0) {
1382     			count -= retval;
1383     			buf += retval;
1384     			progress += retval;
1385     		}
1386     
1387     		unmap_kiobuf(iobuf);
1388     
         		/* An error or a short transfer ends the loop. */
1389     		if (retval != iosize)
1390     			break;
1391     	}
1392     
         	/* Partial success wins over the status of the last chunk. */
1393     	if (progress)
1394     		retval = progress;
1395     
1396      out_free:
         	/* Give back whichever iobuf we ended up using. */
1397     	if (!new_iobuf)
1398     		clear_bit(0, &filp->f_iobuf_lock);
1399     	else
1400     		free_kiovec(1, &iobuf);
1401      out:	
1402     	return retval;
1403     }
1404     
1405     int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1406     {
1407     	char *kaddr;
1408     	unsigned long left, count = desc->count;
1409     
1410     	if (size > count)
1411     		size = count;
1412     
1413     	kaddr = kmap(page);
1414     	left = __copy_to_user(desc->buf, kaddr + offset, size);
1415     	kunmap(page);
1416     	
1417     	if (left) {
1418     		size -= left;
1419     		desc->error = -EFAULT;
1420     	}
1421     	desc->count = count - size;
1422     	desc->written += size;
1423     	desc->buf += size;
1424     	return size;
1425     }
1426     
1427     /*
1428      * This is the "read()" routine for all filesystems
1429      * that can use the page cache directly.
1430      */
1431     ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1432     {
1433     	ssize_t retval;
1434     
1435     	if ((ssize_t) count < 0)
1436     		return -EINVAL;
1437     
1438     	if (filp->f_flags & O_DIRECT)
1439     		goto o_direct;
1440     
1441     	retval = -EFAULT;
1442     	if (access_ok(VERIFY_WRITE, buf, count)) {
1443     		retval = 0;
1444     
1445     		if (count) {
1446     			read_descriptor_t desc;
1447     
1448     			desc.written = 0;
1449     			desc.count = count;
1450     			desc.buf = buf;
1451     			desc.error = 0;
1452     			do_generic_file_read(filp, ppos, &desc, file_read_actor);
1453     
1454     			retval = desc.written;
1455     			if (!retval)
1456     				retval = desc.error;
1457     		}
1458     	}
1459      out:
1460     	return retval;
1461     
1462      o_direct:
1463     	{
1464     		loff_t pos = *ppos, size;
1465     		struct inode * inode = filp->f_dentry->d_inode;
1466     
1467     		retval = 0;
1468     		if (!count)
1469     			goto out; /* skip atime */
1470     		size = calc_rsize(inode);
1471     		if (pos < size) {
1472     			if (pos + count > size)
1473     				count = size - pos;
1474     			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1475     			if (retval > 0)
1476     				*ppos = pos + retval;
1477     		}
1478     		UPDATE_ATIME(filp->f_dentry->d_inode);
1479     		goto out;
1480     	}
1481     }
1482     
1483     static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1484     {
1485     	ssize_t written;
1486     	unsigned long count = desc->count;
1487     	struct file *file = (struct file *) desc->buf;
1488     
1489     	if (size > count)
1490     		size = count;
1491     
1492      	if (file->f_op->sendpage) {
1493      		written = file->f_op->sendpage(file, page, offset,
1494     					       size, &file->f_pos, size<count);
1495     	} else {
1496     		char *kaddr;
1497     		mm_segment_t old_fs;
1498     
1499     		old_fs = get_fs();
1500     		set_fs(KERNEL_DS);
1501     
1502     		kaddr = kmap(page);
1503     		written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1504     		kunmap(page);
1505     
1506     		set_fs(old_fs);
1507     	}
1508     	if (written < 0) {
1509     		desc->error = written;
1510     		written = 0;
1511     	}
1512     	desc->count = count - written;
1513     	desc->written += written;
1514     	return written;
1515     }
1516     
         /*
          * The sendfile() system call: copy up to @count bytes from @in_fd to
          * @out_fd inside the kernel.
          *
          * @out_fd: descriptor opened for writing whose f_op provides write().
          * @in_fd:  descriptor opened for reading whose mapping provides
          *          readpage().
          * @offset: if non-NULL, user pointer holding the position to read from;
          *          the position reached is stored back through it and the input
          *          file's f_pos is left alone.  If NULL, in_file->f_pos is used
          *          and advanced.
          * @count:  number of bytes to transfer; 0 returns 0 without reading.
          *
          * Returns the number of bytes sent or, when nothing was sent, the error
          * recorded by the read path.  Setup failures return -EBADF, -EINVAL,
          * -EFAULT or the result of locks_verify_area().
          */
1517     asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1518     {
1519     	ssize_t retval;
1520     	struct file * in_file, * out_file;
1521     	struct inode * in_inode, * out_inode;
1522     
1523     	/*
1524     	 * Get input file, and verify that it is ok..
1525     	 */
1526     	retval = -EBADF;
1527     	in_file = fget(in_fd);
1528     	if (!in_file)
1529     		goto out;
1530     	if (!(in_file->f_mode & FMODE_READ))
1531     		goto fput_in;
1532     	retval = -EINVAL;
1533     	in_inode = in_file->f_dentry->d_inode;
1534     	if (!in_inode)
1535     		goto fput_in;
1536     	if (!in_inode->i_mapping->a_ops->readpage)
1537     		goto fput_in;
1538     	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1539     	if (retval)
1540     		goto fput_in;
1541     
1542     	/*
1543     	 * Get output file, and verify that it is ok..
1544     	 */
1545     	retval = -EBADF;
1546     	out_file = fget(out_fd);
1547     	if (!out_file)
1548     		goto fput_in;
1549     	if (!(out_file->f_mode & FMODE_WRITE))
1550     		goto fput_out;
1551     	retval = -EINVAL;
1552     	if (!out_file->f_op || !out_file->f_op->write)
1553     		goto fput_out;
1554     	out_inode = out_file->f_dentry->d_inode;
1555     	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1556     	if (retval)
1557     		goto fput_out;
1558     
1559     	retval = 0;
1560     	if (count) {
1561     		read_descriptor_t desc;
1562     		loff_t pos = 0, *ppos;
1563     
1564     		retval = -EFAULT;
1565     		ppos = &in_file->f_pos;
1566     		if (offset) {
1567     			if (get_user(pos, offset))
1568     				goto fput_out;
1569     			ppos = &pos;
1570     		}
1571     
1572     		desc.written = 0;
1573     		desc.count = count;
1574     		desc.buf = (char *) out_file;	/* file_send_actor casts this back to struct file * */
1575     		desc.error = 0;
1576     		do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1577     
1578     		retval = desc.written;
1579     		if (!retval)
1580     			retval = desc.error;
1581     		if (offset)
1582     			put_user(pos, offset);	/* result not checked: a failed store is not reported */
1583     	}
1584     
         /* Unwind in reverse order of acquisition. */
1585     fput_out:
1586     	fput(out_file);
1587     fput_in:
1588     	fput(in_file);
1589     out:
1590     	return retval;
1591     }
1592     
1593     /*
1594      * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1595      * sure this is sequential access, we don't need a flexible read-ahead
1596      * window size -- we can always use a large fixed size window.
1597      */
1598     static void nopage_sequential_readahead(struct vm_area_struct * vma,
1599     	unsigned long pgoff, unsigned long filesize)
1600     {
1601     	unsigned long ra_window;
1602     
1603     	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1604     	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1605     
1606     	/* vm_raend is zero if we haven't read ahead in this area yet.  */
1607     	if (vma->vm_raend == 0)
1608     		vma->vm_raend = vma->vm_pgoff + ra_window;
1609     
1610     	/*
1611     	 * If we've just faulted the page half-way through our window,
1612     	 * then schedule reads for the next window, and release the
1613     	 * pages in the previous window.
1614     	 */
1615     	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1616     		unsigned long start = vma->vm_pgoff + vma->vm_raend;
1617     		unsigned long end = start + ra_window;
1618     
1619     		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1620     			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1621     		if (start > end)
1622     			return;
1623     
1624     		while ((start < end) && (start < filesize)) {
1625     			if (read_cluster_nonblocking(vma->vm_file,
1626     							start, filesize) < 0)
1627     				break;
1628     			start += CLUSTER_PAGES;
1629     		}
1630     		run_task_queue(&tq_disk);
1631     
1632     		/* if we're far enough past the beginning of this area,
1633     		   recycle pages that are in the previous window. */
1634     		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1635     			unsigned long window = ra_window << PAGE_SHIFT;
1636     
1637     			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1638     			end -= window + window;
1639     			filemap_sync(vma, end - window, window, MS_INVALIDATE);
1640     		}
1641     
1642     		vma->vm_raend += ra_window;
1643     	}
1644     
1645     	return;
1646     }
1647     
1648     /*
1649      * filemap_nopage() is invoked via the vma operations vector for a
1650      * mapped memory region to read in file data during a page fault.
1651      *
1652      * The goto's are kind of ugly, but this streamlines the normal case of having
1653      * it in the page cache, and handles the special cases reasonably without
1654      * having a lot of duplicated code.
          *
          * @area:     the vma that faulted; area->vm_file supplies the file.
          * @address:  faulting virtual address, converted below to a page index
          *            into the file.
          * @no_share: if non-zero, the caller gets a freshly allocated copy of
          *            the page-cache page instead of the page-cache page itself.
          *
          * Returns the page to map, NOPAGE_OOM when a page could not be
          * allocated, or NULL when the offset is beyond the end of the file (for
          * a fault from the owning mm) or the page could not be read.
1655      */
1656     struct page * filemap_nopage(struct vm_area_struct * area,
1657     	unsigned long address, int no_share)
1658     {
1659     	int error;
1660     	struct file *file = area->vm_file;
1661     	struct inode *inode = file->f_dentry->d_inode;
1662     	struct address_space *mapping = inode->i_mapping;
1663     	struct page *page, **hash, *old_page;
1664     	unsigned long size, pgoff;
1665     	loff_t rsize;
1666     
1667     	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1668     
1669     retry_all:
1670     	/*
1671     	 * An external ptracer can access pages that normally aren't
1672     	 * accessible..
1673     	 */
1674     	rsize = calc_rsize(inode);
1675     	size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;	/* file length in pages, rounded up */
1676     	if ((pgoff >= size) && (area->vm_mm == current->mm))
1677     		return NULL;
1678     
1679     	/*
1680     	 * Do we have something in the page cache already?
1681     	 */
1682     	hash = page_hash(mapping, pgoff);
1683     retry_find:
1684     	page = __find_get_page(mapping, pgoff, hash);
1685     	if (!page)
1686     		goto no_cached_page;
1687     
1688     	/*
1689     	 * Ok, found a page in the page cache, now we need to check
1690     	 * that it's up-to-date.
1691     	 */
1692     	if (!Page_Uptodate(page))
1693     		goto page_not_uptodate;
1694     
1695     success:
1696      	/*
1697     	 * Try read-ahead for sequential areas.
1698     	 */
1699     	if (VM_SequentialReadHint(area))
1700     		nopage_sequential_readahead(area, pgoff, size);
1701     
1702     	/*
1703     	 * Found the page and have a reference on it, need to check sharing
1704     	 * and possibly copy it over to another page..
1705     	 */
1706     	old_page = page;
1707     	mark_page_accessed(page);
1708     	if (no_share) {
1709     		struct page *new_page = alloc_page(GFP_HIGHUSER);
1710     
1711     		if (new_page) {
1712     			copy_user_highpage(new_page, old_page, address);
1713     			flush_page_to_ram(new_page);
1714     		} else
1715     			new_page = NOPAGE_OOM;
         		/* The private copy (or the OOM marker) is returned; drop the cache page. */
1716     		page_cache_release(page);
1717     		return new_page;
1718     	}
1719     
1720     	flush_page_to_ram(old_page);
1721     	return old_page;
1722     
1723     no_cached_page:
1724     	/*
1725     	 * If the requested offset is within our file, try to read a whole 
1726     	 * cluster of pages at once.
1727     	 *
1728     	 * Otherwise, we're off the end of a privately mapped file,
1729     	 * so we need to map a zero page.
1730     	 */
1731     	if ((pgoff < size) && !VM_RandomReadHint(area))
1732     		error = read_cluster_nonblocking(file, pgoff, size);
1733     	else
1734     		error = page_cache_read(file, pgoff);
1735     
1736     	/*
1737     	 * The page we want has now been added to the page cache.
1738     	 * In the unlikely event that someone removed it in the
1739     	 * meantime, we'll just come back here and read it again.
1740     	 */
1741     	if (error >= 0)
1742     		goto retry_find;
1743     
1744     	/*
1745     	 * An error return from page_cache_read can result if the
1746     	 * system is low on memory, or a problem occurs while trying
1747     	 * to schedule I/O.
1748     	 */
1749     	if (error == -ENOMEM)
1750     		return NOPAGE_OOM;
1751     	return NULL;
1752     
1753     page_not_uptodate:
1754     	lock_page(page);
1755     
1756     	/* Did it get unhashed while we waited for it? */
1757     	if (!page->mapping) {
1758     		UnlockPage(page);
1759     		page_cache_release(page);
1760     		goto retry_all;
1761     	}
1762     
1763     	/* Did somebody else get it up-to-date? */
1764     	if (Page_Uptodate(page)) {
1765     		UnlockPage(page);
1766     		goto success;
1767     	}
1768     
         	/* First read attempt; zero from readpage means the read was started. */
1769     	if (!mapping->a_ops->readpage(file, page)) {
1770     		wait_on_page(page);
1771     		if (Page_Uptodate(page))
1772     			goto success;
1773     	}
1774     
1775     	/*
1776     	 * Umm, take care of errors if the page isn't up-to-date.
1777     	 * Try to re-read it _once_. We do this synchronously,
1778     	 * because there really aren't any performance issues here
1779     	 * and we need to check for errors.
1780     	 */
1781     	lock_page(page);
1782     
1783     	/* Somebody truncated the page on us? */
1784     	if (!page->mapping) {
1785     		UnlockPage(page);
1786     		page_cache_release(page);
1787     		goto retry_all;
1788     	}
1789     
1790     	/* Somebody else successfully read it in? */
1791     	if (Page_Uptodate(page)) {
1792     		UnlockPage(page);
1793     		goto success;
1794     	}
         	/* Clear the error state left by the failed attempt before retrying. */
1795     	ClearPageError(page);
1796     	if (!mapping->a_ops->readpage(file, page)) {
1797     		wait_on_page(page);
1798     		if (Page_Uptodate(page))
1799     			goto success;
1800     	}
1801     
1802     	/*
1803     	 * Things didn't work out. Return zero to tell the
1804     	 * mm layer so, possibly freeing the page cache page first.
1805     	 */
1806     	page_cache_release(page);
1807     	return NULL;
1808     }
1809     
1810     /* Called with mm->page_table_lock held to protect against other
1811      * threads/the swapper from ripping pte's out from under us.
1812      */
1813     static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1814     	unsigned long address, unsigned int flags)
1815     {
1816     	pte_t pte = *ptep;
1817     
1818     	if (pte_present(pte)) {
1819     		struct page *page = pte_page(pte);
1820     		if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
1821     			flush_tlb_page(vma, address);
1822     			if (page->mapping)
1823     				set_page_dirty(page);
1824     		}
1825     	}
1826     	return 0;
1827     }
1828     
1829     static inline int filemap_sync_pte_range(pmd_t * pmd,
1830     	unsigned long address, unsigned long size, 
1831     	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1832     {
1833     	pte_t * pte;
1834     	unsigned long end;
1835     	int error;
1836     
1837     	if (pmd_none(*pmd))
1838     		return 0;
1839     	if (pmd_bad(*pmd)) {
1840     		pmd_ERROR(*pmd);
1841     		pmd_clear(pmd);
1842     		return 0;
1843     	}
1844     	pte = pte_offset(pmd, address);
1845     	offset += address & PMD_MASK;
1846     	address &= ~PMD_MASK;
1847     	end = address + size;
1848     	if (end > PMD_SIZE)
1849     		end = PMD_SIZE;
1850     	error = 0;
1851     	do {
1852     		error |= filemap_sync_pte(pte, vma, address + offset, flags);
1853     		address += PAGE_SIZE;
1854     		pte++;
1855     	} while (address && (address < end));
1856     	return error;
1857     }
1858     
1859     static inline int filemap_sync_pmd_range(pgd_t * pgd,
1860     	unsigned long address, unsigned long size, 
1861     	struct vm_area_struct *vma, unsigned int flags)
1862     {
1863     	pmd_t * pmd;
1864     	unsigned long offset, end;
1865     	int error;
1866     
1867     	if (pgd_none(*pgd))
1868     		return 0;
1869     	if (pgd_bad(*pgd)) {
1870     		pgd_ERROR(*pgd);
1871     		pgd_clear(pgd);
1872     		return 0;
1873     	}
1874     	pmd = pmd_offset(pgd, address);
1875     	offset = address & PGDIR_MASK;
1876     	address &= ~PGDIR_MASK;
1877     	end = address + size;
1878     	if (end > PGDIR_SIZE)
1879     		end = PGDIR_SIZE;
1880     	error = 0;
1881     	do {
1882     		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1883     		address = (address + PMD_SIZE) & PMD_MASK;
1884     		pmd++;
1885     	} while (address && (address < end));
1886     	return error;
1887     }
1888     
1889     int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1890     	size_t size, unsigned int flags)
1891     {
1892     	pgd_t * dir;
1893     	unsigned long end = address + size;
1894     	int error = 0;
1895     
1896     	/* Aquire the lock early; it may be possible to avoid dropping
1897     	 * and reaquiring it repeatedly.
1898     	 */
1899     	spin_lock(&vma->vm_mm->page_table_lock);
1900     
1901     	dir = pgd_offset(vma->vm_mm, address);
1902     	flush_cache_range(vma->vm_mm, end - size, end);
1903     	if (address >= end)
1904     		BUG();
1905     	do {
1906     		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1907     		address = (address + PGDIR_SIZE) & PGDIR_MASK;
1908     		dir++;
1909     	} while (address && (address < end));
1910     	flush_tlb_range(vma->vm_mm, end - size, end);
1911     
1912     	spin_unlock(&vma->vm_mm->page_table_lock);
1913     
1914     	return error;
1915     }
1916     
         /*
          * vm operations installed by generic_file_mmap(): only the nopage
          * handler is provided, every other operation is left unset.
          */
1917     static struct vm_operations_struct generic_file_vm_ops = {
1918     	nopage:		filemap_nopage,
1919     };
1920     
1921     /* This is used for a general mmap of a disk file */
1922     
1923     int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1924     {
1925     	struct inode *inode = file->f_dentry->d_inode;
1926     
1927     	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1928     		if (!inode->i_mapping->a_ops->writepage)
1929     			return -EINVAL;
1930     	}
1931     	if (!inode->i_sb || !S_ISREG(inode->i_mode))
1932     		return -EACCES;
1933     	if (!inode->i_mapping->a_ops->readpage)
1934     		return -ENOEXEC;
1935     	UPDATE_ATIME(inode);
1936     	vma->vm_ops = &generic_file_vm_ops;
1937     	return 0;
1938     }
1939     
1940     /*
1941      * The msync() system call.
1942      */
1943     
1944     static int msync_interval(struct vm_area_struct * vma,
1945     	unsigned long start, unsigned long end, int flags)
1946     {
1947     	struct file * file = vma->vm_file;
1948     	if (file && (vma->vm_flags & VM_SHARED)) {
1949     		int error;
1950     		error = filemap_sync(vma, start, end-start, flags);
1951     
1952     		if (!error && (flags & MS_SYNC)) {
1953     			struct inode * inode = file->f_dentry->d_inode;
1954     			down(&inode->i_sem);
1955     			filemap_fdatasync(inode->i_mapping);
1956     			if (file->f_op && file->f_op->fsync)
1957     				error = file->f_op->fsync(file, file->f_dentry, 1);
1958     			filemap_fdatawait(inode->i_mapping);
1959     			up(&inode->i_sem);
1960     		}
1961     		return error;
1962     	}
1963     	return 0;
1964     }
1965     
1966     asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1967     {
1968     	unsigned long end;
1969     	struct vm_area_struct * vma;
1970     	int unmapped_error, error = -EINVAL;
1971     
1972     	down_read(&current->mm->mmap_sem);
1973     	if (start & ~PAGE_MASK)
1974     		goto out;
1975     	len = (len + ~PAGE_MASK) & PAGE_MASK;
1976     	end = start + len;
1977     	if (end < start)
1978     		goto out;
1979     	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1980     		goto out;
1981     	error = 0;
1982     	if (end == start)
1983     		goto out;
1984     	/*
1985     	 * If the interval [start,end) covers some unmapped address ranges,
1986     	 * just ignore them, but return -EFAULT at the end.
1987     	 */
1988     	vma = find_vma(current->mm, start);
1989     	unmapped_error = 0;
1990     	for (;;) {
1991     		/* Still start < end. */
1992     		error = -EFAULT;
1993     		if (!vma)
1994     			goto out;
1995     		/* Here start < vma->vm_end. */
1996     		if (start < vma->vm_start) {
1997     			unmapped_error = -EFAULT;
1998     			start = vma->vm_start;
1999     		}
2000     		/* Here vma->vm_start <= start < vma->vm_end. */
2001     		if (end <= vma->vm_end) {
2002     			if (start < end) {
2003     				error = msync_interval(vma, start, end, flags);
2004     				if (error)
2005     					goto out;
2006     			}
2007     			error = unmapped_error;
2008     			goto out;
2009     		}
2010     		/* Here vma->vm_start <= start < vma->vm_end < end. */
2011     		error = msync_interval(vma, start, vma->vm_end, flags);
2012     		if (error)
2013     			goto out;
2014     		start = vma->vm_end;
2015     		vma = vma->vm_next;
2016     	}
2017     out:
2018     	up_read(&current->mm->mmap_sem);
2019     	return error;
2020     }
2021     
2022     static inline void setup_read_behavior(struct vm_area_struct * vma,
2023     	int behavior)
2024     {
2025     	VM_ClearReadHint(vma);
2026     	switch(behavior) {
2027     		case MADV_SEQUENTIAL:
2028     			vma->vm_flags |= VM_SEQ_READ;
2029     			break;
2030     		case MADV_RANDOM:
2031     			vma->vm_flags |= VM_RAND_READ;
2032     			break;
2033     		default:
2034     			break;
2035     	}
2036     	return;
2037     }
2038     
2039     static long madvise_fixup_start(struct vm_area_struct * vma,
2040     	unsigned long end, int behavior)
2041     {
2042     	struct vm_area_struct * n;
2043     	struct mm_struct * mm = vma->vm_mm;
2044     
2045     	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2046     	if (!n)
2047     		return -EAGAIN;
2048     	*n = *vma;
2049     	n->vm_end = end;
2050     	setup_read_behavior(n, behavior);
2051     	n->vm_raend = 0;
2052     	if (n->vm_file)
2053     		get_file(n->vm_file);
2054     	if (n->vm_ops && n->vm_ops->open)
2055     		n->vm_ops->open(n);
2056     	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2057     	lock_vma_mappings(vma);
2058     	spin_lock(&mm->page_table_lock);
2059     	vma->vm_start = end;
2060     	__insert_vm_struct(mm, n);
2061     	spin_unlock(&mm->page_table_lock);
2062     	unlock_vma_mappings(vma);
2063     	return 0;
2064     }
2065     
2066     static long madvise_fixup_end(struct vm_area_struct * vma,
2067     	unsigned long start, int behavior)
2068     {
2069     	struct vm_area_struct * n;
2070     	struct mm_struct * mm = vma->vm_mm;
2071     
2072     	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2073     	if (!n)
2074     		return -EAGAIN;
2075     	*n = *vma;
2076     	n->vm_start = start;
2077     	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2078     	setup_read_behavior(n, behavior);
2079     	n->vm_raend = 0;
2080     	if (n->vm_file)
2081     		get_file(n->vm_file);
2082     	if (n->vm_ops && n->vm_ops->open)
2083     		n->vm_ops->open(n);
2084     	lock_vma_mappings(vma);
2085     	spin_lock(&mm->page_table_lock);
2086     	vma->vm_end = start;
2087     	__insert_vm_struct(mm, n);
2088     	spin_unlock(&mm->page_table_lock);
2089     	unlock_vma_mappings(vma);
2090     	return 0;
2091     }
2092     
2093     static long madvise_fixup_middle(struct vm_area_struct * vma,
2094     	unsigned long start, unsigned long end, int behavior)
2095     {
2096     	struct vm_area_struct * left, * right;
2097     	struct mm_struct * mm = vma->vm_mm;
2098     
2099     	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2100     	if (!left)
2101     		return -EAGAIN;
2102     	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2103     	if (!right) {
2104     		kmem_cache_free(vm_area_cachep, left);
2105     		return -EAGAIN;
2106     	}
2107     	*left = *vma;
2108     	*right = *vma;
2109     	left->vm_end = start;
2110     	right->vm_start = end;
2111     	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2112     	left->vm_raend = 0;
2113     	right->vm_raend = 0;
2114     	if (vma->vm_file)
2115     		atomic_add(2, &vma->vm_file->f_count);
2116     
2117     	if (vma->vm_ops && vma->vm_ops->open) {
2118     		vma->vm_ops->open(left);
2119     		vma->vm_ops->open(right);
2120     	}
2121     	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2122     	vma->vm_raend = 0;
2123     	lock_vma_mappings(vma);
2124     	spin_lock(&mm->page_table_lock);
2125     	vma->vm_start = start;
2126     	vma->vm_end = end;
2127     	setup_read_behavior(vma, behavior);
2128     	__insert_vm_struct(mm, left);
2129     	__insert_vm_struct(mm, right);
2130     	spin_unlock(&mm->page_table_lock);
2131     	unlock_vma_mappings(vma);
2132     	return 0;
2133     }
2134     
2135     /*
2136      * We can potentially split a vm area into separate
2137      * areas, each area with its own behavior.
2138      */
2139     static long madvise_behavior(struct vm_area_struct * vma,
2140     	unsigned long start, unsigned long end, int behavior)
2141     {
2142     	int error = 0;
2143     
2144     	/* This caps the number of vma's this process can own */
2145     	if (vma->vm_mm->map_count > MAX_MAP_COUNT)
2146     		return -ENOMEM;
2147     
2148     	if (start == vma->vm_start) {
2149     		if (end == vma->vm_end) {
2150     			setup_read_behavior(vma, behavior);
2151     			vma->vm_raend = 0;
2152     		} else
2153     			error = madvise_fixup_start(vma, end, behavior);
2154     	} else {
2155     		if (end == vma->vm_end)
2156     			error = madvise_fixup_end(vma, start, behavior);
2157     		else
2158     			error = madvise_fixup_middle(vma, start, end, behavior);
2159     	}
2160     
2161     	return error;
2162     }
2163     
2164     /*
2165      * Schedule all required I/O operations, then run the disk queue
2166      * to make sure they are started.  Do not wait for completion.
2167      */
2168     static long madvise_willneed(struct vm_area_struct * vma,
2169     	unsigned long start, unsigned long end)
2170     {
2171     	long error = -EBADF;
2172     	struct file * file;
2173     	unsigned long size, rlim_rss;
2174     	loff_t rsize;
2175     
2176     	/* Doesn't work if there's no mapped file. */
2177     	if (!vma->vm_file)
2178     		return error;
2179     	file = vma->vm_file;
2180     	rsize = calc_rsize(file->f_dentry->d_inode);
2181     	size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2182     
2183     	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2184     	if (end > vma->vm_end)
2185     		end = vma->vm_end;
2186     	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2187     
2188     	/* Make sure this doesn't exceed the process's max rss. */
2189     	error = -EIO;
2190     	rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
2191     				LONG_MAX; /* default: see resource.h */
2192     	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
2193     		return error;
2194     
2195     	/* round to cluster boundaries if this isn't a "random" area. */
2196     	if (!VM_RandomReadHint(vma)) {
2197     		start = CLUSTER_OFFSET(start);
2198     		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2199     
2200     		while ((start < end) && (start < size)) {
2201     			error = read_cluster_nonblocking(file, start, size);
2202     			start += CLUSTER_PAGES;
2203     			if (error < 0)
2204     				break;
2205     		}
2206     	} else {
2207     		while ((start < end) && (start < size)) {
2208     			error = page_cache_read(file, start);
2209     			start++;
2210     			if (error < 0)
2211     				break;
2212     		}
2213     	}
2214     
2215     	/* Don't wait for someone else to push these requests. */
2216     	run_task_queue(&tq_disk);
2217     
2218     	return error;
2219     }
2220     
2221     /*
2222      * Application no longer needs these pages.  If the pages are dirty,
2223      * it's OK to just throw them away.  The app will be more careful about
2224      * data it wants to keep.  Be sure to free swap resources too.  The
2225      * zap_page_range call sets things up for refill_inactive to actually free
2226      * these pages later if no one else has touched them in the meantime,
2227      * although we could add these pages to a global reuse list for
2228      * refill_inactive to pick up before reclaiming other pages.
2229      *
2230      * NB: This interface discards data rather than pushes it out to swap,
2231      * as some implementations do.  This has performance implications for
2232      * applications like large transactional databases which want to discard
2233      * pages in anonymous maps after committing to backing store the data
2234      * that was kept in them.  There is no reason to write this data out to
2235      * the swap area if the application is discarding it.
2236      *
2237      * An interface that causes the system to free clean pages and flush
2238      * dirty pages is already available as msync(MS_INVALIDATE).
2239      */
2240     static long madvise_dontneed(struct vm_area_struct * vma,
2241     	unsigned long start, unsigned long end)
2242     {
2243     	if (vma->vm_flags & VM_LOCKED)
2244     		return -EINVAL;
2245     
2246     	zap_page_range(vma->vm_mm, start, end - start);
2247     	return 0;
2248     }
2249     
2250     static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2251     	unsigned long end, int behavior)
2252     {
2253     	long error = -EBADF;
2254     
2255     	switch (behavior) {
2256     	case MADV_NORMAL:
2257     	case MADV_SEQUENTIAL:
2258     	case MADV_RANDOM:
2259     		error = madvise_behavior(vma, start, end, behavior);
2260     		break;
2261     
2262     	case MADV_WILLNEED:
2263     		error = madvise_willneed(vma, start, end);
2264     		break;
2265     
2266     	case MADV_DONTNEED:
2267     		error = madvise_dontneed(vma, start, end);
2268     		break;
2269     
2270     	default:
2271     		error = -EINVAL;
2272     		break;
2273     	}
2274     		
2275     	return error;
2276     }
2277     
2278     /*
2279      * The madvise(2) system call.
2280      *
2281      * Applications can use madvise() to advise the kernel how it should
2282      * handle paging I/O in this VM area.  The idea is to help the kernel
2283      * use appropriate read-ahead and caching techniques.  The information
2284      * provided is advisory only, and can be safely disregarded by the
2285      * kernel without affecting the correct operation of the application.
2286      *
2287      * behavior values:
2288      *  MADV_NORMAL - the default behavior is to read clusters.  This
2289      *		results in some read-ahead and read-behind.
2290      *  MADV_RANDOM - the system should read the minimum amount of data
2291      *		on any access, since it is unlikely that the appli-
2292      *		cation will need more than what it asks for.
2293      *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2294      *		once, so they can be aggressively read ahead, and
2295      *		can be freed soon after they are accessed.
2296      *  MADV_WILLNEED - the application is notifying the system to read
2297      *		some pages ahead.
2298      *  MADV_DONTNEED - the application is finished with the given range,
2299      *		so the kernel can free resources associated with it.
2300      *
2301      * return values:
2302      *  zero    - success
2303      *  -EINVAL - start + len < 0, start is not page-aligned,
2304      *		"behavior" is not a valid value, or application
2305      *		is attempting to release locked or shared pages.
2306      *  -ENOMEM - addresses in the specified range are not currently
2307      *		mapped, or are outside the AS of the process.
2308      *  -EIO    - an I/O error occurred while paging in data.
2309      *  -EBADF  - map exists, but area maps something that isn't a file.
2310      *  -EAGAIN - a kernel resource was temporarily unavailable.
2311      */
2312     asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2313     {
2314     	unsigned long end;
2315     	struct vm_area_struct * vma;
2316     	int unmapped_error = 0;
2317     	int error = -EINVAL;
2318     
2319     	down_write(&current->mm->mmap_sem);
2320     
2321     	if (start & ~PAGE_MASK)
2322     		goto out;
2323     	len = (len + ~PAGE_MASK) & PAGE_MASK;
2324     	end = start + len;
2325     	if (end < start)
2326     		goto out;
2327     
2328     	error = 0;
2329     	if (end == start)
2330     		goto out;
2331     
2332     	/*
2333     	 * If the interval [start,end) covers some unmapped address
2334     	 * ranges, just ignore them, but return -ENOMEM at the end.
2335     	 */
2336     	vma = find_vma(current->mm, start);
2337     	for (;;) {
2338     		/* Still start < end. */
2339     		error = -ENOMEM;
2340     		if (!vma)
2341     			goto out;
2342     
2343     		/* Here start < vma->vm_end. */
2344     		if (start < vma->vm_start) {
2345     			unmapped_error = -ENOMEM;
2346     			start = vma->vm_start;
2347     		}
2348     
2349     		/* Here vma->vm_start <= start < vma->vm_end. */
2350     		if (end <= vma->vm_end) {
2351     			if (start < end) {
2352     				error = madvise_vma(vma, start, end,
2353     							behavior);
2354     				if (error)
2355     					goto out;
2356     			}
2357     			error = unmapped_error;
2358     			goto out;
2359     		}
2360     
2361     		/* Here vma->vm_start <= start < vma->vm_end < end. */
2362     		error = madvise_vma(vma, start, vma->vm_end, behavior);
2363     		if (error)
2364     			goto out;
2365     		start = vma->vm_end;
2366     		vma = vma->vm_next;
2367     	}
2368     
2369     out:
2370     	up_write(&current->mm->mmap_sem);
2371     	return error;
2372     }
2373     
2374     /*
2375      * Later we can get more picky about what "in core" means precisely.
2376      * For now, simply check to see if the page is in the page cache,
2377      * and is up to date; i.e. that no page-in operation would be required
2378      * at this time if an application were to map and access this page.
2379      */
2380     static unsigned char mincore_page(struct vm_area_struct * vma,
2381     	unsigned long pgoff)
2382     {
2383     	unsigned char present = 0;
2384     	struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2385     	struct page * page, ** hash = page_hash(as, pgoff);
2386     
2387     	spin_lock(&pagecache_lock);
2388     	page = __find_page_nolock(as, pgoff, *hash);
2389     	if ((page) && (Page_Uptodate(page)))
2390     		present = 1;
2391     	spin_unlock(&pagecache_lock);
2392     
2393     	return present;
2394     }
2395     
2396     static long mincore_vma(struct vm_area_struct * vma,
2397     	unsigned long start, unsigned long end, unsigned char * vec)
2398     {
2399     	long error, i, remaining;
2400     	unsigned char * tmp;
2401     
2402     	error = -ENOMEM;
2403     	if (!vma->vm_file)
2404     		return error;
2405     
2406     	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2407     	if (end > vma->vm_end)
2408     		end = vma->vm_end;
2409     	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2410     
2411     	error = -EAGAIN;
2412     	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2413     	if (!tmp)
2414     		return error;
2415     
2416     	/* (end - start) is # of pages, and also # of bytes in "vec */
2417     	remaining = (end - start),
2418     
2419     	error = 0;
2420     	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2421     		int j = 0;
2422     		long thispiece = (remaining < PAGE_SIZE) ?
2423     						remaining : PAGE_SIZE;
2424     
2425     		while (j < thispiece)
2426     			tmp[j++] = mincore_page(vma, start++);
2427     
2428     		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2429     			error = -EFAULT;
2430     			break;
2431     		}
2432     	}
2433     
2434     	free_page((unsigned long) tmp);
2435     	return error;
2436     }
2437     
2438     /*
2439      * The mincore(2) system call.
2440      *
2441      * mincore() returns the memory residency status of the pages in the
2442      * current process's address space specified by [addr, addr + len).
2443      * The status is returned in a vector of bytes.  The least significant
2444      * bit of each byte is 1 if the referenced page is in memory, otherwise
2445      * it is zero.
2446      *
2447      * Because the status of a page can change after mincore() checks it
2448      * but before it returns to the application, the returned vector may
2449      * contain stale information.  Only locked pages are guaranteed to
2450      * remain in memory.
2451      *
2452      * return values:
2453      *  zero    - success
2454      *  -EFAULT - vec points to an illegal address
2455      *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2456      *		or len has a nonpositive value
2457      *  -ENOMEM - Addresses in the range [addr, addr + len] are
2458      *		invalid for the address space of this process, or
2459      *		specify one or more pages which are not currently
2460      *		mapped
2461      *  -EAGAIN - A kernel resource was temporarily unavailable.
2462      */
2463     asmlinkage long sys_mincore(unsigned long start, size_t len,
2464     	unsigned char * vec)
2465     {
2466     	int index = 0;
2467     	unsigned long end;
2468     	struct vm_area_struct * vma;
2469     	int unmapped_error = 0;
2470     	long error = -EINVAL;
2471     
2472     	down_read(&current->mm->mmap_sem);
2473     
2474     	if (start & ~PAGE_CACHE_MASK)
2475     		goto out;
2476     	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2477     	end = start + len;
2478     	if (end < start)
2479     		goto out;
2480     
2481     	error = 0;
2482     	if (end == start)
2483     		goto out;
2484     
2485     	/*
2486     	 * If the interval [start,end) covers some unmapped address
2487     	 * ranges, just ignore them, but return -ENOMEM at the end.
2488     	 */
2489     	vma = find_vma(current->mm, start);
2490     	for (;;) {
2491     		/* Still start < end. */
2492     		error = -ENOMEM;
2493     		if (!vma)
2494     			goto out;
2495     
2496     		/* Here start < vma->vm_end. */
2497     		if (start < vma->vm_start) {
2498     			unmapped_error = -ENOMEM;
2499     			start = vma->vm_start;
2500     		}
2501     
2502     		/* Here vma->vm_start <= start < vma->vm_end. */
2503     		if (end <= vma->vm_end) {
2504     			if (start < end) {
2505     				error = mincore_vma(vma, start, end,
2506     							&vec[index]);
2507     				if (error)
2508     					goto out;
2509     			}
2510     			error = unmapped_error;
2511     			goto out;
2512     		}
2513     
2514     		/* Here vma->vm_start <= start < vma->vm_end < end. */
2515     		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2516     		if (error)
2517     			goto out;
2518     		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2519     		start = vma->vm_end;
2520     		vma = vma->vm_next;
2521     	}
2522     
2523     out:
2524     	up_read(&current->mm->mmap_sem);
2525     	return error;
2526     }
2527     
2528     static inline
2529     struct page *__read_cache_page(struct address_space *mapping,
2530     				unsigned long index,
2531     				int (*filler)(void *,struct page*),
2532     				void *data)
2533     {
2534     	struct page **hash = page_hash(mapping, index);
2535     	struct page *page, *cached_page = NULL;
2536     	int err;
2537     repeat:
2538     	page = __find_get_page(mapping, index, hash);
2539     	if (!page) {
2540     		if (!cached_page) {
2541     			cached_page = page_cache_alloc(mapping);
2542     			if (!cached_page)
2543     				return ERR_PTR(-ENOMEM);
2544     		}
2545     		page = cached_page;
2546     		if (add_to_page_cache_unique(page, mapping, index, hash))
2547     			goto repeat;
2548     		cached_page = NULL;
2549     		err = filler(data, page);
2550     		if (err < 0) {
2551     			page_cache_release(page);
2552     			page = ERR_PTR(err);
2553     		}
2554     	}
2555     	if (cached_page)
2556     		page_cache_release(cached_page);
2557     	return page;
2558     }
2559     
2560     /*
2561      * Read into the page cache. If a page already exists,
2562      * and Page_Uptodate() is not set, try to fill the page.
2563      */
2564     struct page *read_cache_page(struct address_space *mapping,
2565     				unsigned long index,
2566     				int (*filler)(void *,struct page*),
2567     				void *data)
2568     {
2569     	struct page *page;
2570     	int err;
2571     
2572     retry:
2573     	page = __read_cache_page(mapping, index, filler, data);
2574     	if (IS_ERR(page))
2575     		goto out;
2576     	mark_page_accessed(page);
2577     	if (Page_Uptodate(page))
2578     		goto out;
2579     
2580     	lock_page(page);
2581     	if (!page->mapping) {
2582     		UnlockPage(page);
2583     		page_cache_release(page);
2584     		goto retry;
2585     	}
2586     	if (Page_Uptodate(page)) {
2587     		UnlockPage(page);
2588     		goto out;
2589     	}
2590     	err = filler(data, page);
2591     	if (err < 0) {
2592     		page_cache_release(page);
2593     		page = ERR_PTR(err);
2594     	}
2595      out:
2596     	return page;
2597     }
2598     
2599     static inline struct page * __grab_cache_page(struct address_space *mapping,
2600     				unsigned long index, struct page **cached_page)
2601     {
2602     	struct page *page, **hash = page_hash(mapping, index);
2603     repeat:
2604     	page = __find_lock_page(mapping, index, hash);
2605     	if (!page) {
2606     		if (!*cached_page) {
2607     			*cached_page = page_cache_alloc(mapping);
2608     			if (!*cached_page)
2609     				return NULL;
2610     		}
2611     		page = *cached_page;
2612     		if (add_to_page_cache_unique(page, mapping, index, hash))
2613     			goto repeat;
2614     		*cached_page = NULL;
2615     	}
2616     	return page;
2617     }
2618     
2619     /*
2620      * Returns locked page at given index in given cache, creating it if needed.
2621      */
2622     
2623     struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2624     {
2625     	struct page *cached_page = NULL;
2626     	struct page *page = __grab_cache_page(mapping,index,&cached_page);
2627     	if (cached_page)
2628     		page_cache_release(cached_page);
2629     	return page;
2630     }
2631     
2632     inline void remove_suid(struct inode *inode)
2633     {
2634     	unsigned int mode;
2635     
2636     	/* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2637     	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2638     
2639     	/* was any of the uid bits set? */
2640     	mode &= inode->i_mode;
2641     	if (mode && !capable(CAP_FSETID)) {
2642     		inode->i_mode &= ~mode;
2643     		mark_inode_dirty(inode);
2644     	}
2645     }
2646     
2647     /*
2648      * Write to a file through the page cache. 
2649      *
2650      * We currently put everything into the page cache prior to writing it.
2651      * This is not a problem when writing full pages. With partial pages,
2652      * however, we first have to read the data into the cache, then
2653      * dirty the page, and finally schedule it for writing. Alternatively, we
2654      * could write-through just the portion of data that would go into that
2655      * page, but that would kill performance for applications that write data
2656      * line by line, and it's prone to race conditions.
2657      *
2658      * Note that this routine doesn't try to keep track of dirty pages. Each
2659      * file system has to do this all by itself, unfortunately.
2660      *							okir@monad.swb.de
2661      */
2662     ssize_t
2663     generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
2664     {
2665     	struct inode	*inode = file->f_dentry->d_inode; 
2666     	struct address_space *mapping = inode->i_mapping;
2667     	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2668     	loff_t		pos;
2669     	struct page	*page, *cached_page;
2670     	unsigned long	written;
2671     	long		status = 0;
2672     	int		err;
2673     	unsigned	bytes;
2674     
2675     	if ((ssize_t) count < 0)
2676     		return -EINVAL;
2677     
2678     	if (!access_ok(VERIFY_READ, buf, count))
2679     		return -EFAULT;
2680     
2681     	cached_page = NULL;
2682     
2683     	down(&inode->i_sem);
2684     
2685     	pos = *ppos;
2686     	err = -EINVAL;
2687     	if (pos < 0)
2688     		goto out;
2689     
2690     	err = file->f_error;
2691     	if (err) {
2692     		file->f_error = 0;
2693     		goto out;
2694     	}
2695     
2696     	written = 0;
2697     
2698     	/* FIXME: this is for backwards compatibility with 2.4 */
2699     	if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
2700     		pos = inode->i_size;
2701     
2702     	/*
2703     	 * Check whether we've reached the file size limit.
2704     	 */
2705     	err = -EFBIG;
2706     	
2707     	if (limit != RLIM_INFINITY) {
2708     		if (pos >= limit) {
2709     			send_sig(SIGXFSZ, current, 0);
2710     			goto out;
2711     		}
2712     		if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
2713     			/* send_sig(SIGXFSZ, current, 0); */
2714     			count = limit - (u32)pos;
2715     		}
2716     	}
2717     
2718     	/*
2719     	 *	LFS rule 
2720     	 */
2721     	if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
2722     		if (pos >= MAX_NON_LFS) {
2723     			send_sig(SIGXFSZ, current, 0);
2724     			goto out;
2725     		}
2726     		if (count > MAX_NON_LFS - (u32)pos) {
2727     			/* send_sig(SIGXFSZ, current, 0); */
2728     			count = MAX_NON_LFS - (u32)pos;
2729     		}
2730     	}
2731     
2732     	/*
2733     	 *	Are we about to exceed the fs block limit ?
2734     	 *
2735     	 *	If we have written data it becomes a short write
2736     	 *	If we have exceeded without writing data we send
2737     	 *	a signal and give them an EFBIG.
2738     	 *
2739     	 *	Linus frestrict idea will clean these up nicely..
2740     	 */
2741     	 
2742     	if (!S_ISBLK(inode->i_mode)) {
2743     		if (pos >= inode->i_sb->s_maxbytes)
2744     		{
2745     			if (count || pos > inode->i_sb->s_maxbytes) {
2746     				send_sig(SIGXFSZ, current, 0);
2747     				err = -EFBIG;
2748     				goto out;
2749     			}
2750     			/* zero-length writes at ->s_maxbytes are OK */
2751     		}
2752     
2753     		if (pos + count > inode->i_sb->s_maxbytes)
2754     			count = inode->i_sb->s_maxbytes - pos;
2755     	} else {
2756     		if (is_read_only(inode->i_rdev)) {
2757     			err = -EPERM;
2758     			goto out;
2759     		}
2760     		if (pos >= calc_rsize(inode)) {
2761     			if (count || pos > calc_rsize(inode)) {
2762     				/* FIXME: this is for backwards compatibility with 2.4 */
2763     				err = -ENOSPC;
2764     				goto out;
2765     			}
2766     			/* zero-length writes at blkdev end are OK */
2767     		}
2768     
2769     		if (pos + count > calc_rsize(inode))
2770     			count = calc_rsize(inode) - pos;
2771     	}
2772     
2773     	err = 0;
2774     	if (count == 0)
2775     		goto out;
2776     
2777     	remove_suid(inode);
2778     	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2779     	mark_inode_dirty_sync(inode);
2780     
2781     	if (file->f_flags & O_DIRECT)
2782     		goto o_direct;
2783     
2784     	do {
2785     		unsigned long index, offset;
2786     		long page_fault;
2787     		char *kaddr;
2788     
2789     		/*
2790     		 * Try to find the page in the cache. If it isn't there,
2791     		 * allocate a free page.
2792     		 */
2793     		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2794     		index = pos >> PAGE_CACHE_SHIFT;
2795     		bytes = PAGE_CACHE_SIZE - offset;
2796     		if (bytes > count)
2797     			bytes = count;
2798     
2799     		/*
2800     		 * Bring in the user page that we will copy from _first_.
2801     		 * Otherwise there's a nasty deadlock on copying from the
2802     		 * same page as we're writing to, without it being marked
2803     		 * up-to-date.
2804     		 */
2805     		{ volatile unsigned char dummy;
2806     			__get_user(dummy, buf);
2807     			__get_user(dummy, buf+bytes-1);
2808     		}
2809     
2810     		status = -ENOMEM;	/* we'll assign it later anyway */
2811     		page = __grab_cache_page(mapping, index, &cached_page);
2812     		if (!page)
2813     			break;
2814     
2815     		/* We have exclusive IO access to the page.. */
2816     		if (!PageLocked(page)) {
2817     			PAGE_BUG(page);
2818     		}
2819     
2820     		kaddr = kmap(page);
2821     		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2822     		if (status)
2823     			goto unlock;
2824     		page_fault = __copy_from_user(kaddr+offset, buf, bytes);
2825     		flush_dcache_page(page);
2826     		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2827     		if (page_fault)
2828     			goto fail_write;
2829     		if (!status)
2830     			status = bytes;
2831     
2832     		if (status >= 0) {
2833     			written += status;
2834     			count -= status;
2835     			pos += status;
2836     			buf += status;
2837     		}
2838     unlock:
2839     		kunmap(page);
2840     		/* Mark it unlocked again and drop the page.. */
2841     		SetPageReferenced(page);
2842     		UnlockPage(page);
2843     		page_cache_release(page);
2844     
2845     		if (status < 0)
2846     			break;
2847     	} while (count);
2848     	*ppos = pos;
2849     
2850     	if (cached_page)
2851     		page_cache_release(cached_page);
2852     
2853     	/* For now, when the user asks for O_SYNC, we'll actually
2854     	 * provide O_DSYNC. */
2855     	if ((status >= 0) && (file->f_flags & O_SYNC))
2856     		status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
2857     	
2858     out_status:	
2859     	err = written ? written : status;
2860     out:
2861     
2862     	up(&inode->i_sem);
2863     	return err;
2864     fail_write:
2865     	status = -EFAULT;
2866     	goto unlock;
2867     
2868     o_direct:
2869     	written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
2870     	if (written > 0) {
2871     		loff_t end = pos + written;
2872     		if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
2873     			inode->i_size = end;
2874     			mark_inode_dirty(inode);
2875     		}
2876     		*ppos = end;
2877     		invalidate_inode_pages2(mapping);
2878     	}
2879     	/*
2880     	 * Sync the fs metadata but not the minor inode changes and
2881     	 * of course not the data as we did direct DMA for the IO.
2882     	 */
2883     	if (written >= 0 && file->f_flags & O_SYNC)
2884     		status = generic_osync_inode(inode, OSYNC_METADATA);
2885     	goto out_status;
2886     }
2887     
2888     void __init page_cache_init(unsigned long mempages)
2889     {
2890     	unsigned long htable_size, order;
2891     
2892     	htable_size = mempages;
2893     	htable_size *= sizeof(struct page *);
2894     	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2895     		;
2896     
2897     	do {
2898     		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2899     
2900     		page_hash_bits = 0;
2901     		while((tmp >>= 1UL) != 0UL)
2902     			page_hash_bits++;
2903     
2904     		page_hash_table = (struct page **)
2905     			__get_free_pages(GFP_ATOMIC, order);
2906     	} while(page_hash_table == NULL && --order > 0);
2907     
2908     	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2909     	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
2910     	if (!page_hash_table)
2911     		panic("Failed to allocate page hash table\n");
2912     	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2913     }
2914