File: /usr/src/linux/fs/inode.c

1     /*
2      * linux/fs/inode.c
3      *
4      * (C) 1997 Linus Torvalds
5      */
6     
7     #include <linux/config.h>
8     #include <linux/fs.h>
9     #include <linux/string.h>
10     #include <linux/mm.h>
11     #include <linux/dcache.h>
12     #include <linux/init.h>
13     #include <linux/quotaops.h>
14     #include <linux/slab.h>
15     #include <linux/cache.h>
16     #include <linux/swap.h>
17     #include <linux/swapctl.h>
18     #include <linux/prefetch.h>
19     #include <linux/locks.h>
20     
21     /*
22      * New inode.c implementation.
23      *
24      * This implementation has the basic premise of trying
25      * to be extremely low-overhead and SMP-safe, yet be
26      * simple enough to be "obviously correct".
27      *
28      * Famous last words.
29      */
30     
31     /* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
32     
33     /* #define INODE_PARANOIA 1 */
34     /* #define INODE_DEBUG 1 */
35     
/*
 * Inode lookup is no longer as critical as it used to be:
 * most of the lookups are going to be through the dcache.
 *
 * I_HASHBITS and I_HASHMASK simply name the two file-local variables
 * declared below; hash() shifts by the former and masks with the
 * latter to select a chain in inode_hashtable.
 */
#define I_HASHBITS	i_hash_shift
#define I_HASHMASK	i_hash_mask

/*
 * NOTE(review): nothing in this part of the file assigns these two;
 * presumably they are set when the hash table is allocated - confirm.
 */
static unsigned int i_hash_mask;
static unsigned int i_hash_shift;
45     
/*
 * Each inode can be on two separate lists. One is
 * the hash list of the inode (linked through i_hash),
 * used for lookups. The other linked list is the
 * "type" list (linked through i_list):
 *  "in_use" - valid inode, i_count > 0, i_nlink > 0
 *  "dirty"  - as "in_use" but also dirty
 *  "unused" - valid inode, i_count = 0
 *
 * A "dirty" list is maintained for each super block
 * (sb->s_dirty), allowing for low-overhead inode sync()
 * operations.  While an inode is being written out by
 * __sync_one() it sits on sb->s_locked_inodes instead.
 */

static LIST_HEAD(inode_in_use);
static LIST_HEAD(inode_unused);
/* Array of hash chains, indexed by hash(sb, ino). */
static struct list_head *inode_hashtable;
static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
62     
/*
 * A simple spinlock to protect the list manipulations.
 *
 * NOTE! You also have to own the lock if you change
 * the i_state of an inode while it is in use..
 *
 * Wherever this file holds it together with sb_lock,
 * inode_lock is taken first and released last.
 */
spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
70     
/*
 * Statistics gathering..
 *
 * In this file nr_inodes is bumped when an inode is allocated and
 * dropped when one is destroyed; nr_unused tracks the inodes whose
 * i_count is zero.
 */
struct inodes_stat_t inodes_stat;

/* Slab cache that every struct inode in this file is allocated from. */
static kmem_cache_t * inode_cachep;

/*
 * Grab a raw inode object from the slab cache (SLAB_KERNEL allocation).
 * Callers test the result for NULL before using it.
 */
#define alloc_inode() \
	 ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL))
80     static void destroy_inode(struct inode *inode) 
81     {
82     	if (inode_has_buffers(inode))
83     		BUG();
84     	kmem_cache_free(inode_cachep, (inode));
85     }
86     
87     
88     /*
89      * These are initializations that only need to be done
90      * once, because the fields are idempotent across use
91      * of the inode, so let the slab aware of that.
92      */
93     static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
94     {
95     	struct inode * inode = (struct inode *) foo;
96     
97     	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
98     	    SLAB_CTOR_CONSTRUCTOR)
99     	{
100     		memset(inode, 0, sizeof(*inode));
101     		init_waitqueue_head(&inode->i_wait);
102     		INIT_LIST_HEAD(&inode->i_hash);
103     		INIT_LIST_HEAD(&inode->i_data.clean_pages);
104     		INIT_LIST_HEAD(&inode->i_data.dirty_pages);
105     		INIT_LIST_HEAD(&inode->i_data.locked_pages);
106     		INIT_LIST_HEAD(&inode->i_dentry);
107     		INIT_LIST_HEAD(&inode->i_dirty_buffers);
108     		INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
109     		INIT_LIST_HEAD(&inode->i_devices);
110     		sema_init(&inode->i_sem, 1);
111     		sema_init(&inode->i_zombie, 1);
112     		spin_lock_init(&inode->i_data.i_shared_lock);
113     	}
114     }
115     
116     /*
117      * Put the inode on the super block's dirty list.
118      *
119      * CAREFUL! We mark it dirty unconditionally, but
120      * move it onto the dirty list only if it is hashed.
121      * If it was not hashed, it will never be added to
122      * the dirty list even if it is later hashed, as it
123      * will have been marked dirty already.
124      *
125      * In short, make sure you hash any inodes _before_
126      * you start marking them dirty..
127      */
128      
129     /**
130      *	__mark_inode_dirty -	internal function
131      *	@inode: inode to mark
132      *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
133      *	Mark an inode as dirty. Callers should use mark_inode_dirty or
134      *  	mark_inode_dirty_sync.
135      */
136      
137     void __mark_inode_dirty(struct inode *inode, int flags)
138     {
139     	struct super_block * sb = inode->i_sb;
140     
141     	if (!sb)
142     		return;
143     
144     	/* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
145     	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
146     		if (sb->s_op && sb->s_op->dirty_inode)
147     			sb->s_op->dirty_inode(inode);
148     	}
149     
150     	/* avoid the locking if we can */
151     	if ((inode->i_state & flags) == flags)
152     		return;
153     
154     	spin_lock(&inode_lock);
155     	if ((inode->i_state & flags) != flags) {
156     		inode->i_state |= flags;
157     		/* Only add valid (ie hashed) inodes to the dirty list */
158     		if (!(inode->i_state & I_LOCK) && !list_empty(&inode->i_hash)) {
159     			list_del(&inode->i_list);
160     			list_add(&inode->i_list, &sb->s_dirty);
161     		}
162     	}
163     	spin_unlock(&inode_lock);
164     }
165     
166     static void __wait_on_inode(struct inode * inode)
167     {
168     	DECLARE_WAITQUEUE(wait, current);
169     
170     	add_wait_queue(&inode->i_wait, &wait);
171     repeat:
172     	set_current_state(TASK_UNINTERRUPTIBLE);
173     	if (inode->i_state & I_LOCK) {
174     		schedule();
175     		goto repeat;
176     	}
177     	remove_wait_queue(&inode->i_wait, &wait);
178     	current->state = TASK_RUNNING;
179     }
180     
181     static inline void wait_on_inode(struct inode *inode)
182     {
183     	if (inode->i_state & I_LOCK)
184     		__wait_on_inode(inode);
185     }
186     
187     
188     static inline void write_inode(struct inode *inode, int sync)
189     {
190     	if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
191     		inode->i_sb->s_op->write_inode(inode, sync);
192     }
193     
194     static inline void __iget(struct inode * inode)
195     {
196     	if (atomic_read(&inode->i_count)) {
197     		atomic_inc(&inode->i_count);
198     		return;
199     	}
200     	atomic_inc(&inode->i_count);
201     	if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
202     		list_del(&inode->i_list);
203     		list_add(&inode->i_list, &inode_in_use);
204     	}
205     	inodes_stat.nr_unused--;
206     }
207     
208     static inline void __sync_one(struct inode *inode, int sync)
209     {
210     	unsigned dirty;
211     
212     	list_del(&inode->i_list);
213     	list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
214     
215     	if (inode->i_state & I_LOCK)
216     		BUG();
217     
218     	/* Set I_LOCK, reset I_DIRTY */
219     	dirty = inode->i_state & I_DIRTY;
220     	inode->i_state |= I_LOCK;
221     	inode->i_state &= ~I_DIRTY;
222     	spin_unlock(&inode_lock);
223     
224     	filemap_fdatasync(inode->i_mapping);
225     
226     	/* Don't write the inode if only I_DIRTY_PAGES was set */
227     	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
228     		write_inode(inode, sync);
229     
230     	filemap_fdatawait(inode->i_mapping);
231     
232     	spin_lock(&inode_lock);
233     	inode->i_state &= ~I_LOCK;
234     	if (!(inode->i_state & I_FREEING)) {
235     		struct list_head *to;
236     		if (inode->i_state & I_DIRTY)
237     			to = &inode->i_sb->s_dirty;
238     		else if (atomic_read(&inode->i_count))
239     			to = &inode_in_use;
240     		else
241     			to = &inode_unused;
242     		list_del(&inode->i_list);
243     		list_add(&inode->i_list, to);
244     	}
245     	wake_up(&inode->i_wait);
246     }
247     
248     static inline void sync_one(struct inode *inode, int sync)
249     {
250     	if (inode->i_state & I_LOCK) {
251     		__iget(inode);
252     		spin_unlock(&inode_lock);
253     		__wait_on_inode(inode);
254     		iput(inode);
255     		spin_lock(&inode_lock);
256     	} else {
257     		__sync_one(inode, sync);
258     	}
259     }
260     
261     static inline void sync_list(struct list_head *head)
262     {
263     	struct list_head * tmp;
264     
265     	while ((tmp = head->prev) != head) 
266     		__sync_one(list_entry(tmp, struct inode, i_list), 0);
267     }
268     
269     static inline void wait_on_locked(struct list_head *head)
270     {
271     	struct list_head * tmp;
272     	while ((tmp = head->prev) != head) {
273     		struct inode *inode = list_entry(tmp, struct inode, i_list);
274     		__iget(inode);
275     		spin_unlock(&inode_lock);
276     		__wait_on_inode(inode);
277     		iput(inode);
278     		spin_lock(&inode_lock);
279     	}
280     }
281     
282     static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes)
283     {
284     	struct list_head *tmp = head;
285     	struct inode *inode;
286     
287     	while (nr_inodes && (tmp = tmp->prev) != head) {
288     		inode = list_entry(tmp, struct inode, i_list);
289     
290     		if (!atomic_read(&inode->i_count)) {
291     			__sync_one(inode, 0);
292     			nr_inodes--;
293     
294     			/* 
295     			 * __sync_one moved the inode to another list,
296     			 * so we have to start looking from the list head.
297     			 */
298     			tmp = head;
299     		}
300     	}
301     
302     	return nr_inodes;
303     }
304     
305     void sync_inodes_sb(struct super_block *sb)
306     {
307     	spin_lock(&inode_lock);
308     	while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) {
309     		sync_list(&sb->s_dirty);
310     		wait_on_locked(&sb->s_locked_inodes);
311     	}
312     	spin_unlock(&inode_lock);
313     }
314     
315     /*
316      * Note:
317      * We don't need to grab a reference to superblock here. If it has non-empty
318      * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
319      * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are
320      * empty. Since __sync_one() regains inode_lock before it finally moves
321      * inode from superblock lists we are OK.
322      */
323     
324     void sync_unlocked_inodes(void)
325     {
326     	struct super_block * sb;
327     	spin_lock(&inode_lock);
328     	spin_lock(&sb_lock);
329     	sb = sb_entry(super_blocks.next);
330     	for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
331     		if (!list_empty(&sb->s_dirty)) {
332     			spin_unlock(&sb_lock);
333     			sync_list(&sb->s_dirty);
334     			spin_lock(&sb_lock);
335     		}
336     	}
337     	spin_unlock(&sb_lock);
338     	spin_unlock(&inode_lock);
339     }
340     
341     /*
342      * Find a superblock with inodes that need to be synced
343      */
344     
345     static struct super_block *get_super_to_sync(void)
346     {
347     	struct list_head *p;
348     restart:
349     	spin_lock(&inode_lock);
350     	spin_lock(&sb_lock);
351     	list_for_each(p, &super_blocks) {
352     		struct super_block *s = list_entry(p,struct super_block,s_list);
353     		if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes))
354     			continue;
355     		s->s_count++;
356     		spin_unlock(&sb_lock);
357     		spin_unlock(&inode_lock);
358     		down_read(&s->s_umount);
359     		if (!s->s_root) {
360     			drop_super(s);
361     			goto restart;
362     		}
363     		return s;
364     	}
365     	spin_unlock(&sb_lock);
366     	spin_unlock(&inode_lock);
367     	return NULL;
368     }
369     
370     /**
371      *	sync_inodes
372      *	@dev: device to sync the inodes from.
373      *
374      *	sync_inodes goes through the super block's dirty list, 
375      *	writes them out, and puts them back on the normal list.
376      */
377     
378     void sync_inodes(kdev_t dev)
379     {
380     	struct super_block * s;
381     
382     	/*
383     	 * Search the super_blocks array for the device(s) to sync.
384     	 */
385     	if (dev) {
386     		if ((s = get_super(dev)) != NULL) {
387     			sync_inodes_sb(s);
388     			drop_super(s);
389     		}
390     	} else {
391     		while ((s = get_super_to_sync()) != NULL) {
392     			sync_inodes_sb(s);
393     			drop_super(s);
394     		}
395     	}
396     }
397     
398     static void try_to_sync_unused_inodes(void * arg)
399     {
400     	struct super_block * sb;
401     	int nr_inodes = inodes_stat.nr_unused;
402     
403     	spin_lock(&inode_lock);
404     	spin_lock(&sb_lock);
405     	sb = sb_entry(super_blocks.next);
406     	for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
407     		spin_unlock(&sb_lock);
408     		nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes);
409     		spin_lock(&sb_lock);
410     	}
411     	spin_unlock(&sb_lock);
412     	spin_unlock(&inode_lock);
413     }
414     
415     static struct tq_struct unused_inodes_flush_task;
416     
417     /**
418      *	write_inode_now	-	write an inode to disk
419      *	@inode: inode to write to disk
420      *	@sync: whether the write should be synchronous or not
421      *
422      *	This function commits an inode to disk immediately if it is
423      *	dirty. This is primarily needed by knfsd.
424      */
425      
426     void write_inode_now(struct inode *inode, int sync)
427     {
428     	struct super_block * sb = inode->i_sb;
429     
430     	if (sb) {
431     		spin_lock(&inode_lock);
432     		while (inode->i_state & I_DIRTY)
433     			sync_one(inode, sync);
434     		spin_unlock(&inode_lock);
435     		if (sync)
436     			wait_on_inode(inode);
437     	}
438     	else
439     		printk(KERN_ERR "write_inode_now: no super block\n");
440     }
441     
442     /**
443      * generic_osync_inode - flush all dirty data for a given inode to disk
444      * @inode: inode to write
445      * @datasync: if set, don't bother flushing timestamps
446      *
447      * This can be called by file_write functions for files which have the
448      * O_SYNC flag set, to flush dirty writes to disk.  
449      */
450     
451     int generic_osync_inode(struct inode *inode, int what)
452     {
453     	int err = 0, err2 = 0, need_write_inode_now = 0;
454     	
455     	/* 
456     	 * WARNING
457     	 *
458     	 * Currently, the filesystem write path does not pass the
459     	 * filp down to the low-level write functions.  Therefore it
460     	 * is impossible for (say) __block_commit_write to know if
461     	 * the operation is O_SYNC or not.
462     	 *
463     	 * Ideally, O_SYNC writes would have the filesystem call
464     	 * ll_rw_block as it went to kick-start the writes, and we
465     	 * could call osync_inode_buffers() here to wait only for
466     	 * those IOs which have already been submitted to the device
467     	 * driver layer.  As it stands, if we did this we'd not write
468     	 * anything to disk since our writes have not been queued by
469     	 * this point: they are still on the dirty LRU.
470     	 * 
471     	 * So, currently we will call fsync_inode_buffers() instead,
472     	 * to flush _all_ dirty buffers for this inode to disk on 
473     	 * every O_SYNC write, not just the synchronous I/Os.  --sct
474     	 */
475     
476     	if (what & OSYNC_METADATA)
477     		err = fsync_inode_buffers(inode);
478     	if (what & OSYNC_DATA)
479     		err2 = fsync_inode_data_buffers(inode);
480     	if (!err)
481     		err = err2;
482     
483     	spin_lock(&inode_lock);
484     	if ((inode->i_state & I_DIRTY) &&
485     	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
486     		need_write_inode_now = 1;
487     	spin_unlock(&inode_lock);
488     
489     	if (need_write_inode_now)
490     		write_inode_now(inode, 1);
491     	else
492     		wait_on_inode(inode);
493     
494     	return err;
495     }
496     
497     /**
498      * clear_inode - clear an inode
499      * @inode: inode to clear
500      *
501      * This is called by the filesystem to tell us
502      * that the inode is no longer useful. We just
503      * terminate it with extreme prejudice.
504      */
505      
506     void clear_inode(struct inode *inode)
507     {
508     	invalidate_inode_buffers(inode);
509            
510     	if (inode->i_data.nrpages)
511     		BUG();
512     	if (!(inode->i_state & I_FREEING))
513     		BUG();
514     	if (inode->i_state & I_CLEAR)
515     		BUG();
516     	wait_on_inode(inode);
517     	DQUOT_DROP(inode);
518     	if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->clear_inode)
519     		inode->i_sb->s_op->clear_inode(inode);
520     	if (inode->i_bdev)
521     		bd_forget(inode);
522     	else if (inode->i_cdev) {
523     		cdput(inode->i_cdev);
524     		inode->i_cdev = NULL;
525     	}
526     	inode->i_state = I_CLEAR;
527     }
528     
529     /*
530      * Dispose-list gets a local list with local inodes in it, so it doesn't
531      * need to worry about list corruption and SMP locks.
532      */
533     static void dispose_list(struct list_head * head)
534     {
535     	struct list_head * inode_entry;
536     	struct inode * inode;
537     
538     	while ((inode_entry = head->next) != head)
539     	{
540     		list_del(inode_entry);
541     
542     		inode = list_entry(inode_entry, struct inode, i_list);
543     		if (inode->i_data.nrpages)
544     			truncate_inode_pages(&inode->i_data, 0);
545     		clear_inode(inode);
546     		destroy_inode(inode);
547     		inodes_stat.nr_inodes--;
548     	}
549     }
550     
551     /*
552      * Invalidate all inodes for a device.
553      */
554     static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
555     {
556     	struct list_head *next;
557     	int busy = 0, count = 0;
558     
559     	next = head->next;
560     	for (;;) {
561     		struct list_head * tmp = next;
562     		struct inode * inode;
563     
564     		next = next->next;
565     		if (tmp == head)
566     			break;
567     		inode = list_entry(tmp, struct inode, i_list);
568     		if (inode->i_sb != sb)
569     			continue;
570     		invalidate_inode_buffers(inode);
571     		if (!atomic_read(&inode->i_count)) {
572     			list_del(&inode->i_hash);
573     			INIT_LIST_HEAD(&inode->i_hash);
574     			list_del(&inode->i_list);
575     			list_add(&inode->i_list, dispose);
576     			inode->i_state |= I_FREEING;
577     			count++;
578     			continue;
579     		}
580     		busy = 1;
581     	}
582     	/* only unused inodes may be cached with i_count zero */
583     	inodes_stat.nr_unused -= count;
584     	return busy;
585     }
586     
587     /*
588      * This is a two-stage process. First we collect all
589      * offending inodes onto the throw-away list, and in
590      * the second stage we actually dispose of them. This
591      * is because we don't want to sleep while messing
592      * with the global lists..
593      */
594      
595     /**
596      *	invalidate_inodes	- discard the inodes on a device
597      *	@sb: superblock
598      *
599      *	Discard all of the inodes for a given superblock. If the discard
600      *	fails because there are busy inodes then a non zero value is returned.
601      *	If the discard is successful all the inodes have been discarded.
602      */
603      
604     int invalidate_inodes(struct super_block * sb)
605     {
606     	int busy;
607     	LIST_HEAD(throw_away);
608     
609     	spin_lock(&inode_lock);
610     	busy = invalidate_list(&inode_in_use, sb, &throw_away);
611     	busy |= invalidate_list(&inode_unused, sb, &throw_away);
612     	busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
613     	busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away);
614     	spin_unlock(&inode_lock);
615     
616     	dispose_list(&throw_away);
617     
618     	return busy;
619     }
620      
621     int invalidate_device(kdev_t dev, int do_sync)
622     {
623     	struct super_block *sb;
624     	int res;
625     
626     	if (do_sync)
627     		fsync_dev(dev);
628     
629     	res = 0;
630     	sb = get_super(dev);
631     	if (sb) {
632     		/*
633     		 * no need to lock the super, get_super holds the
634     		 * read semaphore so the filesystem cannot go away
635     		 * under us (->put_super runs with the write lock
636     		 * hold).
637     		 */
638     		shrink_dcache_sb(sb);
639     		res = invalidate_inodes(sb);
640     		drop_super(sb);
641     	}
642     	invalidate_buffers(dev);
643     	return res;
644     }
645     
646     
647     /*
648      * This is called with the inode lock held. It searches
649      * the in-use for freeable inodes, which are moved to a
650      * temporary list and then placed on the unused list by
651      * dispose_list. 
652      *
653      * We don't expect to have to call this very often.
654      *
655      * N.B. The spinlock is released during the call to
656      *      dispose_list.
657      */
658     #define CAN_UNUSE(inode) \
659     	((((inode)->i_state | (inode)->i_data.nrpages) == 0)  && \
660     	 !inode_has_buffers(inode))
661     #define INODE(entry)	(list_entry(entry, struct inode, i_list))
662     
663     void prune_icache(int goal)
664     {
665     	LIST_HEAD(list);
666     	struct list_head *entry, *freeable = &list;
667     	int count;
668     	struct inode * inode;
669     
670     	spin_lock(&inode_lock);
671     
672     	count = 0;
673     	entry = inode_unused.prev;
674     	while (entry != &inode_unused)
675     	{
676     		struct list_head *tmp = entry;
677     
678     		entry = entry->prev;
679     		inode = INODE(tmp);
680     		if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK))
681     			BUG();
682     		if (!CAN_UNUSE(inode))
683     			continue;
684     		if (atomic_read(&inode->i_count))
685     			BUG();
686     		list_del(tmp);
687     		list_del(&inode->i_hash);
688     		INIT_LIST_HEAD(&inode->i_hash);
689     		list_add(tmp, freeable);
690     		inode->i_state |= I_FREEING;
691     		count++;
692     		if (!--goal)
693     			break;
694     	}
695     	inodes_stat.nr_unused -= count;
696     	spin_unlock(&inode_lock);
697     
698     	dispose_list(freeable);
699     
700     	/* 
701     	 * If we didn't freed enough clean inodes schedule
702     	 * a sync of the dirty inodes, we cannot do it
703     	 * from here or we're either synchronously dogslow
704     	 * or we deadlock with oom.
705     	 */
706     	if (goal)
707     		schedule_task(&unused_inodes_flush_task);
708     }
709     
710     int shrink_icache_memory(int priority, int gfp_mask)
711     {
712     	int count = 0;
713     
714     	/*
715     	 * Nasty deadlock avoidance..
716     	 *
717     	 * We may hold various FS locks, and we don't
718     	 * want to recurse into the FS that called us
719     	 * in clear_inode() and friends..
720     	 */
721     	if (!(gfp_mask & __GFP_FS))
722     		return 0;
723     
724     	count = inodes_stat.nr_unused / priority;
725     
726     	prune_icache(count);
727     	kmem_cache_shrink(inode_cachep);
728     	return 0;
729     }
730     
731     /*
732      * Called with the inode lock held.
733      * NOTE: we are not increasing the inode-refcount, you must call __iget()
734      * by hand after calling find_inode now! This simplifies iunique and won't
735      * add any additional branch in the common code.
736      */
737     static struct inode * find_inode(struct super_block * sb, unsigned long ino, struct list_head *head, find_inode_t find_actor, void *opaque)
738     {
739     	struct list_head *tmp;
740     	struct inode * inode;
741     
742     	tmp = head;
743     	for (;;) {
744     		tmp = tmp->next;
745     		inode = NULL;
746     		if (tmp == head)
747     			break;
748     		inode = list_entry(tmp, struct inode, i_hash);
749     		if (inode->i_ino != ino)
750     			continue;
751     		if (inode->i_sb != sb)
752     			continue;
753     		if (find_actor && !find_actor(inode, ino, opaque))
754     			continue;
755     		break;
756     	}
757     	return inode;
758     }
759     
760     /*
761      * This just initializes the inode fields
762      * to known values before returning the inode..
763      *
764      * i_sb, i_ino, i_count, i_state and the lists have
765      * been initialized elsewhere..
766      */
767     static void clean_inode(struct inode *inode)
768     {
769     	static struct address_space_operations empty_aops;
770     	static struct inode_operations empty_iops;
771     	static struct file_operations empty_fops;
772     	memset(&inode->u, 0, sizeof(inode->u));
773     	inode->i_sock = 0;
774     	inode->i_op = &empty_iops;
775     	inode->i_fop = &empty_fops;
776     	inode->i_nlink = 1;
777     	atomic_set(&inode->i_writecount, 0);
778     	inode->i_size = 0;
779     	inode->i_blocks = 0;
780     	inode->i_generation = 0;
781     	memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
782     	inode->i_pipe = NULL;
783     	inode->i_bdev = NULL;
784     	inode->i_cdev = NULL;
785     	inode->i_data.a_ops = &empty_aops;
786     	inode->i_data.host = inode;
787     	inode->i_data.gfp_mask = GFP_HIGHUSER;
788     	inode->i_mapping = &inode->i_data;
789     }
790     
791     /**
792      * get_empty_inode 	- obtain an inode
793      *
794      * This is called by things like the networking layer
795      * etc that want to get an inode without any inode
796      * number, or filesystems that allocate new inodes with
797      * no pre-existing information.
798      *
799      * On a successful return the inode pointer is returned. On a failure
800      * a %NULL pointer is returned. The returned inode is not on any superblock
801      * lists.
802      */
803      
804     struct inode * get_empty_inode(void)
805     {
806     	static unsigned long last_ino;
807     	struct inode * inode;
808     
809     	spin_lock_prefetch(&inode_lock);
810     	
811     	inode = alloc_inode();
812     	if (inode)
813     	{
814     		spin_lock(&inode_lock);
815     		inodes_stat.nr_inodes++;
816     		list_add(&inode->i_list, &inode_in_use);
817     		inode->i_sb = NULL;
818     		inode->i_dev = 0;
819     		inode->i_ino = ++last_ino;
820     		inode->i_flags = 0;
821     		atomic_set(&inode->i_count, 1);
822     		inode->i_state = 0;
823     		spin_unlock(&inode_lock);
824     		clean_inode(inode);
825     	}
826     	return inode;
827     }
828     
829     /*
830      * This is called without the inode lock held.. Be careful.
831      *
832      * We no longer cache the sb_flags in i_flags - see fs.h
833      *	-- rmk@arm.uk.linux.org
834      */
835     static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, struct list_head *head, find_inode_t find_actor, void *opaque)
836     {
837     	struct inode * inode;
838     
839     	inode = alloc_inode();
840     	if (inode) {
841     		struct inode * old;
842     
843     		spin_lock(&inode_lock);
844     		/* We released the lock, so.. */
845     		old = find_inode(sb, ino, head, find_actor, opaque);
846     		if (!old) {
847     			inodes_stat.nr_inodes++;
848     			list_add(&inode->i_list, &inode_in_use);
849     			list_add(&inode->i_hash, head);
850     			inode->i_sb = sb;
851     			inode->i_dev = sb->s_dev;
852     			inode->i_ino = ino;
853     			inode->i_flags = 0;
854     			atomic_set(&inode->i_count, 1);
855     			inode->i_state = I_LOCK;
856     			spin_unlock(&inode_lock);
857     
858     			clean_inode(inode);
859     
860     			/* reiserfs specific hack right here.  We don't
861     			** want this to last, and are looking for VFS changes
862     			** that will allow us to get rid of it.
863     			** -- mason@suse.com 
864     			*/
865     			if (sb->s_op->read_inode2) {
866     				sb->s_op->read_inode2(inode, opaque) ;
867     			} else {
868     				sb->s_op->read_inode(inode);
869     			}
870     
871     			/*
872     			 * This is special!  We do not need the spinlock
873     			 * when clearing I_LOCK, because we're guaranteed
874     			 * that nobody else tries to do anything about the
875     			 * state of the inode when it is locked, as we
876     			 * just created it (so there can be no old holders
877     			 * that haven't tested I_LOCK).
878     			 */
879     			inode->i_state &= ~I_LOCK;
880     			wake_up(&inode->i_wait);
881     
882     			return inode;
883     		}
884     
885     		/*
886     		 * Uhhuh, somebody else created the same inode under
887     		 * us. Use the old inode instead of the one we just
888     		 * allocated.
889     		 */
890     		__iget(old);
891     		spin_unlock(&inode_lock);
892     		destroy_inode(inode);
893     		inode = old;
894     		wait_on_inode(inode);
895     	}
896     	return inode;
897     }
898     
899     static inline unsigned long hash(struct super_block *sb, unsigned long i_ino)
900     {
901     	unsigned long tmp = i_ino + ((unsigned long) sb / L1_CACHE_BYTES);
902     	tmp = tmp + (tmp >> I_HASHBITS);
903     	return tmp & I_HASHMASK;
904     }
905     
906     /* Yeah, I know about quadratic hash. Maybe, later. */
907     
908     /**
909      *	iunique - get a unique inode number
910      *	@sb: superblock
911      *	@max_reserved: highest reserved inode number
912      *
913      *	Obtain an inode number that is unique on the system for a given
914      *	superblock. This is used by file systems that have no natural
915      *	permanent inode numbering system. An inode number is returned that
916      *	is higher than the reserved limit but unique.
917      *
918      *	BUGS:
919      *	With a large number of inodes live on the file system this function
920      *	currently becomes quite slow.
921      */
922      
923     ino_t iunique(struct super_block *sb, ino_t max_reserved)
924     {
925     	static ino_t counter = 0;
926     	struct inode *inode;
927     	struct list_head * head;
928     	ino_t res;
929     	spin_lock(&inode_lock);
930     retry:
931     	if (counter > max_reserved) {
932     		head = inode_hashtable + hash(sb,counter);
933     		inode = find_inode(sb, res = counter++, head, NULL, NULL);
934     		if (!inode) {
935     			spin_unlock(&inode_lock);
936     			return res;
937     		}
938     	} else {
939     		counter = max_reserved + 1;
940     	}
941     	goto retry;
942     	
943     }
944     
945     struct inode *igrab(struct inode *inode)
946     {
947     	spin_lock(&inode_lock);
948     	if (!(inode->i_state & I_FREEING))
949     		__iget(inode);
950     	else
951     		/*
952     		 * Handle the case where s_op->clear_inode is not been
953     		 * called yet, and somebody is calling igrab
954     		 * while the inode is getting freed.
955     		 */
956     		inode = NULL;
957     	spin_unlock(&inode_lock);
958     	if (inode)
959     		wait_on_inode(inode);
960     	return inode;
961     }
962     
963     
964     struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque)
965     {
966     	struct list_head * head = inode_hashtable + hash(sb,ino);
967     	struct inode * inode;
968     
969     	spin_lock(&inode_lock);
970     	inode = find_inode(sb, ino, head, find_actor, opaque);
971     	if (inode) {
972     		__iget(inode);
973     		spin_unlock(&inode_lock);
974     		wait_on_inode(inode);
975     		return inode;
976     	}
977     	spin_unlock(&inode_lock);
978     
979     	/*
980     	 * get_new_inode() will do the right thing, re-trying the search
981     	 * in case it had to block at any point.
982     	 */
983     	return get_new_inode(sb, ino, head, find_actor, opaque);
984     }
985     
986     /**
987      *	insert_inode_hash - hash an inode
988      *	@inode: unhashed inode
989      *
990      *	Add an inode to the inode hash for this superblock. If the inode
991      *	has no superblock it is added to a separate anonymous chain.
992      */
993      
994     void insert_inode_hash(struct inode *inode)
995     {
996     	struct list_head *head = &anon_hash_chain;
997     	if (inode->i_sb)
998     		head = inode_hashtable + hash(inode->i_sb, inode->i_ino);
999     	spin_lock(&inode_lock);
1000     	list_add(&inode->i_hash, head);
1001     	spin_unlock(&inode_lock);
1002     }
1003     
1004     /**
1005      *	remove_inode_hash - remove an inode from the hash
1006      *	@inode: inode to unhash
1007      *
1008      *	Remove an inode from the superblock or anonymous hash.
1009      */
1010      
1011     void remove_inode_hash(struct inode *inode)
1012     {
1013     	spin_lock(&inode_lock);
1014     	list_del(&inode->i_hash);
1015     	INIT_LIST_HEAD(&inode->i_hash);
1016     	spin_unlock(&inode_lock);
1017     }
1018     
1019     /**
1020      *	iput	- put an inode 
1021      *	@inode: inode to put
1022      *
1023      *	Puts an inode, dropping its usage count. If the inode use count hits
1024      *	zero the inode is also then freed and may be destroyed.
1025      */
1026      
1027     void iput(struct inode *inode)
1028     {
1029     	if (inode) {
1030     		struct super_operations *op = NULL;
1031     
1032     		if (inode->i_state == I_CLEAR)
1033     			BUG();
1034     
1035     		if (inode->i_sb && inode->i_sb->s_op)
1036     			op = inode->i_sb->s_op;
1037     		if (op && op->put_inode)
1038     			op->put_inode(inode);
1039     
1040     		if (!atomic_dec_and_lock(&inode->i_count, &inode_lock))
1041     			return;
1042     
1043     		if (!inode->i_nlink) {
1044     			list_del(&inode->i_hash);
1045     			INIT_LIST_HEAD(&inode->i_hash);
1046     			list_del(&inode->i_list);
1047     			INIT_LIST_HEAD(&inode->i_list);
1048     			inode->i_state|=I_FREEING;
1049     			inodes_stat.nr_inodes--;
1050     			spin_unlock(&inode_lock);
1051     
1052     			if (inode->i_data.nrpages)
1053     				truncate_inode_pages(&inode->i_data, 0);
1054     
1055     			if (op && op->delete_inode) {
1056     				void (*delete)(struct inode *) = op->delete_inode;
1057     				if (!is_bad_inode(inode))
1058     					DQUOT_INIT(inode);
1059     				/* s_op->delete_inode internally recalls clear_inode() */
1060     				delete(inode);
1061     			} else
1062     				clear_inode(inode);
1063     			if (inode->i_state != I_CLEAR)
1064     				BUG();
1065     		} else {
1066     			if (!list_empty(&inode->i_hash)) {
1067     				if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
1068     					list_del(&inode->i_list);
1069     					list_add(&inode->i_list, &inode_unused);
1070     				}
1071     				inodes_stat.nr_unused++;
1072     				spin_unlock(&inode_lock);
1073     				return;
1074     			} else {
1075     				/* magic nfs path */
1076     				list_del(&inode->i_list);
1077     				INIT_LIST_HEAD(&inode->i_list);
1078     				inode->i_state|=I_FREEING;
1079     				inodes_stat.nr_inodes--;
1080     				spin_unlock(&inode_lock);
1081     				if (inode->i_data.nrpages)
1082     					truncate_inode_pages(&inode->i_data, 0);
1083     				clear_inode(inode);
1084     			}
1085     		}
1086     		destroy_inode(inode);
1087     	}
1088     }
1089     
1090     void force_delete(struct inode *inode)
1091     {
1092     	/*
1093     	 * Kill off unused inodes ... iput() will unhash and
1094     	 * delete the inode if we set i_nlink to zero.
1095     	 */
1096     	if (atomic_read(&inode->i_count) == 1)
1097     		inode->i_nlink = 0;
1098     }
1099     
1100     /**
1101      *	bmap	- find a block number in a file
1102      *	@inode: inode of file
1103      *	@block: block to find
1104      *
1105      *	Returns the block number on the device holding the inode that
1106      *	is the disk block number for the block of the file requested.
1107      *	That is, asked for block 4 of inode 1 the function will return the
1108      *	disk block relative to the disk start that holds that block of the 
1109      *	file.
1110      */
1111      
1112     int bmap(struct inode * inode, int block)
1113     {
1114     	int res = 0;
1115     	if (inode->i_mapping->a_ops->bmap)
1116     		res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
1117     	return res;
1118     }
1119     
1120     /*
1121      * Initialize the hash tables.
1122      */
1123     void __init inode_init(unsigned long mempages)
1124     {
1125     	struct list_head *head;
1126     	unsigned long order;
1127     	unsigned int nr_hash;
1128     	int i;
1129     
1130     	mempages >>= (14 - PAGE_SHIFT);
1131     	mempages *= sizeof(struct list_head);
1132     	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
1133     		;
1134     
1135     	do {
1136     		unsigned long tmp;
1137     
1138     		nr_hash = (1UL << order) * PAGE_SIZE /
1139     			sizeof(struct list_head);
1140     		i_hash_mask = (nr_hash - 1);
1141     
1142     		tmp = nr_hash;
1143     		i_hash_shift = 0;
1144     		while ((tmp >>= 1UL) != 0UL)
1145     			i_hash_shift++;
1146     
1147     		inode_hashtable = (struct list_head *)
1148     			__get_free_pages(GFP_ATOMIC, order);
1149     	} while (inode_hashtable == NULL && --order >= 0);
1150     
1151     	printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1152     			nr_hash, order, (PAGE_SIZE << order));
1153     
1154     	if (!inode_hashtable)
1155     		panic("Failed to allocate inode hash table\n");
1156     
1157     	head = inode_hashtable;
1158     	i = nr_hash;
1159     	do {
1160     		INIT_LIST_HEAD(head);
1161     		head++;
1162     		i--;
1163     	} while (i);
1164     
1165     	/* inode slab cache */
1166     	inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
1167     					 0, SLAB_HWCACHE_ALIGN, init_once,
1168     					 NULL);
1169     	if (!inode_cachep)
1170     		panic("cannot create inode slab cache");
1171     
1172     	unused_inodes_flush_task.routine = try_to_sync_unused_inodes;
1173     }
1174     
1175     /**
1176      *	update_atime	-	update the access time
1177      *	@inode: inode accessed
1178      *
1179      *	Update the accessed time on an inode and mark it for writeback.
1180      *	This function automatically handles read only file systems and media,
1181      *	as well as the "noatime" flag and inode specific "noatime" markers.
1182      */
1183      
1184     void update_atime (struct inode *inode)
1185     {
1186     	if ( IS_NOATIME (inode) ) return;
1187     	if ( IS_NODIRATIME (inode) && S_ISDIR (inode->i_mode) ) return;
1188     	if ( IS_RDONLY (inode) ) return;
1189     	inode->i_atime = CURRENT_TIME;
1190     	mark_inode_dirty_sync (inode);
1191     }   /*  End Function update_atime  */
1192     
1193     
1194     /*
1195      *	Quota functions that want to walk the inode lists..
1196      */
1197     #ifdef CONFIG_QUOTA
1198     
1199     /* Functions back in dquot.c */
1200     void put_dquot_list(struct list_head *);
1201     int remove_inode_dquot_ref(struct inode *, short, struct list_head *);
1202     
1203     void remove_dquot_ref(struct super_block *sb, short type)
1204     {
1205     	struct inode *inode;
1206     	struct list_head *act_head;
1207     	LIST_HEAD(tofree_head);
1208     
1209     	if (!sb->dq_op)
1210     		return;	/* nothing to do */
1211     
1212     	/* We have to be protected against other CPUs */
1213     	spin_lock(&inode_lock);
1214      
1215     	list_for_each(act_head, &inode_in_use) {
1216     		inode = list_entry(act_head, struct inode, i_list);
1217     		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
1218     			remove_inode_dquot_ref(inode, type, &tofree_head);
1219     	}
1220     	list_for_each(act_head, &inode_unused) {
1221     		inode = list_entry(act_head, struct inode, i_list);
1222     		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
1223     			remove_inode_dquot_ref(inode, type, &tofree_head);
1224     	}
1225     	list_for_each(act_head, &sb->s_dirty) {
1226     		inode = list_entry(act_head, struct inode, i_list);
1227     		if (IS_QUOTAINIT(inode))
1228     			remove_inode_dquot_ref(inode, type, &tofree_head);
1229     	}
1230     	list_for_each(act_head, &sb->s_locked_inodes) {
1231     		inode = list_entry(act_head, struct inode, i_list);
1232     		if (IS_QUOTAINIT(inode))
1233     			remove_inode_dquot_ref(inode, type, &tofree_head);
1234     	}
1235     	spin_unlock(&inode_lock);
1236     
1237     	put_dquot_list(&tofree_head);
1238     }
1239     
1240     #endif
1241