File: /usr/src/linux/mm/swapfile.c

1     /*
2      *  linux/mm/swapfile.c
3      *
4      *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5      *  Swap reorganised 29.12.95, Stephen Tweedie
6      */
7     
8     #include <linux/slab.h>
9     #include <linux/smp_lock.h>
10     #include <linux/kernel_stat.h>
11     #include <linux/swap.h>
12     #include <linux/swapctl.h>
13     #include <linux/blkdev.h> /* for blk_size */
14     #include <linux/vmalloc.h>
15     #include <linux/pagemap.h>
16     #include <linux/shm.h>
17     #include <linux/compiler.h>
18     
19     #include <asm/pgtable.h>
20     
/*
 * File-scope state shared by the swap allocator in this file.
 * NOTE(review): swaplock is presumably the lock behind swap_list_lock();
 * that macro is defined outside this file -- confirm.
 */
21     spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
/* One past the highest swap_info[] slot sys_swapon() has ever handed out. */
22     unsigned int nr_swapfiles;
/* Sum of ->pages over the areas currently enabled (see swapon/swapoff). */
23     int total_swap_pages;
/* Number of "swap entry overflow" events seen by swap_duplicate();
 * reset to 0 by try_to_unuse() once it has cleared such an entry. */
24     static int swap_overflow;
25     
/* Message fragments used by the printk() error paths below. */
26     static const char Bad_file[] = "Bad swap file entry ";
27     static const char Unused_file[] = "Unused swap file entry ";
28     static const char Bad_offset[] = "Bad swap offset entry ";
29     static const char Unused_offset[] = "Unused swap offset entry ";
30     
/* Both members start at -1; the code below treats a negative
 * swap_list.head / swap_list.next as "no swap area". */
31     struct swap_list_t swap_list = {-1, -1};
32     
/* One descriptor per swap area, indexed by the entry's swap type. */
33     struct swap_info_struct swap_info[MAX_SWAPFILES];
34     
/* Number of slots scan_swap_map() hands out sequentially before it
 * goes looking for a new cluster. */
35     #define SWAPFILE_CLUSTER 256
36     
37     static inline int scan_swap_map(struct swap_info_struct *si)
38     {
39     	unsigned long offset;
40     	/* 
41     	 * We try to cluster swap pages by allocating them
42     	 * sequentially in swap.  Once we've allocated
43     	 * SWAPFILE_CLUSTER pages this way, however, we resort to
44     	 * first-free allocation, starting a new cluster.  This
45     	 * prevents us from scattering swap pages all over the entire
46     	 * swap partition, so that we reduce overall disk seek times
47     	 * between swap pages.  -- sct */
48     	if (si->cluster_nr) {
49     		while (si->cluster_next <= si->highest_bit) {
50     			offset = si->cluster_next++;
51     			if (si->swap_map[offset])
52     				continue;
53     			si->cluster_nr--;
54     			goto got_page;
55     		}
56     	}
57     	si->cluster_nr = SWAPFILE_CLUSTER;
58     
59     	/* try to find an empty (even not aligned) cluster. */
60     	offset = si->lowest_bit;
61      check_next_cluster:
62     	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
63     	{
64     		int nr;
65     		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
66     			if (si->swap_map[nr])
67     			{
68     				offset = nr+1;
69     				goto check_next_cluster;
70     			}
71     		/* We found a completly empty cluster, so start
72     		 * using it.
73     		 */
74     		goto got_page;
75     	}
76     	/* No luck, so now go finegrined as usual. -Andrea */
77     	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
78     		if (si->swap_map[offset])
79     			continue;
80     		si->lowest_bit = offset+1;
81     	got_page:
82     		if (offset == si->lowest_bit)
83     			si->lowest_bit++;
84     		if (offset == si->highest_bit)
85     			si->highest_bit--;
86     		if (si->lowest_bit > si->highest_bit) {
87     			si->lowest_bit = si->max;
88     			si->highest_bit = 0;
89     		}
90     		/* Initial count 1 for user reference + 1 for swap cache */
91     		si->swap_map[offset] = 2;
92     		nr_swap_pages--;
93     		si->cluster_next = offset+1;
94     		return offset;
95     	}
96     	si->lowest_bit = si->max;
97     	si->highest_bit = 0;
98     	return 0;
99     }
100     
101     /*
102      * Callers of get_swap_page must hold swap_list_lock across the call,
103      * and across the following add_to_swap_cache, to guard against races
104      * with read_swap_cache_async.
105      */
106     swp_entry_t get_swap_page(void)
107     {
108     	struct swap_info_struct * p;
109     	unsigned long offset;
110     	swp_entry_t entry;
111     	int type, wrapped = 0;
112     
113     	entry.val = 0;	/* Out of memory */
114     	type = swap_list.next;
115     	if (type < 0)
116     		goto out;
117     	if (nr_swap_pages <= 0)
118     		goto out;
119     
120     	while (1) {
121     		p = &swap_info[type];
122     		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
123     			swap_device_lock(p);
124     			offset = scan_swap_map(p);
125     			swap_device_unlock(p);
126     			if (offset) {
127     				entry = SWP_ENTRY(type,offset);
128     				type = swap_info[type].next;
129     				if (type < 0 ||
130     					p->prio != swap_info[type].prio) {
131     						swap_list.next = swap_list.head;
132     				} else {
133     					swap_list.next = type;
134     				}
135     				goto out;
136     			}
137     		}
138     		type = p->next;
139     		if (!wrapped) {
140     			if (type < 0 || p->prio != swap_info[type].prio) {
141     				type = swap_list.head;
142     				wrapped = 1;
143     			}
144     		} else
145     			if (type < 0)
146     				goto out;	/* out of swap space */
147     	}
148     out:
149     	return entry;
150     }
151     
152     /*
153      * Caller has made sure that the swapdevice corresponding to entry
154      * is still around or has not been recycled.
155      */
156     void swap_free(swp_entry_t entry)
157     {
158     	struct swap_info_struct * p;
159     	unsigned long offset, type;
160     
161     	if (!entry.val)
162     		goto out;
163     
164     	type = SWP_TYPE(entry);
165     	if (type >= nr_swapfiles)
166     		goto bad_nofile;
167     	p = & swap_info[type];
168     	if (!(p->flags & SWP_USED))
169     		goto bad_device;
170     	offset = SWP_OFFSET(entry);
171     	if (offset >= p->max)
172     		goto bad_offset;
173     	if (!p->swap_map[offset])
174     		goto bad_free;
175     	swap_list_lock();
176     	if (p->prio > swap_info[swap_list.next].prio)
177     		swap_list.next = type;
178     	swap_device_lock(p);
179     	if (p->swap_map[offset] < SWAP_MAP_MAX) {
180     		if (!--(p->swap_map[offset])) {
181     			if (offset < p->lowest_bit)
182     				p->lowest_bit = offset;
183     			if (offset > p->highest_bit)
184     				p->highest_bit = offset;
185     			nr_swap_pages++;
186     		}
187     	}
188     	swap_device_unlock(p);
189     	swap_list_unlock();
190     out:
191     	return;
192     
193     bad_nofile:
194     	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
195     	goto out;
196     bad_device:
197     	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
198     	goto out;
199     bad_offset:
200     	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
201     	goto out;
202     bad_free:
203     	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
204     	goto out;
205     }
206     
207     /*
208      * The swap entry has been read in advance, and we return 1 to indicate
209      * that the page has been used or is no longer needed.
210      *
211      * Always set the resulting pte to be nowrite (the same as COW pages
212      * after one process has exited).  We don't know just how many PTEs will
213      * share this swap entry, so be cautious and let do_wp_page work out
214      * what to do if a write is requested later.
215      */
216     /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
217     static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
218     	pte_t *dir, swp_entry_t entry, struct page* page)
219     {
220     	pte_t pte = *dir;
221     
222     	if (likely(pte_to_swp_entry(pte).val != entry.val))
223     		return;
224     	if (unlikely(pte_none(pte) || pte_present(pte)))
225     		return;
226     	get_page(page);
227     	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
228     	swap_free(entry);
229     	++vma->vm_mm->rss;
230     }
231     
232     /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
233     static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
234     	unsigned long address, unsigned long size, unsigned long offset,
235     	swp_entry_t entry, struct page* page)
236     {
237     	pte_t * pte;
238     	unsigned long end;
239     
240     	if (pmd_none(*dir))
241     		return;
242     	if (pmd_bad(*dir)) {
243     		pmd_ERROR(*dir);
244     		pmd_clear(dir);
245     		return;
246     	}
247     	pte = pte_offset(dir, address);
248     	offset += address & PMD_MASK;
249     	address &= ~PMD_MASK;
250     	end = address + size;
251     	if (end > PMD_SIZE)
252     		end = PMD_SIZE;
253     	do {
254     		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
255     		address += PAGE_SIZE;
256     		pte++;
257     	} while (address && (address < end));
258     }
259     
260     /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
261     static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
262     	unsigned long address, unsigned long size,
263     	swp_entry_t entry, struct page* page)
264     {
265     	pmd_t * pmd;
266     	unsigned long offset, end;
267     
268     	if (pgd_none(*dir))
269     		return;
270     	if (pgd_bad(*dir)) {
271     		pgd_ERROR(*dir);
272     		pgd_clear(dir);
273     		return;
274     	}
275     	pmd = pmd_offset(dir, address);
276     	offset = address & PGDIR_MASK;
277     	address &= ~PGDIR_MASK;
278     	end = address + size;
279     	if (end > PGDIR_SIZE)
280     		end = PGDIR_SIZE;
281     	if (address >= end)
282     		BUG();
283     	do {
284     		unuse_pmd(vma, pmd, address, end - address, offset, entry,
285     			  page);
286     		address = (address + PMD_SIZE) & PMD_MASK;
287     		pmd++;
288     	} while (address && (address < end));
289     }
290     
291     /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
292     static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
293     			swp_entry_t entry, struct page* page)
294     {
295     	unsigned long start = vma->vm_start, end = vma->vm_end;
296     
297     	if (start >= end)
298     		BUG();
299     	do {
300     		unuse_pgd(vma, pgdir, start, end - start, entry, page);
301     		start = (start + PGDIR_SIZE) & PGDIR_MASK;
302     		pgdir++;
303     	} while (start && (start < end));
304     }
305     
306     static void unuse_process(struct mm_struct * mm,
307     			swp_entry_t entry, struct page* page)
308     {
309     	struct vm_area_struct* vma;
310     
311     	/*
312     	 * Go through process' page directory.
313     	 */
314     	spin_lock(&mm->page_table_lock);
315     	for (vma = mm->mmap; vma; vma = vma->vm_next) {
316     		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
317     		unuse_vma(vma, pgd, entry, page);
318     	}
319     	spin_unlock(&mm->page_table_lock);
320     	return;
321     }
322     
323     /*
324      * Scan swap_map from current position to next entry still in use.
325      * Recycle to start on reaching the end, returning 0 when empty.
326      */
327     static int find_next_to_unuse(struct swap_info_struct *si, int prev)
328     {
329     	int max = si->max;
330     	int i = prev;
331     	int count;
332     
333     	/*
334     	 * No need for swap_device_lock(si) here: we're just looking
335     	 * for whether an entry is in use, not modifying it; false
336     	 * hits are okay, and sys_swapoff() has already prevented new
337     	 * allocations from this area (while holding swap_list_lock()).
338     	 */
339     	for (;;) {
340     		if (++i >= max) {
341     			if (!prev) {
342     				i = 0;
343     				break;
344     			}
345     			/*
346     			 * No entries in use at top of swap_map,
347     			 * loop back to start and recheck there.
348     			 */
349     			max = prev + 1;
350     			prev = 0;
351     			i = 1;
352     		}
353     		count = si->swap_map[i];
354     		if (count && count != SWAP_MAP_BAD)
355     			break;
356     	}
357     	return i;
358     }
359     
360     /*
361      * We completely avoid races by reading each swap page in advance,
362      * and then search for the process using it.  All the necessary
363      * page table adjustments can then be made atomically.
         *
         * type is the index into swap_info[] of the area being emptied.
         * Returns 0 once find_next_to_unuse() reports no slot in use, or
         * -ENOMEM if read_swap_cache_async() returned no page for a slot
         * that is still in use.
         *
         * On return the hold taken on start_mm has been dropped again.
364      */
365     static int try_to_unuse(unsigned int type)
366     {
367     	struct swap_info_struct * si = &swap_info[type];
368     	struct mm_struct *start_mm;
369     	unsigned short *swap_map;
370     	unsigned short swcount;
371     	struct page *page;
372     	swp_entry_t entry;
373     	int i = 0;
374     	int retval = 0;
375     	int reset_overflow = 0;
376     
377     	/*
378     	 * When searching mms for an entry, a good strategy is to
379     	 * start at the first mm we freed the previous entry from
380     	 * (though actually we don't notice whether we or coincidence
381     	 * freed the entry).  Initialize this start_mm with a hold.
382     	 *
383     	 * A simpler strategy would be to start at the last mm we
384     	 * freed the previous entry from; but that would take less
385     	 * advantage of mmlist ordering (now preserved by swap_out()),
386     	 * which clusters forked address spaces together, most recent
387     	 * child immediately after parent.  If we race with dup_mmap(),
388     	 * we very much want to resolve parent before child, otherwise
389     	 * we may miss some entries: using last mm would invert that.
390     	 */
391     	start_mm = &init_mm;
392     	atomic_inc(&init_mm.mm_users);
393     
394     	/*
395     	 * Keep on scanning until all entries have gone.  Usually,
396     	 * one pass through swap_map is enough, but not necessarily:
397     	 * mmput() removes mm from mmlist before exit_mmap() and its
398     	 * zap_page_range().  That's not too bad, those entries are
399     	 * on their way out, and handled faster there than here.
400     	 * do_munmap() behaves similarly, taking the range out of mm's
401     	 * vma list before zap_page_range().  But unfortunately, when
402     	 * unmapping a part of a vma, it takes the whole out first,
403     	 * then reinserts what's left after (might even reschedule if
404     	 * open() method called) - so swap entries may be invisible
405     	 * to swapoff for a while, then reappear - but that is rare.
406     	 */
407     	while ((i = find_next_to_unuse(si, i))) {
408     		/* 
409     		 * Get a page for the entry, using the existing swap
410     		 * cache page if there is one.  Otherwise, get a clean
411     		 * page and read the swap into it. 
412     		 */
413     		swap_map = &si->swap_map[i];
414     		entry = SWP_ENTRY(type, i);
415     		page = read_swap_cache_async(entry);
416     		if (!page) {
417     			/*
418     			 * Either swap_duplicate() failed because entry
419     			 * has been freed independently, and will not be
420     			 * reused since sys_swapoff() already disabled
421     			 * allocation from here, or alloc_page() failed.
422     			 */
423     			if (!*swap_map)
424     				continue;
425     			retval = -ENOMEM;
426     			break;
427     		}
428     
429     		/*
430     		 * Don't hold on to start_mm if it looks like exiting.
431     		 * Can mmput ever block? if so, then we cannot risk
432     		 * it between deleting the page from the swap cache,
433     		 * and completing the search through mms (and cannot
434     		 * use it to avoid the long hold on mmlist_lock there).
435     		 */
436     		if (atomic_read(&start_mm->mm_users) == 1) {
437     			mmput(start_mm);
438     			start_mm = &init_mm;
439     			atomic_inc(&init_mm.mm_users);
440     		}
441     
442     		/*
443     		 * Wait for and lock page.  Remove it from swap cache
444     		 * so try_to_swap_out won't bump swap count.  Mark dirty
445     		 * so try_to_swap_out will preserve it without us having
446     		 * to mark any present ptes as dirty: so we can skip
447     		 * searching processes once swap count has all gone.
448     		 */
449     		lock_page(page);
450     		if (PageSwapCache(page))
451     			delete_from_swap_cache(page);
452     		SetPageDirty(page);
453     		UnlockPage(page);
454     		flush_page_to_ram(page);
455     
456     		/*
457     		 * Remove all references to entry, without blocking.
458     		 * Whenever we reach init_mm, there's no address space
459     		 * to search, but use it as a reminder to search shmem.
460     		 */
461     		swcount = *swap_map;
462     		if (swcount) {
463     			if (start_mm == &init_mm)
464     				shmem_unuse(entry, page);
465     			else
466     				unuse_process(start_mm, entry, page);
467     		}
468     		if (*swap_map) {
        			/*
        			 * start_mm did not clear the entry: walk the
        			 * rest of mmlist, once round, until the count
        			 * reaches zero.  set_start_mm is non-zero when
        			 * start_mm did not lower the count at all; in
        			 * that case (or after passing init_mm) the next
        			 * mm that does lower it becomes new_start_mm.
        			 */
469     			int set_start_mm = (*swap_map >= swcount);
470     			struct list_head *p = &start_mm->mmlist;
471     			struct mm_struct *new_start_mm = start_mm;
472     			struct mm_struct *mm;
473     
474     			spin_lock(&mmlist_lock);
475     			while (*swap_map && (p = p->next) != &start_mm->mmlist) {
476     				mm = list_entry(p, struct mm_struct, mmlist);
477     				swcount = *swap_map;
478     				if (mm == &init_mm) {
479     					set_start_mm = 1;
480     					shmem_unuse(entry, page);
481     				} else
482     					unuse_process(mm, entry, page);
483     				if (set_start_mm && *swap_map < swcount) {
484     					new_start_mm = mm;
485     					set_start_mm = 0;
486     				}
487     			}
        			/*
        			 * Take the hold on the new start mm before
        			 * releasing mmlist_lock, and only then drop
        			 * the hold on the old one.
        			 */
488     			atomic_inc(&new_start_mm->mm_users);
489     			spin_unlock(&mmlist_lock);
490     			mmput(start_mm);
491     			start_mm = new_start_mm;
492     		}
493     		page_cache_release(page);
494     
495     		/*
496     		 * How could swap count reach 0x7fff when the maximum
497     		 * pid is 0x7fff, and there's no way to repeat a swap
498     		 * page within an mm (except in shmem, where it's the
499     		 * shared object which takes the reference count)?
500     		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
501     		 *
502     		 * If that's wrong, then we should worry more about
503     		 * exit_mmap() and do_munmap() cases described above:
504     		 * we might be resetting SWAP_MAP_MAX too early here.
505     		 * We know "Undead"s can happen, they're okay, so don't
506     		 * report them; but do report if we reset SWAP_MAP_MAX.
507     		 */
508     		if (*swap_map == SWAP_MAP_MAX) {
509     			swap_list_lock();
510     			swap_device_lock(si);
511     			nr_swap_pages++;
512     			*swap_map = 0;
513     			swap_device_unlock(si);
514     			swap_list_unlock();
515     			reset_overflow = 1;
516     		}
517     
518     		/*
519     		 * Make sure that we aren't completely killing
520     		 * interactive performance.  Interruptible check on
521     		 * signal_pending() would be nice, but changes the spec?
522     		 */
523     		if (current->need_resched)
524     			schedule();
525     		else {
        			/*
        			 * NOTE(review): presumably dropping and retaking
        			 * the BKL gives other waiters a chance to run
        			 * -- confirm.
        			 */
526     			unlock_kernel();
527     			lock_kernel();
528     		}
529     	}
530     
531     	mmput(start_mm);
532     	if (reset_overflow) {
533     		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
534     		swap_overflow = 0;
535     	}
536     	return retval;
537     }
538     
539     asmlinkage long sys_swapoff(const char * specialfile)
540     {
541     	struct swap_info_struct * p = NULL;
542     	unsigned short *swap_map;
543     	struct nameidata nd;
544     	int i, type, prev;
545     	int err;
546     	
547     	if (!capable(CAP_SYS_ADMIN))
548     		return -EPERM;
549     
550     	err = user_path_walk(specialfile, &nd);
551     	if (err)
552     		goto out;
553     
554     	lock_kernel();
555     	prev = -1;
556     	swap_list_lock();
557     	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
558     		p = swap_info + type;
559     		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
560     			if (p->swap_file == nd.dentry)
561     			  break;
562     		}
563     		prev = type;
564     	}
565     	err = -EINVAL;
566     	if (type < 0) {
567     		swap_list_unlock();
568     		goto out_dput;
569     	}
570     
571     	if (prev < 0) {
572     		swap_list.head = p->next;
573     	} else {
574     		swap_info[prev].next = p->next;
575     	}
576     	if (type == swap_list.next) {
577     		/* just pick something that's safe... */
578     		swap_list.next = swap_list.head;
579     	}
580     	nr_swap_pages -= p->pages;
581     	total_swap_pages -= p->pages;
582     	p->flags = SWP_USED;
583     	swap_list_unlock();
584     	err = try_to_unuse(type);
585     	if (err) {
586     		/* re-insert swap space back into swap_list */
587     		swap_list_lock();
588     		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
589     			if (p->prio >= swap_info[i].prio)
590     				break;
591     		p->next = i;
592     		if (prev < 0)
593     			swap_list.head = swap_list.next = p - swap_info;
594     		else
595     			swap_info[prev].next = p - swap_info;
596     		nr_swap_pages += p->pages;
597     		total_swap_pages += p->pages;
598     		p->flags = SWP_WRITEOK;
599     		swap_list_unlock();
600     		goto out_dput;
601     	}
602     	if (p->swap_device)
603     		blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
604     	path_release(&nd);
605     
606     	swap_list_lock();
607     	swap_device_lock(p);
608     	nd.mnt = p->swap_vfsmnt;
609     	nd.dentry = p->swap_file;
610     	p->swap_vfsmnt = NULL;
611     	p->swap_file = NULL;
612     	p->swap_device = 0;
613     	p->max = 0;
614     	swap_map = p->swap_map;
615     	p->swap_map = NULL;
616     	p->flags = 0;
617     	swap_device_unlock(p);
618     	swap_list_unlock();
619     	vfree(swap_map);
620     	err = 0;
621     
622     out_dput:
623     	unlock_kernel();
624     	path_release(&nd);
625     out:
626     	return err;
627     }
628     
629     int get_swaparea_info(char *buf)
630     {
631     	char * page = (char *) __get_free_page(GFP_KERNEL);
632     	struct swap_info_struct *ptr = swap_info;
633     	int i, j, len = 0, usedswap;
634     
635     	if (!page)
636     		return -ENOMEM;
637     
638     	len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
639     	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
640     		if ((ptr->flags & SWP_USED) && ptr->swap_map) {
641     			char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
642     						page, PAGE_SIZE);
643     
644     			len += sprintf(buf + len, "%-31s ", path);
645     
646     			if (!ptr->swap_device)
647     				len += sprintf(buf + len, "file\t\t");
648     			else
649     				len += sprintf(buf + len, "partition\t");
650     
651     			usedswap = 0;
652     			for (j = 0; j < ptr->max; ++j)
653     				switch (ptr->swap_map[j]) {
654     					case SWAP_MAP_BAD:
655     					case 0:
656     						continue;
657     					default:
658     						usedswap++;
659     				}
660     			len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), 
661     				usedswap << (PAGE_SHIFT - 10), ptr->prio);
662     		}
663     	}
664     	free_page((unsigned long) page);
665     	return len;
666     }
667     
668     int is_swap_partition(kdev_t dev) {
669     	struct swap_info_struct *ptr = swap_info;
670     	int i;
671     
672     	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
673     		if (ptr->flags & SWP_USED)
674     			if (ptr->swap_device == dev)
675     				return 1;
676     	}
677     	return 0;
678     }
679     
680     /*
681      * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
682      *
683      * The swapon system call
684      */
685     asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
686     {
687     	struct swap_info_struct * p;
688     	struct nameidata nd;
689     	struct inode * swap_inode;
690     	unsigned int type;
691     	int i, j, prev;
692     	int error;
693     	static int least_priority = 0;
694     	union swap_header *swap_header = 0;
695     	int swap_header_version;
696     	int nr_good_pages = 0;
697     	unsigned long maxpages = 1;
698     	int swapfilesize;
699     	struct block_device *bdev = NULL;
700     	unsigned short *swap_map;
701     	
702     	if (!capable(CAP_SYS_ADMIN))
703     		return -EPERM;
704     	lock_kernel();
705     	swap_list_lock();
706     	p = swap_info;
707     	for (type = 0 ; type < nr_swapfiles ; type++,p++)
708     		if (!(p->flags & SWP_USED))
709     			break;
710     	error = -EPERM;
711     	if (type >= MAX_SWAPFILES) {
712     		swap_list_unlock();
713     		goto out;
714     	}
715     	if (type >= nr_swapfiles)
716     		nr_swapfiles = type+1;
717     	p->flags = SWP_USED;
718     	p->swap_file = NULL;
719     	p->swap_vfsmnt = NULL;
720     	p->swap_device = 0;
721     	p->swap_map = NULL;
722     	p->lowest_bit = 0;
723     	p->highest_bit = 0;
724     	p->cluster_nr = 0;
725     	p->sdev_lock = SPIN_LOCK_UNLOCKED;
726     	p->next = -1;
727     	if (swap_flags & SWAP_FLAG_PREFER) {
728     		p->prio =
729     		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
730     	} else {
731     		p->prio = --least_priority;
732     	}
733     	swap_list_unlock();
734     	error = user_path_walk(specialfile, &nd);
735     	if (error)
736     		goto bad_swap_2;
737     
738     	p->swap_file = nd.dentry;
739     	p->swap_vfsmnt = nd.mnt;
740     	swap_inode = nd.dentry->d_inode;
741     	error = -EINVAL;
742     
743     	if (S_ISBLK(swap_inode->i_mode)) {
744     		kdev_t dev = swap_inode->i_rdev;
745     		struct block_device_operations *bdops;
746     
747     		p->swap_device = dev;
748     		set_blocksize(dev, PAGE_SIZE);
749     		
750     		bd_acquire(swap_inode);
751     		bdev = swap_inode->i_bdev;
752     		bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
753     		if (bdops) bdev->bd_op = bdops;
754     
755     		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
756     		if (error)
757     			goto bad_swap_2;
758     		set_blocksize(dev, PAGE_SIZE);
759     		error = -ENODEV;
760     		if (!dev || (blk_size[MAJOR(dev)] &&
761     		     !blk_size[MAJOR(dev)][MINOR(dev)]))
762     			goto bad_swap;
763     		swapfilesize = 0;
764     		if (blk_size[MAJOR(dev)])
765     			swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
766     				>> (PAGE_SHIFT - 10);
767     	} else if (S_ISREG(swap_inode->i_mode))
768     		swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
769     	else
770     		goto bad_swap;
771     
772     	error = -EBUSY;
773     	for (i = 0 ; i < nr_swapfiles ; i++) {
774     		struct swap_info_struct *q = &swap_info[i];
775     		if (i == type || !q->swap_file)
776     			continue;
777     		if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
778     			goto bad_swap;
779     	}
780     
781     	swap_header = (void *) __get_free_page(GFP_USER);
782     	if (!swap_header) {
783     		printk("Unable to start swapping: out of memory :-)\n");
784     		error = -ENOMEM;
785     		goto bad_swap;
786     	}
787     
788     	lock_page(virt_to_page(swap_header));
789     	rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
790     
791     	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
792     		swap_header_version = 1;
793     	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
794     		swap_header_version = 2;
795     	else {
796     		printk("Unable to find swap-space signature\n");
797     		error = -EINVAL;
798     		goto bad_swap;
799     	}
800     	
801     	switch (swap_header_version) {
802     	case 1:
803     		memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
804     		j = 0;
805     		p->lowest_bit = 0;
806     		p->highest_bit = 0;
807     		for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
808     			if (test_bit(i,(char *) swap_header)) {
809     				if (!p->lowest_bit)
810     					p->lowest_bit = i;
811     				p->highest_bit = i;
812     				maxpages = i+1;
813     				j++;
814     			}
815     		}
816     		nr_good_pages = j;
817     		p->swap_map = vmalloc(maxpages * sizeof(short));
818     		if (!p->swap_map) {
819     			error = -ENOMEM;		
820     			goto bad_swap;
821     		}
822     		for (i = 1 ; i < maxpages ; i++) {
823     			if (test_bit(i,(char *) swap_header))
824     				p->swap_map[i] = 0;
825     			else
826     				p->swap_map[i] = SWAP_MAP_BAD;
827     		}
828     		break;
829     
830     	case 2:
831     		/* Check the swap header's sub-version and the size of
832                        the swap file and bad block lists */
833     		if (swap_header->info.version != 1) {
834     			printk(KERN_WARNING
835     			       "Unable to handle swap header version %d\n",
836     			       swap_header->info.version);
837     			error = -EINVAL;
838     			goto bad_swap;
839     		}
840     
841     		p->lowest_bit  = 1;
842     		maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
843     		if (maxpages > swap_header->info.last_page)
844     			maxpages = swap_header->info.last_page;
845     		p->highest_bit = maxpages - 1;
846     
847     		error = -EINVAL;
848     		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
849     			goto bad_swap;
850     		
851     		/* OK, set up the swap map and apply the bad block list */
852     		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
853     			error = -ENOMEM;
854     			goto bad_swap;
855     		}
856     
857     		error = 0;
858     		memset(p->swap_map, 0, maxpages * sizeof(short));
859     		for (i=0; i<swap_header->info.nr_badpages; i++) {
860     			int page = swap_header->info.badpages[i];
861     			if (page <= 0 || page >= swap_header->info.last_page)
862     				error = -EINVAL;
863     			else
864     				p->swap_map[page] = SWAP_MAP_BAD;
865     		}
866     		nr_good_pages = swap_header->info.last_page -
867     				swap_header->info.nr_badpages -
868     				1 /* header page */;
869     		if (error) 
870     			goto bad_swap;
871     	}
872     	
873     	if (swapfilesize && maxpages > swapfilesize) {
874     		printk(KERN_WARNING
875     		       "Swap area shorter than signature indicates\n");
876     		error = -EINVAL;
877     		goto bad_swap;
878     	}
879     	if (!nr_good_pages) {
880     		printk(KERN_WARNING "Empty swap-file\n");
881     		error = -EINVAL;
882     		goto bad_swap;
883     	}
884     	p->swap_map[0] = SWAP_MAP_BAD;
885     	swap_list_lock();
886     	swap_device_lock(p);
887     	p->max = maxpages;
888     	p->flags = SWP_WRITEOK;
889     	p->pages = nr_good_pages;
890     	nr_swap_pages += nr_good_pages;
891     	total_swap_pages += nr_good_pages;
892     	printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
893     	       nr_good_pages<<(PAGE_SHIFT-10), p->prio);
894     
895     	/* insert swap space into swap_list: */
896     	prev = -1;
897     	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
898     		if (p->prio >= swap_info[i].prio) {
899     			break;
900     		}
901     		prev = i;
902     	}
903     	p->next = i;
904     	if (prev < 0) {
905     		swap_list.head = swap_list.next = p - swap_info;
906     	} else {
907     		swap_info[prev].next = p - swap_info;
908     	}
909     	swap_device_unlock(p);
910     	swap_list_unlock();
911     	error = 0;
912     	goto out;
913     bad_swap:
914     	if (bdev)
915     		blkdev_put(bdev, BDEV_SWAP);
916     bad_swap_2:
917     	swap_list_lock();
918     	swap_map = p->swap_map;
919     	nd.mnt = p->swap_vfsmnt;
920     	nd.dentry = p->swap_file;
921     	p->swap_device = 0;
922     	p->swap_file = NULL;
923     	p->swap_vfsmnt = NULL;
924     	p->swap_map = NULL;
925     	p->flags = 0;
926     	if (!(swap_flags & SWAP_FLAG_PREFER))
927     		++least_priority;
928     	swap_list_unlock();
929     	if (swap_map)
930     		vfree(swap_map);
931     	path_release(&nd);
932     out:
933     	if (swap_header)
934     		free_page((long) swap_header);
935     	unlock_kernel();
936     	return error;
937     }
938     
939     void si_swapinfo(struct sysinfo *val)
940     {
941     	unsigned int i;
942     	unsigned long nr_to_be_unused = 0;
943     
944     	swap_list_lock();
945     	for (i = 0; i < nr_swapfiles; i++) {
946     		unsigned int j;
947     		if (swap_info[i].flags != SWP_USED)
948     			continue;
949     		for (j = 0; j < swap_info[i].max; ++j) {
950     			switch (swap_info[i].swap_map[j]) {
951     				case 0:
952     				case SWAP_MAP_BAD:
953     					continue;
954     				default:
955     					nr_to_be_unused++;
956     			}
957     		}
958     	}
959     	val->freeswap = nr_swap_pages + nr_to_be_unused;
960     	val->totalswap = total_swap_pages + nr_to_be_unused;
961     	swap_list_unlock();
962     }
963     
964     /*
965      * Verify that a swap entry is valid and increment its swap map count.
966      *
967      * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
968      * "permanent", but will be reclaimed by the next swapoff.
969      */
970     int swap_duplicate(swp_entry_t entry)
971     {
972     	struct swap_info_struct * p;
973     	unsigned long offset, type;
974     	int result = 0;
975     
976     	type = SWP_TYPE(entry);
977     	if (type >= nr_swapfiles)
978     		goto bad_file;
979     	p = type + swap_info;
980     	offset = SWP_OFFSET(entry);
981     
982     	swap_device_lock(p);
983     	if (offset < p->max && p->swap_map[offset]) {
984     		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
985     			p->swap_map[offset]++;
986     			result = 1;
987     		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
988     			if (swap_overflow++ < 5)
989     				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
990     			p->swap_map[offset] = SWAP_MAP_MAX;
991     			result = 1;
992     		}
993     	}
994     	swap_device_unlock(p);
995     out:
996     	return result;
997     
998     bad_file:
999     	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1000     	goto out;
1001     }
1002     
1003     /*
1004      * Page lock needs to be held in all cases to prevent races with
1005      * swap file deletion.
1006      */
1007     int swap_count(struct page *page)
1008     {
1009     	struct swap_info_struct * p;
1010     	unsigned long offset, type;
1011     	swp_entry_t entry;
1012     	int retval = 0;
1013     
1014     	entry.val = page->index;
1015     	if (!entry.val)
1016     		goto bad_entry;
1017     	type = SWP_TYPE(entry);
1018     	if (type >= nr_swapfiles)
1019     		goto bad_file;
1020     	p = type + swap_info;
1021     	offset = SWP_OFFSET(entry);
1022     	if (offset >= p->max)
1023     		goto bad_offset;
1024     	if (!p->swap_map[offset])
1025     		goto bad_unused;
1026     	retval = p->swap_map[offset];
1027     out:
1028     	return retval;
1029     
1030     bad_entry:
1031     	printk(KERN_ERR "swap_count: null entry!\n");
1032     	goto out;
1033     bad_file:
1034     	printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val);
1035     	goto out;
1036     bad_offset:
1037     	printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val);
1038     	goto out;
1039     bad_unused:
1040     	printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val);
1041     	goto out;
1042     }
1043     
1044     /*
1045      * Prior swap_duplicate protects against swap device deletion.
1046      */
1047     void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 
1048     			kdev_t *dev, struct inode **swapf)
1049     {
1050     	unsigned long type;
1051     	struct swap_info_struct *p;
1052     
1053     	type = SWP_TYPE(entry);
1054     	if (type >= nr_swapfiles) {
1055     		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
1056     		return;
1057     	}
1058     
1059     	p = &swap_info[type];
1060     	*offset = SWP_OFFSET(entry);
1061     	if (*offset >= p->max && *offset != 0) {
1062     		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
1063     		return;
1064     	}
1065     	if (p->swap_map && !p->swap_map[*offset]) {
1066     		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
1067     		return;
1068     	}
1069     	if (!(p->flags & SWP_USED)) {
1070     		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
1071     		return;
1072     	}
1073     
1074     	if (p->swap_device) {
1075     		*dev = p->swap_device;
1076     	} else if (p->swap_file) {
1077     		*swapf = p->swap_file->d_inode;
1078     	} else {
1079     		printk(KERN_ERR "rw_swap_page: no swap file or device\n");
1080     	}
1081     	return;
1082     }
1083     
1084     /*
1085      * swap_device_lock prevents swap_map being freed. Don't grab an extra
1086      * reference on the swaphandle, it doesn't matter if it becomes unused.
1087      */
1088     int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1089     {
1090     	int ret = 0, i = 1 << page_cluster;
1091     	unsigned long toff;
1092     	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
1093     
1094     	if (!page_cluster)	/* no readahead */
1095     		return 0;
1096     	toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
1097     	if (!toff)		/* first page is swap header */
1098     		toff++, i--;
1099     	*offset = toff;
1100     
1101     	swap_device_lock(swapdev);
1102     	do {
1103     		/* Don't read-ahead past the end of the swap area */
1104     		if (toff >= swapdev->max)
1105     			break;
1106     		/* Don't read in free or bad pages */
1107     		if (!swapdev->swap_map[toff])
1108     			break;
1109     		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
1110     			break;
1111     		toff++;
1112     		ret++;
1113     	} while (--i);
1114     	swap_device_unlock(swapdev);
1115     	return ret;
1116     }
1117