File: /usr/src/linux/mm/swapfile.c
1 /*
2 * linux/mm/swapfile.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */
7
8 #include <linux/slab.h>
9 #include <linux/smp_lock.h>
10 #include <linux/kernel_stat.h>
11 #include <linux/swap.h>
12 #include <linux/swapctl.h>
13 #include <linux/blkdev.h> /* for blk_size */
14 #include <linux/vmalloc.h>
15 #include <linux/pagemap.h>
16 #include <linux/shm.h>
17 #include <linux/compiler.h>
18
19 #include <asm/pgtable.h>
20
21 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
22 unsigned int nr_swapfiles;
23 int total_swap_pages;
24 static int swap_overflow;
25
26 static const char Bad_file[] = "Bad swap file entry ";
27 static const char Unused_file[] = "Unused swap file entry ";
28 static const char Bad_offset[] = "Bad swap offset entry ";
29 static const char Unused_offset[] = "Unused swap offset entry ";
30
31 struct swap_list_t swap_list = {-1, -1};
32
33 struct swap_info_struct swap_info[MAX_SWAPFILES];
34
35 #define SWAPFILE_CLUSTER 256
36
37 static inline int scan_swap_map(struct swap_info_struct *si)
38 {
39 unsigned long offset;
40 /*
41 * We try to cluster swap pages by allocating them
42 * sequentially in swap. Once we've allocated
43 * SWAPFILE_CLUSTER pages this way, however, we resort to
44 * first-free allocation, starting a new cluster. This
45 * prevents us from scattering swap pages all over the entire
46 * swap partition, so that we reduce overall disk seek times
47 * between swap pages. -- sct */
48 if (si->cluster_nr) {
49 while (si->cluster_next <= si->highest_bit) {
50 offset = si->cluster_next++;
51 if (si->swap_map[offset])
52 continue;
53 si->cluster_nr--;
54 goto got_page;
55 }
56 }
57 si->cluster_nr = SWAPFILE_CLUSTER;
58
59 /* try to find an empty (even not aligned) cluster. */
60 offset = si->lowest_bit;
61 check_next_cluster:
62 if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
63 {
64 int nr;
65 for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
66 if (si->swap_map[nr])
67 {
68 offset = nr+1;
69 goto check_next_cluster;
70 }
71 /* We found a completly empty cluster, so start
72 * using it.
73 */
74 goto got_page;
75 }
76 /* No luck, so now go finegrined as usual. -Andrea */
77 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
78 if (si->swap_map[offset])
79 continue;
80 si->lowest_bit = offset+1;
81 got_page:
82 if (offset == si->lowest_bit)
83 si->lowest_bit++;
84 if (offset == si->highest_bit)
85 si->highest_bit--;
86 if (si->lowest_bit > si->highest_bit) {
87 si->lowest_bit = si->max;
88 si->highest_bit = 0;
89 }
90 /* Initial count 1 for user reference + 1 for swap cache */
91 si->swap_map[offset] = 2;
92 nr_swap_pages--;
93 si->cluster_next = offset+1;
94 return offset;
95 }
96 si->lowest_bit = si->max;
97 si->highest_bit = 0;
98 return 0;
99 }
100
101 /*
102 * Callers of get_swap_page must hold swap_list_lock across the call,
103 * and across the following add_to_swap_cache, to guard against races
104 * with read_swap_cache_async.
105 */
106 swp_entry_t get_swap_page(void)
107 {
108 struct swap_info_struct * p;
109 unsigned long offset;
110 swp_entry_t entry;
111 int type, wrapped = 0;
112
113 entry.val = 0; /* Out of memory */
114 type = swap_list.next;
115 if (type < 0)
116 goto out;
117 if (nr_swap_pages <= 0)
118 goto out;
119
120 while (1) {
121 p = &swap_info[type];
122 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
123 swap_device_lock(p);
124 offset = scan_swap_map(p);
125 swap_device_unlock(p);
126 if (offset) {
127 entry = SWP_ENTRY(type,offset);
128 type = swap_info[type].next;
129 if (type < 0 ||
130 p->prio != swap_info[type].prio) {
131 swap_list.next = swap_list.head;
132 } else {
133 swap_list.next = type;
134 }
135 goto out;
136 }
137 }
138 type = p->next;
139 if (!wrapped) {
140 if (type < 0 || p->prio != swap_info[type].prio) {
141 type = swap_list.head;
142 wrapped = 1;
143 }
144 } else
145 if (type < 0)
146 goto out; /* out of swap space */
147 }
148 out:
149 return entry;
150 }
151
152 /*
153 * Caller has made sure that the swapdevice corresponding to entry
154 * is still around or has not been recycled.
155 */
156 void swap_free(swp_entry_t entry)
157 {
158 struct swap_info_struct * p;
159 unsigned long offset, type;
160
161 if (!entry.val)
162 goto out;
163
164 type = SWP_TYPE(entry);
165 if (type >= nr_swapfiles)
166 goto bad_nofile;
167 p = & swap_info[type];
168 if (!(p->flags & SWP_USED))
169 goto bad_device;
170 offset = SWP_OFFSET(entry);
171 if (offset >= p->max)
172 goto bad_offset;
173 if (!p->swap_map[offset])
174 goto bad_free;
175 swap_list_lock();
176 if (p->prio > swap_info[swap_list.next].prio)
177 swap_list.next = type;
178 swap_device_lock(p);
179 if (p->swap_map[offset] < SWAP_MAP_MAX) {
180 if (!--(p->swap_map[offset])) {
181 if (offset < p->lowest_bit)
182 p->lowest_bit = offset;
183 if (offset > p->highest_bit)
184 p->highest_bit = offset;
185 nr_swap_pages++;
186 }
187 }
188 swap_device_unlock(p);
189 swap_list_unlock();
190 out:
191 return;
192
193 bad_nofile:
194 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
195 goto out;
196 bad_device:
197 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
198 goto out;
199 bad_offset:
200 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
201 goto out;
202 bad_free:
203 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
204 goto out;
205 }
206
207 /*
208 * The swap entry has been read in advance, and we return 1 to indicate
209 * that the page has been used or is no longer needed.
210 *
211 * Always set the resulting pte to be nowrite (the same as COW pages
212 * after one process has exited). We don't know just how many PTEs will
213 * share this swap entry, so be cautious and let do_wp_page work out
214 * what to do if a write is requested later.
215 */
216 /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
217 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
218 pte_t *dir, swp_entry_t entry, struct page* page)
219 {
220 pte_t pte = *dir;
221
222 if (likely(pte_to_swp_entry(pte).val != entry.val))
223 return;
224 if (unlikely(pte_none(pte) || pte_present(pte)))
225 return;
226 get_page(page);
227 set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
228 swap_free(entry);
229 ++vma->vm_mm->rss;
230 }
231
232 /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
233 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
234 unsigned long address, unsigned long size, unsigned long offset,
235 swp_entry_t entry, struct page* page)
236 {
237 pte_t * pte;
238 unsigned long end;
239
240 if (pmd_none(*dir))
241 return;
242 if (pmd_bad(*dir)) {
243 pmd_ERROR(*dir);
244 pmd_clear(dir);
245 return;
246 }
247 pte = pte_offset(dir, address);
248 offset += address & PMD_MASK;
249 address &= ~PMD_MASK;
250 end = address + size;
251 if (end > PMD_SIZE)
252 end = PMD_SIZE;
253 do {
254 unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
255 address += PAGE_SIZE;
256 pte++;
257 } while (address && (address < end));
258 }
259
260 /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
261 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
262 unsigned long address, unsigned long size,
263 swp_entry_t entry, struct page* page)
264 {
265 pmd_t * pmd;
266 unsigned long offset, end;
267
268 if (pgd_none(*dir))
269 return;
270 if (pgd_bad(*dir)) {
271 pgd_ERROR(*dir);
272 pgd_clear(dir);
273 return;
274 }
275 pmd = pmd_offset(dir, address);
276 offset = address & PGDIR_MASK;
277 address &= ~PGDIR_MASK;
278 end = address + size;
279 if (end > PGDIR_SIZE)
280 end = PGDIR_SIZE;
281 if (address >= end)
282 BUG();
283 do {
284 unuse_pmd(vma, pmd, address, end - address, offset, entry,
285 page);
286 address = (address + PMD_SIZE) & PMD_MASK;
287 pmd++;
288 } while (address && (address < end));
289 }
290
291 /* BKL, mmlist_lock and vma->vm_mm->page_table_lock are held */
292 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
293 swp_entry_t entry, struct page* page)
294 {
295 unsigned long start = vma->vm_start, end = vma->vm_end;
296
297 if (start >= end)
298 BUG();
299 do {
300 unuse_pgd(vma, pgdir, start, end - start, entry, page);
301 start = (start + PGDIR_SIZE) & PGDIR_MASK;
302 pgdir++;
303 } while (start && (start < end));
304 }
305
306 static void unuse_process(struct mm_struct * mm,
307 swp_entry_t entry, struct page* page)
308 {
309 struct vm_area_struct* vma;
310
311 /*
312 * Go through process' page directory.
313 */
314 spin_lock(&mm->page_table_lock);
315 for (vma = mm->mmap; vma; vma = vma->vm_next) {
316 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
317 unuse_vma(vma, pgd, entry, page);
318 }
319 spin_unlock(&mm->page_table_lock);
320 return;
321 }
322
323 /*
324 * Scan swap_map from current position to next entry still in use.
325 * Recycle to start on reaching the end, returning 0 when empty.
326 */
327 static int find_next_to_unuse(struct swap_info_struct *si, int prev)
328 {
329 int max = si->max;
330 int i = prev;
331 int count;
332
333 /*
334 * No need for swap_device_lock(si) here: we're just looking
335 * for whether an entry is in use, not modifying it; false
336 * hits are okay, and sys_swapoff() has already prevented new
337 * allocations from this area (while holding swap_list_lock()).
338 */
339 for (;;) {
340 if (++i >= max) {
341 if (!prev) {
342 i = 0;
343 break;
344 }
345 /*
346 * No entries in use at top of swap_map,
347 * loop back to start and recheck there.
348 */
349 max = prev + 1;
350 prev = 0;
351 i = 1;
352 }
353 count = si->swap_map[i];
354 if (count && count != SWAP_MAP_BAD)
355 break;
356 }
357 return i;
358 }
359
360 /*
361 * We completely avoid races by reading each swap page in advance,
362 * and then search for the process using it. All the necessary
363 * page table adjustments can then be made atomically.
364 */
365 static int try_to_unuse(unsigned int type)
366 {
367 struct swap_info_struct * si = &swap_info[type];
368 struct mm_struct *start_mm;
369 unsigned short *swap_map;
370 unsigned short swcount;
371 struct page *page;
372 swp_entry_t entry;
373 int i = 0;
374 int retval = 0;
375 int reset_overflow = 0;
376
377 /*
378 * When searching mms for an entry, a good strategy is to
379 * start at the first mm we freed the previous entry from
380 * (though actually we don't notice whether we or coincidence
381 * freed the entry). Initialize this start_mm with a hold.
382 *
383 * A simpler strategy would be to start at the last mm we
384 * freed the previous entry from; but that would take less
385 * advantage of mmlist ordering (now preserved by swap_out()),
386 * which clusters forked address spaces together, most recent
387 * child immediately after parent. If we race with dup_mmap(),
388 * we very much want to resolve parent before child, otherwise
389 * we may miss some entries: using last mm would invert that.
390 */
391 start_mm = &init_mm;
392 atomic_inc(&init_mm.mm_users);
393
394 /*
395 * Keep on scanning until all entries have gone. Usually,
396 * one pass through swap_map is enough, but not necessarily:
397 * mmput() removes mm from mmlist before exit_mmap() and its
398 * zap_page_range(). That's not too bad, those entries are
399 * on their way out, and handled faster there than here.
400 * do_munmap() behaves similarly, taking the range out of mm's
401 * vma list before zap_page_range(). But unfortunately, when
402 * unmapping a part of a vma, it takes the whole out first,
403 * then reinserts what's left after (might even reschedule if
404 * open() method called) - so swap entries may be invisible
405 * to swapoff for a while, then reappear - but that is rare.
406 */
407 while ((i = find_next_to_unuse(si, i))) {
408 /*
409 * Get a page for the entry, using the existing swap
410 * cache page if there is one. Otherwise, get a clean
411 * page and read the swap into it.
412 */
413 swap_map = &si->swap_map[i];
414 entry = SWP_ENTRY(type, i);
415 page = read_swap_cache_async(entry);
416 if (!page) {
417 /*
418 * Either swap_duplicate() failed because entry
419 * has been freed independently, and will not be
420 * reused since sys_swapoff() already disabled
421 * allocation from here, or alloc_page() failed.
422 */
423 if (!*swap_map)
424 continue;
425 retval = -ENOMEM;
426 break;
427 }
428
429 /*
430 * Don't hold on to start_mm if it looks like exiting.
431 * Can mmput ever block? if so, then we cannot risk
432 * it between deleting the page from the swap cache,
433 * and completing the search through mms (and cannot
434 * use it to avoid the long hold on mmlist_lock there).
435 */
436 if (atomic_read(&start_mm->mm_users) == 1) {
437 mmput(start_mm);
438 start_mm = &init_mm;
439 atomic_inc(&init_mm.mm_users);
440 }
441
442 /*
443 * Wait for and lock page. Remove it from swap cache
444 * so try_to_swap_out won't bump swap count. Mark dirty
445 * so try_to_swap_out will preserve it without us having
446 * to mark any present ptes as dirty: so we can skip
447 * searching processes once swap count has all gone.
448 */
449 lock_page(page);
450 if (PageSwapCache(page))
451 delete_from_swap_cache(page);
452 SetPageDirty(page);
453 UnlockPage(page);
454 flush_page_to_ram(page);
455
456 /*
457 * Remove all references to entry, without blocking.
458 * Whenever we reach init_mm, there's no address space
459 * to search, but use it as a reminder to search shmem.
460 */
461 swcount = *swap_map;
462 if (swcount) {
463 if (start_mm == &init_mm)
464 shmem_unuse(entry, page);
465 else
466 unuse_process(start_mm, entry, page);
467 }
468 if (*swap_map) {
469 int set_start_mm = (*swap_map >= swcount);
470 struct list_head *p = &start_mm->mmlist;
471 struct mm_struct *new_start_mm = start_mm;
472 struct mm_struct *mm;
473
474 spin_lock(&mmlist_lock);
475 while (*swap_map && (p = p->next) != &start_mm->mmlist) {
476 mm = list_entry(p, struct mm_struct, mmlist);
477 swcount = *swap_map;
478 if (mm == &init_mm) {
479 set_start_mm = 1;
480 shmem_unuse(entry, page);
481 } else
482 unuse_process(mm, entry, page);
483 if (set_start_mm && *swap_map < swcount) {
484 new_start_mm = mm;
485 set_start_mm = 0;
486 }
487 }
488 atomic_inc(&new_start_mm->mm_users);
489 spin_unlock(&mmlist_lock);
490 mmput(start_mm);
491 start_mm = new_start_mm;
492 }
493 page_cache_release(page);
494
495 /*
496 * How could swap count reach 0x7fff when the maximum
497 * pid is 0x7fff, and there's no way to repeat a swap
498 * page within an mm (except in shmem, where it's the
499 * shared object which takes the reference count)?
500 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
501 *
502 * If that's wrong, then we should worry more about
503 * exit_mmap() and do_munmap() cases described above:
504 * we might be resetting SWAP_MAP_MAX too early here.
505 * We know "Undead"s can happen, they're okay, so don't
506 * report them; but do report if we reset SWAP_MAP_MAX.
507 */
508 if (*swap_map == SWAP_MAP_MAX) {
509 swap_list_lock();
510 swap_device_lock(si);
511 nr_swap_pages++;
512 *swap_map = 0;
513 swap_device_unlock(si);
514 swap_list_unlock();
515 reset_overflow = 1;
516 }
517
518 /*
519 * Make sure that we aren't completely killing
520 * interactive performance. Interruptible check on
521 * signal_pending() would be nice, but changes the spec?
522 */
523 if (current->need_resched)
524 schedule();
525 else {
526 unlock_kernel();
527 lock_kernel();
528 }
529 }
530
531 mmput(start_mm);
532 if (reset_overflow) {
533 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
534 swap_overflow = 0;
535 }
536 return retval;
537 }
538
539 asmlinkage long sys_swapoff(const char * specialfile)
540 {
541 struct swap_info_struct * p = NULL;
542 unsigned short *swap_map;
543 struct nameidata nd;
544 int i, type, prev;
545 int err;
546
547 if (!capable(CAP_SYS_ADMIN))
548 return -EPERM;
549
550 err = user_path_walk(specialfile, &nd);
551 if (err)
552 goto out;
553
554 lock_kernel();
555 prev = -1;
556 swap_list_lock();
557 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
558 p = swap_info + type;
559 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
560 if (p->swap_file == nd.dentry)
561 break;
562 }
563 prev = type;
564 }
565 err = -EINVAL;
566 if (type < 0) {
567 swap_list_unlock();
568 goto out_dput;
569 }
570
571 if (prev < 0) {
572 swap_list.head = p->next;
573 } else {
574 swap_info[prev].next = p->next;
575 }
576 if (type == swap_list.next) {
577 /* just pick something that's safe... */
578 swap_list.next = swap_list.head;
579 }
580 nr_swap_pages -= p->pages;
581 total_swap_pages -= p->pages;
582 p->flags = SWP_USED;
583 swap_list_unlock();
584 err = try_to_unuse(type);
585 if (err) {
586 /* re-insert swap space back into swap_list */
587 swap_list_lock();
588 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
589 if (p->prio >= swap_info[i].prio)
590 break;
591 p->next = i;
592 if (prev < 0)
593 swap_list.head = swap_list.next = p - swap_info;
594 else
595 swap_info[prev].next = p - swap_info;
596 nr_swap_pages += p->pages;
597 total_swap_pages += p->pages;
598 p->flags = SWP_WRITEOK;
599 swap_list_unlock();
600 goto out_dput;
601 }
602 if (p->swap_device)
603 blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
604 path_release(&nd);
605
606 swap_list_lock();
607 swap_device_lock(p);
608 nd.mnt = p->swap_vfsmnt;
609 nd.dentry = p->swap_file;
610 p->swap_vfsmnt = NULL;
611 p->swap_file = NULL;
612 p->swap_device = 0;
613 p->max = 0;
614 swap_map = p->swap_map;
615 p->swap_map = NULL;
616 p->flags = 0;
617 swap_device_unlock(p);
618 swap_list_unlock();
619 vfree(swap_map);
620 err = 0;
621
622 out_dput:
623 unlock_kernel();
624 path_release(&nd);
625 out:
626 return err;
627 }
628
629 int get_swaparea_info(char *buf)
630 {
631 char * page = (char *) __get_free_page(GFP_KERNEL);
632 struct swap_info_struct *ptr = swap_info;
633 int i, j, len = 0, usedswap;
634
635 if (!page)
636 return -ENOMEM;
637
638 len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
639 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
640 if ((ptr->flags & SWP_USED) && ptr->swap_map) {
641 char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
642 page, PAGE_SIZE);
643
644 len += sprintf(buf + len, "%-31s ", path);
645
646 if (!ptr->swap_device)
647 len += sprintf(buf + len, "file\t\t");
648 else
649 len += sprintf(buf + len, "partition\t");
650
651 usedswap = 0;
652 for (j = 0; j < ptr->max; ++j)
653 switch (ptr->swap_map[j]) {
654 case SWAP_MAP_BAD:
655 case 0:
656 continue;
657 default:
658 usedswap++;
659 }
660 len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10),
661 usedswap << (PAGE_SHIFT - 10), ptr->prio);
662 }
663 }
664 free_page((unsigned long) page);
665 return len;
666 }
667
668 int is_swap_partition(kdev_t dev) {
669 struct swap_info_struct *ptr = swap_info;
670 int i;
671
672 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
673 if (ptr->flags & SWP_USED)
674 if (ptr->swap_device == dev)
675 return 1;
676 }
677 return 0;
678 }
679
680 /*
681 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
682 *
683 * The swapon system call
684 */
685 asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
686 {
687 struct swap_info_struct * p;
688 struct nameidata nd;
689 struct inode * swap_inode;
690 unsigned int type;
691 int i, j, prev;
692 int error;
693 static int least_priority = 0;
694 union swap_header *swap_header = 0;
695 int swap_header_version;
696 int nr_good_pages = 0;
697 unsigned long maxpages = 1;
698 int swapfilesize;
699 struct block_device *bdev = NULL;
700 unsigned short *swap_map;
701
702 if (!capable(CAP_SYS_ADMIN))
703 return -EPERM;
704 lock_kernel();
705 swap_list_lock();
706 p = swap_info;
707 for (type = 0 ; type < nr_swapfiles ; type++,p++)
708 if (!(p->flags & SWP_USED))
709 break;
710 error = -EPERM;
711 if (type >= MAX_SWAPFILES) {
712 swap_list_unlock();
713 goto out;
714 }
715 if (type >= nr_swapfiles)
716 nr_swapfiles = type+1;
717 p->flags = SWP_USED;
718 p->swap_file = NULL;
719 p->swap_vfsmnt = NULL;
720 p->swap_device = 0;
721 p->swap_map = NULL;
722 p->lowest_bit = 0;
723 p->highest_bit = 0;
724 p->cluster_nr = 0;
725 p->sdev_lock = SPIN_LOCK_UNLOCKED;
726 p->next = -1;
727 if (swap_flags & SWAP_FLAG_PREFER) {
728 p->prio =
729 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
730 } else {
731 p->prio = --least_priority;
732 }
733 swap_list_unlock();
734 error = user_path_walk(specialfile, &nd);
735 if (error)
736 goto bad_swap_2;
737
738 p->swap_file = nd.dentry;
739 p->swap_vfsmnt = nd.mnt;
740 swap_inode = nd.dentry->d_inode;
741 error = -EINVAL;
742
743 if (S_ISBLK(swap_inode->i_mode)) {
744 kdev_t dev = swap_inode->i_rdev;
745 struct block_device_operations *bdops;
746
747 p->swap_device = dev;
748 set_blocksize(dev, PAGE_SIZE);
749
750 bd_acquire(swap_inode);
751 bdev = swap_inode->i_bdev;
752 bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
753 if (bdops) bdev->bd_op = bdops;
754
755 error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
756 if (error)
757 goto bad_swap_2;
758 set_blocksize(dev, PAGE_SIZE);
759 error = -ENODEV;
760 if (!dev || (blk_size[MAJOR(dev)] &&
761 !blk_size[MAJOR(dev)][MINOR(dev)]))
762 goto bad_swap;
763 swapfilesize = 0;
764 if (blk_size[MAJOR(dev)])
765 swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
766 >> (PAGE_SHIFT - 10);
767 } else if (S_ISREG(swap_inode->i_mode))
768 swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
769 else
770 goto bad_swap;
771
772 error = -EBUSY;
773 for (i = 0 ; i < nr_swapfiles ; i++) {
774 struct swap_info_struct *q = &swap_info[i];
775 if (i == type || !q->swap_file)
776 continue;
777 if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
778 goto bad_swap;
779 }
780
781 swap_header = (void *) __get_free_page(GFP_USER);
782 if (!swap_header) {
783 printk("Unable to start swapping: out of memory :-)\n");
784 error = -ENOMEM;
785 goto bad_swap;
786 }
787
788 lock_page(virt_to_page(swap_header));
789 rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
790
791 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
792 swap_header_version = 1;
793 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
794 swap_header_version = 2;
795 else {
796 printk("Unable to find swap-space signature\n");
797 error = -EINVAL;
798 goto bad_swap;
799 }
800
801 switch (swap_header_version) {
802 case 1:
803 memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
804 j = 0;
805 p->lowest_bit = 0;
806 p->highest_bit = 0;
807 for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
808 if (test_bit(i,(char *) swap_header)) {
809 if (!p->lowest_bit)
810 p->lowest_bit = i;
811 p->highest_bit = i;
812 maxpages = i+1;
813 j++;
814 }
815 }
816 nr_good_pages = j;
817 p->swap_map = vmalloc(maxpages * sizeof(short));
818 if (!p->swap_map) {
819 error = -ENOMEM;
820 goto bad_swap;
821 }
822 for (i = 1 ; i < maxpages ; i++) {
823 if (test_bit(i,(char *) swap_header))
824 p->swap_map[i] = 0;
825 else
826 p->swap_map[i] = SWAP_MAP_BAD;
827 }
828 break;
829
830 case 2:
831 /* Check the swap header's sub-version and the size of
832 the swap file and bad block lists */
833 if (swap_header->info.version != 1) {
834 printk(KERN_WARNING
835 "Unable to handle swap header version %d\n",
836 swap_header->info.version);
837 error = -EINVAL;
838 goto bad_swap;
839 }
840
841 p->lowest_bit = 1;
842 maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
843 if (maxpages > swap_header->info.last_page)
844 maxpages = swap_header->info.last_page;
845 p->highest_bit = maxpages - 1;
846
847 error = -EINVAL;
848 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
849 goto bad_swap;
850
851 /* OK, set up the swap map and apply the bad block list */
852 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
853 error = -ENOMEM;
854 goto bad_swap;
855 }
856
857 error = 0;
858 memset(p->swap_map, 0, maxpages * sizeof(short));
859 for (i=0; i<swap_header->info.nr_badpages; i++) {
860 int page = swap_header->info.badpages[i];
861 if (page <= 0 || page >= swap_header->info.last_page)
862 error = -EINVAL;
863 else
864 p->swap_map[page] = SWAP_MAP_BAD;
865 }
866 nr_good_pages = swap_header->info.last_page -
867 swap_header->info.nr_badpages -
868 1 /* header page */;
869 if (error)
870 goto bad_swap;
871 }
872
873 if (swapfilesize && maxpages > swapfilesize) {
874 printk(KERN_WARNING
875 "Swap area shorter than signature indicates\n");
876 error = -EINVAL;
877 goto bad_swap;
878 }
879 if (!nr_good_pages) {
880 printk(KERN_WARNING "Empty swap-file\n");
881 error = -EINVAL;
882 goto bad_swap;
883 }
884 p->swap_map[0] = SWAP_MAP_BAD;
885 swap_list_lock();
886 swap_device_lock(p);
887 p->max = maxpages;
888 p->flags = SWP_WRITEOK;
889 p->pages = nr_good_pages;
890 nr_swap_pages += nr_good_pages;
891 total_swap_pages += nr_good_pages;
892 printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
893 nr_good_pages<<(PAGE_SHIFT-10), p->prio);
894
895 /* insert swap space into swap_list: */
896 prev = -1;
897 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
898 if (p->prio >= swap_info[i].prio) {
899 break;
900 }
901 prev = i;
902 }
903 p->next = i;
904 if (prev < 0) {
905 swap_list.head = swap_list.next = p - swap_info;
906 } else {
907 swap_info[prev].next = p - swap_info;
908 }
909 swap_device_unlock(p);
910 swap_list_unlock();
911 error = 0;
912 goto out;
913 bad_swap:
914 if (bdev)
915 blkdev_put(bdev, BDEV_SWAP);
916 bad_swap_2:
917 swap_list_lock();
918 swap_map = p->swap_map;
919 nd.mnt = p->swap_vfsmnt;
920 nd.dentry = p->swap_file;
921 p->swap_device = 0;
922 p->swap_file = NULL;
923 p->swap_vfsmnt = NULL;
924 p->swap_map = NULL;
925 p->flags = 0;
926 if (!(swap_flags & SWAP_FLAG_PREFER))
927 ++least_priority;
928 swap_list_unlock();
929 if (swap_map)
930 vfree(swap_map);
931 path_release(&nd);
932 out:
933 if (swap_header)
934 free_page((long) swap_header);
935 unlock_kernel();
936 return error;
937 }
938
939 void si_swapinfo(struct sysinfo *val)
940 {
941 unsigned int i;
942 unsigned long nr_to_be_unused = 0;
943
944 swap_list_lock();
945 for (i = 0; i < nr_swapfiles; i++) {
946 unsigned int j;
947 if (swap_info[i].flags != SWP_USED)
948 continue;
949 for (j = 0; j < swap_info[i].max; ++j) {
950 switch (swap_info[i].swap_map[j]) {
951 case 0:
952 case SWAP_MAP_BAD:
953 continue;
954 default:
955 nr_to_be_unused++;
956 }
957 }
958 }
959 val->freeswap = nr_swap_pages + nr_to_be_unused;
960 val->totalswap = total_swap_pages + nr_to_be_unused;
961 swap_list_unlock();
962 }
963
964 /*
965 * Verify that a swap entry is valid and increment its swap map count.
966 *
967 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
968 * "permanent", but will be reclaimed by the next swapoff.
969 */
970 int swap_duplicate(swp_entry_t entry)
971 {
972 struct swap_info_struct * p;
973 unsigned long offset, type;
974 int result = 0;
975
976 type = SWP_TYPE(entry);
977 if (type >= nr_swapfiles)
978 goto bad_file;
979 p = type + swap_info;
980 offset = SWP_OFFSET(entry);
981
982 swap_device_lock(p);
983 if (offset < p->max && p->swap_map[offset]) {
984 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
985 p->swap_map[offset]++;
986 result = 1;
987 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
988 if (swap_overflow++ < 5)
989 printk(KERN_WARNING "swap_dup: swap entry overflow\n");
990 p->swap_map[offset] = SWAP_MAP_MAX;
991 result = 1;
992 }
993 }
994 swap_device_unlock(p);
995 out:
996 return result;
997
998 bad_file:
999 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1000 goto out;
1001 }
1002
1003 /*
1004 * Page lock needs to be held in all cases to prevent races with
1005 * swap file deletion.
1006 */
1007 int swap_count(struct page *page)
1008 {
1009 struct swap_info_struct * p;
1010 unsigned long offset, type;
1011 swp_entry_t entry;
1012 int retval = 0;
1013
1014 entry.val = page->index;
1015 if (!entry.val)
1016 goto bad_entry;
1017 type = SWP_TYPE(entry);
1018 if (type >= nr_swapfiles)
1019 goto bad_file;
1020 p = type + swap_info;
1021 offset = SWP_OFFSET(entry);
1022 if (offset >= p->max)
1023 goto bad_offset;
1024 if (!p->swap_map[offset])
1025 goto bad_unused;
1026 retval = p->swap_map[offset];
1027 out:
1028 return retval;
1029
1030 bad_entry:
1031 printk(KERN_ERR "swap_count: null entry!\n");
1032 goto out;
1033 bad_file:
1034 printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val);
1035 goto out;
1036 bad_offset:
1037 printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val);
1038 goto out;
1039 bad_unused:
1040 printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val);
1041 goto out;
1042 }
1043
1044 /*
1045 * Prior swap_duplicate protects against swap device deletion.
1046 */
1047 void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
1048 kdev_t *dev, struct inode **swapf)
1049 {
1050 unsigned long type;
1051 struct swap_info_struct *p;
1052
1053 type = SWP_TYPE(entry);
1054 if (type >= nr_swapfiles) {
1055 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
1056 return;
1057 }
1058
1059 p = &swap_info[type];
1060 *offset = SWP_OFFSET(entry);
1061 if (*offset >= p->max && *offset != 0) {
1062 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
1063 return;
1064 }
1065 if (p->swap_map && !p->swap_map[*offset]) {
1066 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
1067 return;
1068 }
1069 if (!(p->flags & SWP_USED)) {
1070 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
1071 return;
1072 }
1073
1074 if (p->swap_device) {
1075 *dev = p->swap_device;
1076 } else if (p->swap_file) {
1077 *swapf = p->swap_file->d_inode;
1078 } else {
1079 printk(KERN_ERR "rw_swap_page: no swap file or device\n");
1080 }
1081 return;
1082 }
1083
1084 /*
1085 * swap_device_lock prevents swap_map being freed. Don't grab an extra
1086 * reference on the swaphandle, it doesn't matter if it becomes unused.
1087 */
1088 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1089 {
1090 int ret = 0, i = 1 << page_cluster;
1091 unsigned long toff;
1092 struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
1093
1094 if (!page_cluster) /* no readahead */
1095 return 0;
1096 toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
1097 if (!toff) /* first page is swap header */
1098 toff++, i--;
1099 *offset = toff;
1100
1101 swap_device_lock(swapdev);
1102 do {
1103 /* Don't read-ahead past the end of the swap area */
1104 if (toff >= swapdev->max)
1105 break;
1106 /* Don't read in free or bad pages */
1107 if (!swapdev->swap_map[toff])
1108 break;
1109 if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
1110 break;
1111 toff++;
1112 ret++;
1113 } while (--i);
1114 swap_device_unlock(swapdev);
1115 return ret;
1116 }
1117