File: /usr/src/linux/mm/vmscan.c
1 /*
2 * linux/mm/vmscan.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
11 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
12 * Multiqueue VM started 5.8.00, Rik van Riel.
13 */
14
15 #include <linux/slab.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/swap.h>
18 #include <linux/swapctl.h>
19 #include <linux/smp_lock.h>
20 #include <linux/pagemap.h>
21 #include <linux/init.h>
22 #include <linux/highmem.h>
23 #include <linux/file.h>
24 #include <linux/compiler.h>
25
26 #include <asm/pgalloc.h>
27
/*
 * The "priority" of VM scanning controls how much of the queues we
 * will scan in one go.  The scanners in this file divide the queue
 * length by the current priority ("nr_inactive_pages / priority",
 * "mmlist_nr / priority"); they do not shift by it.
 *
 * try_to_free_pages() starts at DEF_PRIORITY and counts down to 1,
 * so the first pass looks at about 1/6th of the queues and the last
 * pass may look at all of them.
 */
#define DEF_PRIORITY (6)
35
36 /*
37 * The swap-out function returns 1 if it successfully
38 * scanned all the pages it was asked to (`count').
39 * It returns zero if it couldn't do anything,
40 *
41 * rss may decrease because pages are shared, but this
42 * doesn't count as having freed a page.
43 */
44
45 /* mm->page_table_lock is held. mmap_sem is not held */
46 static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
47 {
48 pte_t pte;
49 swp_entry_t entry;
50 int right_classzone;
51
52 /* Don't look at this pte if it's been accessed recently. */
53 if (ptep_test_and_clear_young(page_table)) {
54 flush_tlb_page(vma, address);
55 return 0;
56 }
57
58 if (TryLockPage(page))
59 return 0;
60
61 right_classzone = 1;
62 if (!memclass(page->zone, classzone))
63 right_classzone = 0;
64
65 /* From this point on, the odds are that we're going to
66 * nuke this pte, so read and clear the pte. This hook
67 * is needed on CPUs which update the accessed and dirty
68 * bits in hardware.
69 */
70 flush_cache_page(vma, address);
71 pte = ptep_get_and_clear(page_table);
72 flush_tlb_page(vma, address);
73
74 /*
75 * Is the page already in the swap cache? If so, then
76 * we can just drop our reference to it without doing
77 * any IO - it's already up-to-date on disk.
78 */
79 if (PageSwapCache(page)) {
80 entry.val = page->index;
81 if (pte_dirty(pte))
82 set_page_dirty(page);
83 swap_duplicate(entry);
84 set_swap_pte:
85 set_pte(page_table, swp_entry_to_pte(entry));
86 drop_pte:
87 mm->rss--;
88 UnlockPage(page);
89 {
90 int freeable = page_count(page) - !!page->buffers <= 2;
91 page_cache_release(page);
92 return freeable & right_classzone;
93 }
94 }
95
96 /*
97 * Is it a clean page? Then it must be recoverable
98 * by just paging it in again, and we can just drop
99 * it.. or if it's dirty but has backing store,
100 * just mark the page dirty and drop it.
101 *
102 * However, this won't actually free any real
103 * memory, as the page will just be in the page cache
104 * somewhere, and as such we should just continue
105 * our scan.
106 *
107 * Basically, this just makes it possible for us to do
108 * some real work in the future in "refill_inactive()".
109 */
110 if (page->mapping) {
111 if (pte_dirty(pte))
112 set_page_dirty(page);
113 goto drop_pte;
114 }
115 /*
116 * Check PageDirty as well as pte_dirty: page may
117 * have been brought back from swap by swapoff.
118 */
119 if (!pte_dirty(pte) && !PageDirty(page))
120 goto drop_pte;
121
122 /*
123 * This is a dirty, swappable page. First of all,
124 * get a suitable swap entry for it, and make sure
125 * we have the swap cache set up to associate the
126 * page with that swap entry.
127 */
128 swap_list_lock();
129 entry = get_swap_page();
130 if (entry.val) {
131 /* Add it to the swap cache and mark it dirty */
132 add_to_swap_cache(page, entry);
133 swap_list_unlock();
134 set_page_dirty(page);
135 goto set_swap_pte;
136 }
137
138 /* No swap space left */
139 swap_list_unlock();
140 set_pte(page_table, pte);
141 UnlockPage(page);
142 return 0;
143 }
144
145 /* mm->page_table_lock is held. mmap_sem is not held */
146 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
147 {
148 pte_t * pte;
149 unsigned long pmd_end;
150
151 if (pmd_none(*dir))
152 return count;
153 if (pmd_bad(*dir)) {
154 pmd_ERROR(*dir);
155 pmd_clear(dir);
156 return count;
157 }
158
159 pte = pte_offset(dir, address);
160
161 pmd_end = (address + PMD_SIZE) & PMD_MASK;
162 if (end > pmd_end)
163 end = pmd_end;
164
165 do {
166 if (pte_present(*pte)) {
167 struct page *page = pte_page(*pte);
168
169 if (VALID_PAGE(page) && !PageReserved(page)) {
170 count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
171 if (!count) {
172 address += PAGE_SIZE;
173 break;
174 }
175 }
176 }
177 address += PAGE_SIZE;
178 pte++;
179 } while (address && (address < end));
180 mm->swap_address = address;
181 return count;
182 }
183
184 /* mm->page_table_lock is held. mmap_sem is not held */
185 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
186 {
187 pmd_t * pmd;
188 unsigned long pgd_end;
189
190 if (pgd_none(*dir))
191 return count;
192 if (pgd_bad(*dir)) {
193 pgd_ERROR(*dir);
194 pgd_clear(dir);
195 return count;
196 }
197
198 pmd = pmd_offset(dir, address);
199
200 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
201 if (pgd_end && (end > pgd_end))
202 end = pgd_end;
203
204 do {
205 count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
206 if (!count)
207 break;
208 address = (address + PMD_SIZE) & PMD_MASK;
209 pmd++;
210 } while (address && (address < end));
211 return count;
212 }
213
214 /* mm->page_table_lock is held. mmap_sem is not held */
215 static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
216 {
217 pgd_t *pgdir;
218 unsigned long end;
219
220 /* Don't swap out areas which are locked down */
221 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
222 return count;
223
224 pgdir = pgd_offset(mm, address);
225
226 end = vma->vm_end;
227 if (address >= end)
228 BUG();
229 do {
230 count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
231 if (!count)
232 break;
233 address = (address + PGDIR_SIZE) & PGDIR_MASK;
234 pgdir++;
235 } while (address && (address < end));
236 return count;
237 }
238
239 /* Placeholder for swap_out(): may be updated by fork.c:mmput() */
240 struct mm_struct *swap_mm = &init_mm;
241
242 /*
243 * Returns remaining count of pages to be swapped out by followup call.
244 */
245 static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
246 {
247 unsigned long address;
248 struct vm_area_struct* vma;
249
250 /*
251 * Find the proper vm-area after freezing the vma chain
252 * and ptes.
253 */
254 spin_lock(&mm->page_table_lock);
255 address = mm->swap_address;
256 if (address == TASK_SIZE || swap_mm != mm) {
257 /* We raced: don't count this mm but try again */
258 ++*mmcounter;
259 goto out_unlock;
260 }
261 vma = find_vma(mm, address);
262 if (vma) {
263 if (address < vma->vm_start)
264 address = vma->vm_start;
265
266 for (;;) {
267 count = swap_out_vma(mm, vma, address, count, classzone);
268 vma = vma->vm_next;
269 if (!vma)
270 break;
271 if (!count)
272 goto out_unlock;
273 address = vma->vm_start;
274 }
275 }
276 /* Indicate that we reached the end of address space */
277 mm->swap_address = TASK_SIZE;
278
279 out_unlock:
280 spin_unlock(&mm->page_table_lock);
281 return count;
282 }
283
284 static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
285 static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
286 {
287 int counter;
288 struct mm_struct *mm;
289
290 /* Then, look at the other mm's */
291 counter = mmlist_nr / priority;
292 do {
293 if (unlikely(current->need_resched)) {
294 __set_current_state(TASK_RUNNING);
295 schedule();
296 }
297
298 spin_lock(&mmlist_lock);
299 mm = swap_mm;
300 while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
301 mm->swap_address = 0;
302 mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
303 if (mm == swap_mm)
304 goto empty;
305 swap_mm = mm;
306 }
307
308 /* Make sure the mm doesn't disappear when we drop the lock.. */
309 atomic_inc(&mm->mm_users);
310 spin_unlock(&mmlist_lock);
311
312 nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
313
314 mmput(mm);
315
316 if (!nr_pages)
317 return 1;
318 } while (--counter >= 0);
319
320 return 0;
321
322 empty:
323 spin_unlock(&mmlist_lock);
324 return 0;
325 }
326
327 static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
328 static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
329 {
330 struct list_head * entry;
331
332 spin_lock(&pagemap_lru_lock);
333 while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
334 struct page * page;
335 swp_entry_t swap;
336
337 if (unlikely(current->need_resched)) {
338 spin_unlock(&pagemap_lru_lock);
339 __set_current_state(TASK_RUNNING);
340 schedule();
341 spin_lock(&pagemap_lru_lock);
342 continue;
343 }
344
345 page = list_entry(entry, struct page, lru);
346
347 if (unlikely(!PageInactive(page) && !PageActive(page)))
348 BUG();
349
350 list_del(entry);
351 list_add(entry, &inactive_list);
352 if (PageTestandClearReferenced(page))
353 continue;
354
355 max_scan--;
356
357 if (unlikely(!memclass(page->zone, classzone)))
358 continue;
359
360 /* Racy check to avoid trylocking when not worthwhile */
361 if (!page->buffers && page_count(page) != 1)
362 continue;
363
364 /*
365 * The page is locked. IO in progress?
366 * Move it to the back of the list.
367 */
368 if (unlikely(TryLockPage(page)))
369 continue;
370
371 if (PageDirty(page) && is_page_cache_freeable(page)) {
372 /*
373 * It is not critical here to write it only if
374 * the page is unmapped beause any direct writer
375 * like O_DIRECT would set the PG_dirty bitflag
376 * on the phisical page after having successfully
377 * pinned it and after the I/O to the page is finished,
378 * so the direct writes to the page cannot get lost.
379 */
380 int (*writepage)(struct page *);
381
382 writepage = page->mapping->a_ops->writepage;
383 if ((gfp_mask & __GFP_FS) && writepage) {
384 ClearPageDirty(page);
385 page_cache_get(page);
386 spin_unlock(&pagemap_lru_lock);
387
388 writepage(page);
389 page_cache_release(page);
390
391 spin_lock(&pagemap_lru_lock);
392 continue;
393 }
394 }
395
396 /*
397 * If the page has buffers, try to free the buffer mappings
398 * associated with this page. If we succeed we try to free
399 * the page as well.
400 */
401 if (page->buffers) {
402 spin_unlock(&pagemap_lru_lock);
403
404 /* avoid to free a locked page */
405 page_cache_get(page);
406
407 if (try_to_free_buffers(page, gfp_mask)) {
408 if (!page->mapping) {
409 /*
410 * Account we successfully freed a page
411 * of buffer cache.
412 */
413 atomic_dec(&buffermem_pages);
414
415 /*
416 * We must not allow an anon page
417 * with no buffers to be visible on
418 * the LRU, so we unlock the page after
419 * taking the lru lock
420 */
421 spin_lock(&pagemap_lru_lock);
422 UnlockPage(page);
423 __lru_cache_del(page);
424
425 /* effectively free the page here */
426 page_cache_release(page);
427
428 if (--nr_pages)
429 continue;
430 break;
431 } else {
432 /*
433 * The page is still in pagecache so undo the stuff
434 * before the try_to_free_buffers since we've not
435 * finished and we can now try the next step.
436 */
437 page_cache_release(page);
438
439 spin_lock(&pagemap_lru_lock);
440 }
441 } else {
442 /* failed to drop the buffers so stop here */
443 UnlockPage(page);
444 page_cache_release(page);
445
446 spin_lock(&pagemap_lru_lock);
447 continue;
448 }
449 }
450
451 if (unlikely(!page->mapping))
452 BUG();
453
454 if (unlikely(!spin_trylock(&pagecache_lock))) {
455 /* we hold the page lock so the page cannot go away from under us */
456 spin_unlock(&pagemap_lru_lock);
457
458 spin_lock(&pagecache_lock);
459 spin_lock(&pagemap_lru_lock);
460 }
461
462 /*
463 * this is the non-racy check, it is critical to check
464 * PageDirty _after_ we made sure the page is freeable
465 * so not in use by anybody.
466 */
467 if (!is_page_cache_freeable(page) || PageDirty(page)) {
468 spin_unlock(&pagecache_lock);
469 UnlockPage(page);
470 continue;
471 }
472
473 /* point of no return */
474 if (likely(!PageSwapCache(page))) {
475 swap.val = 0;
476 __remove_inode_page(page);
477 } else {
478 swap.val = page->index;
479 __delete_from_swap_cache(page);
480 }
481 spin_unlock(&pagecache_lock);
482
483 __lru_cache_del(page);
484
485 if (unlikely(swap.val != 0)) {
486 /* must drop lru lock if getting swap_list lock */
487 spin_unlock(&pagemap_lru_lock);
488 swap_free(swap);
489 spin_lock(&pagemap_lru_lock);
490 }
491
492 UnlockPage(page);
493
494 /* effectively free the page here */
495 page_cache_release(page);
496
497 if (--nr_pages)
498 continue;
499 break;
500 }
501 spin_unlock(&pagemap_lru_lock);
502
503 return nr_pages;
504 }
505
506 /*
507 * This moves pages from the active list to
508 * the inactive list.
509 *
510 * We move them the other way when we see the
511 * reference bit on the page.
512 */
513 static void refill_inactive(int nr_pages)
514 {
515 struct list_head * entry;
516
517 spin_lock(&pagemap_lru_lock);
518 entry = active_list.prev;
519 while (nr_pages-- && entry != &active_list) {
520 struct page * page;
521
522 page = list_entry(entry, struct page, lru);
523 entry = entry->prev;
524 if (PageTestandClearReferenced(page)) {
525 list_del(&page->lru);
526 list_add(&page->lru, &active_list);
527 continue;
528 }
529
530 del_page_from_active_list(page);
531 add_page_to_inactive_list(page);
532 }
533 spin_unlock(&pagemap_lru_lock);
534 }
535
536 static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
537 static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
538 {
539 int max_scan = nr_inactive_pages / priority;
540
541 nr_pages -= kmem_cache_reap(gfp_mask);
542 if (nr_pages <= 0)
543 return 0;
544
545 /* Do we want to age the active list? */
546 if (nr_inactive_pages < nr_active_pages*2)
547 refill_inactive(nr_pages);
548
549 nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
550 if (nr_pages <= 0)
551 return 0;
552
553 shrink_dcache_memory(priority, gfp_mask);
554 shrink_icache_memory(priority, gfp_mask);
555
556 return nr_pages;
557 }
558
559 int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
560 {
561 int priority = DEF_PRIORITY;
562 int ret = 0;
563
564 do {
565 int nr_pages = SWAP_CLUSTER_MAX;
566 nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
567 if (nr_pages <= 0)
568 return 1;
569
570 ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
571 } while (--priority);
572
573 return ret;
574 }
575
/* Wait queue on which kswapd() sleeps between balancing rounds. */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
577
578 static int check_classzone_need_balance(zone_t * classzone)
579 {
580 zone_t * first_classzone;
581
582 first_classzone = classzone->zone_pgdat->node_zones;
583 while (classzone >= first_classzone) {
584 if (classzone->free_pages > classzone->pages_high)
585 return 0;
586 classzone--;
587 }
588 return 1;
589 }
590
591 static int kswapd_balance_pgdat(pg_data_t * pgdat)
592 {
593 int need_more_balance = 0, i;
594 zone_t * zone;
595
596 for (i = pgdat->nr_zones-1; i >= 0; i--) {
597 zone = pgdat->node_zones + i;
598 if (unlikely(current->need_resched))
599 schedule();
600 if (!zone->need_balance)
601 continue;
602 if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
603 zone->need_balance = 0;
604 __set_current_state(TASK_INTERRUPTIBLE);
605 schedule_timeout(HZ*5);
606 continue;
607 }
608 if (check_classzone_need_balance(zone))
609 need_more_balance = 1;
610 else
611 zone->need_balance = 0;
612 }
613
614 return need_more_balance;
615 }
616
617 static void kswapd_balance(void)
618 {
619 int need_more_balance;
620 pg_data_t * pgdat;
621
622 do {
623 need_more_balance = 0;
624 pgdat = pgdat_list;
625 do
626 need_more_balance |= kswapd_balance_pgdat(pgdat);
627 while ((pgdat = pgdat->node_next));
628 } while (need_more_balance);
629 }
630
631 static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
632 {
633 zone_t * zone;
634 int i;
635
636 for (i = pgdat->nr_zones-1; i >= 0; i--) {
637 zone = pgdat->node_zones + i;
638 if (!zone->need_balance)
639 continue;
640 return 0;
641 }
642
643 return 1;
644 }
645
646 static int kswapd_can_sleep(void)
647 {
648 pg_data_t * pgdat;
649
650 pgdat = pgdat_list;
651 do {
652 if (kswapd_can_sleep_pgdat(pgdat))
653 continue;
654 return 0;
655 } while ((pgdat = pgdat->node_next));
656
657 return 1;
658 }
659
660 /*
661 * The background pageout daemon, started as a kernel thread
662 * from the init process.
663 *
664 * This basically trickles out pages so that we have _some_
665 * free memory available even if there is no other activity
666 * that frees anything up. This is needed for things like routing
667 * etc, where we otherwise might have all activity going on in
668 * asynchronous contexts that cannot page things out.
669 *
670 * If there are applications that are active memory-allocators
671 * (most normal use), this basically shouldn't matter.
672 */
673 int kswapd(void *unused)
674 {
675 struct task_struct *tsk = current;
676 DECLARE_WAITQUEUE(wait, tsk);
677
678 daemonize();
679 strcpy(tsk->comm, "kswapd");
680 sigfillset(&tsk->blocked);
681
682 /*
683 * Tell the memory management that we're a "memory allocator",
684 * and that if we need more memory we should get access to it
685 * regardless (see "__alloc_pages()"). "kswapd" should
686 * never get caught in the normal page freeing logic.
687 *
688 * (Kswapd normally doesn't need memory anyway, but sometimes
689 * you need a small amount of memory in order to be able to
690 * page out something else, and this flag essentially protects
691 * us from recursively trying to free more memory as we're
692 * trying to free the first piece of memory in the first place).
693 */
694 tsk->flags |= PF_MEMALLOC;
695
696 /*
697 * Kswapd main loop.
698 */
699 for (;;) {
700 __set_current_state(TASK_INTERRUPTIBLE);
701 add_wait_queue(&kswapd_wait, &wait);
702
703 mb();
704 if (kswapd_can_sleep())
705 schedule();
706
707 __set_current_state(TASK_RUNNING);
708 remove_wait_queue(&kswapd_wait, &wait);
709
710 /*
711 * If we actually get into a low-memory situation,
712 * the processes needing more memory will wake us
713 * up on a more timely basis.
714 */
715 kswapd_balance();
716 run_task_queue(&tq_disk);
717 }
718 }
719
720 static int __init kswapd_init(void)
721 {
722 printk("Starting kswapd\n");
723 swap_setup();
724 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
725 return 0;
726 }
727
728 module_init(kswapd_init)
729