File: /usr/src/linux/mm/vmscan.c

1     /*
2      *  linux/mm/vmscan.c
3      *
4      *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5      *
6      *  Swap reorganised 29.12.95, Stephen Tweedie.
7      *  kswapd added: 7.1.96  sct
8      *  Removed kswapd_ctl limits, and swap out as many pages as needed
9      *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10      *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
11      *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
12      *  Multiqueue VM started 5.8.00, Rik van Riel.
13      */
14     
15     #include <linux/slab.h>
16     #include <linux/kernel_stat.h>
17     #include <linux/swap.h>
18     #include <linux/swapctl.h>
19     #include <linux/smp_lock.h>
20     #include <linux/pagemap.h>
21     #include <linux/init.h>
22     #include <linux/highmem.h>
23     #include <linux/file.h>
24     #include <linux/compiler.h>
25     
26     #include <asm/pgalloc.h>
27     
28     /*
29      * The "priority" of VM scanning is how much of the queues we
30      * will scan in one go. The scanners in this file divide by the
31      * priority, so a value of 6 for DEF_PRIORITY means we scan 1/6th
32      * of the queues ("queue_length / 6") on the first scanning pass.
33      */
34     #define DEF_PRIORITY (6)
35     
36     /*
37      * try_to_swap_out() returns 1 if it unmapped a page that looks
38      * freeable and belongs to the requested classzone. It returns
39      * zero if it couldn't do anything or the page doesn't qualify.
40      *
41      * rss may decrease because pages are shared, but this
42      * doesn't count as having freed a page.
43      */
44     
45     /* mm->page_table_lock is held. mmap_sem is not held */
46     static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
47     {
48     	pte_t pte;
49     	swp_entry_t entry;
50     	int right_classzone;
51     
52     	/* Don't look at this pte if it's been accessed recently. */
53     	if (ptep_test_and_clear_young(page_table)) {
54     		flush_tlb_page(vma, address);
55     		return 0;
56     	}
57     
58     	if (TryLockPage(page))
59     		return 0;
60     
61     	right_classzone = 1;
62     	if (!memclass(page->zone, classzone))
63     		right_classzone = 0;
64     
65     	/* From this point on, the odds are that we're going to
66     	 * nuke this pte, so read and clear the pte.  This hook
67     	 * is needed on CPUs which update the accessed and dirty
68     	 * bits in hardware.
69     	 */
70     	flush_cache_page(vma, address);
71     	pte = ptep_get_and_clear(page_table);
72     	flush_tlb_page(vma, address);
73     
74     	/*
75     	 * Is the page already in the swap cache? If so, then
76     	 * we can just drop our reference to it without doing
77     	 * any IO - it's already up-to-date on disk.
78     	 */
79     	if (PageSwapCache(page)) {
80     		entry.val = page->index;
81     		if (pte_dirty(pte))
82     			set_page_dirty(page);
83     		swap_duplicate(entry);
84     set_swap_pte:
85     		set_pte(page_table, swp_entry_to_pte(entry));
86     drop_pte:
87     		mm->rss--;
88     		UnlockPage(page);
89     		{
90     			int freeable = page_count(page) - !!page->buffers <= 2;
91     			page_cache_release(page);
92     			return freeable & right_classzone;
93     		}
94     	}
95     
96     	/*
97     	 * Is it a clean page? Then it must be recoverable
98     	 * by just paging it in again, and we can just drop
99     	 * it..  or if it's dirty but has backing store,
100     	 * just mark the page dirty and drop it.
101     	 *
102     	 * However, this won't actually free any real
103     	 * memory, as the page will just be in the page cache
104     	 * somewhere, and as such we should just continue
105     	 * our scan.
106     	 *
107     	 * Basically, this just makes it possible for us to do
108     	 * some real work in the future in "refill_inactive()".
109     	 */
110     	if (page->mapping) {
111     		if (pte_dirty(pte))
112     			set_page_dirty(page);
113     		goto drop_pte;
114     	}
115     	/*
116     	 * Check PageDirty as well as pte_dirty: page may
117     	 * have been brought back from swap by swapoff.
118     	 */
119     	if (!pte_dirty(pte) && !PageDirty(page))
120     		goto drop_pte;
121     
122     	/*
123     	 * This is a dirty, swappable page.  First of all,
124     	 * get a suitable swap entry for it, and make sure
125     	 * we have the swap cache set up to associate the
126     	 * page with that swap entry.
127     	 */
128     	swap_list_lock();
129     	entry = get_swap_page();
130     	if (entry.val) {
131     		/* Add it to the swap cache and mark it dirty */
132     		add_to_swap_cache(page, entry);
133     		swap_list_unlock();
134     		set_page_dirty(page);
135     		goto set_swap_pte;
136     	}
137     
138     	/* No swap space left */
139     	swap_list_unlock();
140     	set_pte(page_table, pte);
141     	UnlockPage(page);
142     	return 0;
143     }
144     
145     /* mm->page_table_lock is held. mmap_sem is not held */
146     static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
147     {
148     	pte_t * pte;
149     	unsigned long pmd_end;
150     
151     	if (pmd_none(*dir))
152     		return count;
153     	if (pmd_bad(*dir)) {
154     		pmd_ERROR(*dir);
155     		pmd_clear(dir);
156     		return count;
157     	}
158     	
159     	pte = pte_offset(dir, address);
160     	
161     	pmd_end = (address + PMD_SIZE) & PMD_MASK;
162     	if (end > pmd_end)
163     		end = pmd_end;
164     
165     	do {
166     		if (pte_present(*pte)) {
167     			struct page *page = pte_page(*pte);
168     
169     			if (VALID_PAGE(page) && !PageReserved(page)) {
170     				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
171     				if (!count) {
172     					address += PAGE_SIZE;
173     					break;
174     				}
175     			}
176     		}
177     		address += PAGE_SIZE;
178     		pte++;
179     	} while (address && (address < end));
180     	mm->swap_address = address;
181     	return count;
182     }
183     
184     /* mm->page_table_lock is held. mmap_sem is not held */
185     static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
186     {
187     	pmd_t * pmd;
188     	unsigned long pgd_end;
189     
190     	if (pgd_none(*dir))
191     		return count;
192     	if (pgd_bad(*dir)) {
193     		pgd_ERROR(*dir);
194     		pgd_clear(dir);
195     		return count;
196     	}
197     
198     	pmd = pmd_offset(dir, address);
199     
200     	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;	
201     	if (pgd_end && (end > pgd_end))
202     		end = pgd_end;
203     	
204     	do {
205     		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
206     		if (!count)
207     			break;
208     		address = (address + PMD_SIZE) & PMD_MASK;
209     		pmd++;
210     	} while (address && (address < end));
211     	return count;
212     }
213     
214     /* mm->page_table_lock is held. mmap_sem is not held */
215     static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
216     {
217     	pgd_t *pgdir;
218     	unsigned long end;
219     
220     	/* Don't swap out areas which are locked down */
221     	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
222     		return count;
223     
224     	pgdir = pgd_offset(mm, address);
225     
226     	end = vma->vm_end;
227     	if (address >= end)
228     		BUG();
229     	do {
230     		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
231     		if (!count)
232     			break;
233     		address = (address + PGDIR_SIZE) & PGDIR_MASK;
234     		pgdir++;
235     	} while (address && (address < end));
236     	return count;
237     }
238     
239     /*
         * Scan cursor for swap_out(): the mm whose page tables are visited
         * next.  swap_out() advances it along the mmlist while holding
         * mmlist_lock; it may also be updated by fork.c:mmput().
         */
240     struct mm_struct *swap_mm = &init_mm;
241     
242     /*
243      * Returns remaining count of pages to be swapped out by followup call.
244      */
245     static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
246     {
247     	unsigned long address;
248     	struct vm_area_struct* vma;
249     
250     	/*
251     	 * Find the proper vm-area after freezing the vma chain 
252     	 * and ptes.
253     	 */
254     	spin_lock(&mm->page_table_lock);
255     	address = mm->swap_address;
256     	if (address == TASK_SIZE || swap_mm != mm) {
257     		/* We raced: don't count this mm but try again */
258     		++*mmcounter;
259     		goto out_unlock;
260     	}
261     	vma = find_vma(mm, address);
262     	if (vma) {
263     		if (address < vma->vm_start)
264     			address = vma->vm_start;
265     
266     		for (;;) {
267     			count = swap_out_vma(mm, vma, address, count, classzone);
268     			vma = vma->vm_next;
269     			if (!vma)
270     				break;
271     			if (!count)
272     				goto out_unlock;
273     			address = vma->vm_start;
274     		}
275     	}
276     	/* Indicate that we reached the end of address space */
277     	mm->swap_address = TASK_SIZE;
278     
279     out_unlock:
280     	spin_unlock(&mm->page_table_lock);
281     	return count;
282     }
283     
284     static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
285     static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
286     {
287     	int counter;
288     	struct mm_struct *mm;
289     
290     	/* Then, look at the other mm's */
291     	counter = mmlist_nr / priority;
292     	do {
293     		if (unlikely(current->need_resched)) {
294     			__set_current_state(TASK_RUNNING);
295     			schedule();
296     		}
297     
298     		spin_lock(&mmlist_lock);
299     		mm = swap_mm;
300     		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
301     			mm->swap_address = 0;
302     			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
303     			if (mm == swap_mm)
304     				goto empty;
305     			swap_mm = mm;
306     		}
307     
308     		/* Make sure the mm doesn't disappear when we drop the lock.. */
309     		atomic_inc(&mm->mm_users);
310     		spin_unlock(&mmlist_lock);
311     
312     		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
313     
314     		mmput(mm);
315     
316     		if (!nr_pages)
317     			return 1;
318     	} while (--counter >= 0);
319     
320     	return 0;
321     
322     empty:
323     	spin_unlock(&mmlist_lock);
324     	return 0;
325     }
326     
327     static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
        /*
         * shrink_cache - try to free pages from the tail of the inactive list.
         *
         * nr_pages:  number of pages still wanted.
         * max_scan:  budget of pages to examine; a page whose referenced bit
         *            was set is rotated without consuming the budget.
         * classzone: only pages for which memclass(page->zone, classzone)
         *            holds are considered for freeing.
         * gfp_mask:  __GFP_FS must be set for dirty pages to be written out;
         *            the mask is also passed to try_to_free_buffers().
         *
         * pagemap_lru_lock is taken on entry and released on exit; it is
         * dropped temporarily around schedule(), writepage(),
         * try_to_free_buffers() and swap_free().
         *
         * Returns the number of pages still wanted when scanning stops.
         */
328     static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
329     {
330     	struct list_head * entry;
331     
332     	spin_lock(&pagemap_lru_lock);
        	/* Always look at the current tail of the inactive list. */
333     	while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
334     		struct page * page;
335     		swp_entry_t swap;
336     
337     		if (unlikely(current->need_resched)) {
338     			spin_unlock(&pagemap_lru_lock);
339     			__set_current_state(TASK_RUNNING);
340     			schedule();
341     			spin_lock(&pagemap_lru_lock);
        			/* The list may have changed: re-read the tail. */
342     			continue;
343     		}
344     
345     		page = list_entry(entry, struct page, lru);
346     
347     		if (unlikely(!PageInactive(page) && !PageActive(page)))
348     			BUG();
349     
        		/*
        		 * Rotate the page to the head of the list first, so that
        		 * every "continue" below moves on to a different page.
        		 */
350     		list_del(entry);
351     		list_add(entry, &inactive_list);
        		/* Referenced pages get another trip round the list. */
352     		if (PageTestandClearReferenced(page))
353     			continue;
354     
355     		max_scan--;
356     
357     		if (unlikely(!memclass(page->zone, classzone)))
358     			continue;
359     
360     		/* Racy check to avoid trylocking when not worthwhile */
361     		if (!page->buffers && page_count(page) != 1)
362     			continue;
363     
364     		/*
365     		 * The page is locked. IO in progress?
366     		 * Move it to the back of the list.
367     		 */
368     		if (unlikely(TryLockPage(page)))
369     			continue;
370     
371     		if (PageDirty(page) && is_page_cache_freeable(page)) {
372     			/*
373     			 * It is not critical here to write it only if
374     			 * the page is unmapped because any direct writer
375     			 * like O_DIRECT would set the PG_dirty bitflag
376     			 * on the physical page after having successfully
377     			 * pinned it and after the I/O to the page is finished,
378     			 * so the direct writes to the page cannot get lost.
379     			 */
380     			int (*writepage)(struct page *);
381     
382     			writepage = page->mapping->a_ops->writepage;
383     			if ((gfp_mask & __GFP_FS) && writepage) {
384     				ClearPageDirty(page);
        				/* Hold a reference across the unlocked writepage() call. */
385     				page_cache_get(page);
386     				spin_unlock(&pagemap_lru_lock);
387     
        				/*
        				 * NOTE(review): there is no UnlockPage() on this
        				 * path - presumably writepage() unlocks the page;
        				 * confirm against the a_ops implementations.
        				 */
388     				writepage(page);
389     				page_cache_release(page);
390     
391     				spin_lock(&pagemap_lru_lock);
392     				continue;
393     			}
394     		}
395     
396     		/*
397     		 * If the page has buffers, try to free the buffer mappings
398     		 * associated with this page. If we succeed we try to free
399     		 * the page as well.
400     		 */
401     		if (page->buffers) {
402     			spin_unlock(&pagemap_lru_lock);
403     
404     			/* avoid to free a locked page */
405     			page_cache_get(page);
406     
407     			if (try_to_free_buffers(page, gfp_mask)) {
408     				if (!page->mapping) {
409     					/*
410     					 * Account we successfully freed a page
411     					 * of buffer cache.
412     					 */
413     					atomic_dec(&buffermem_pages);
414     
415     					/*
416     					 * We must not allow an anon page
417     					 * with no buffers to be visible on
418     					 * the LRU, so we unlock the page after
419     					 * taking the lru lock
420     					 */
421     					spin_lock(&pagemap_lru_lock);
422     					UnlockPage(page);
423     					__lru_cache_del(page);
424     
425     					/* effectively free the page here */
426     					page_cache_release(page);
427     
        					/* One page done; stop once the target is met. */
428     					if (--nr_pages)
429     						continue;
430     					break;
431     				} else {
432     					/*
433     					 * The page is still in pagecache so undo the stuff
434     					 * before the try_to_free_buffers since we've not
435     					 * finished and we can now try the next step.
436     					 */
437     					page_cache_release(page);
438     
439     					spin_lock(&pagemap_lru_lock);
440     				}
441     			} else {
442     				/* failed to drop the buffers so stop here */
443     				UnlockPage(page);
444     				page_cache_release(page);
445     
446     				spin_lock(&pagemap_lru_lock);
447     				continue;
448     			}
449     		}
450     
451     		if (unlikely(!page->mapping))
452     			BUG();
453     
        		/*
        		 * Both pagecache_lock and pagemap_lru_lock are needed from
        		 * here on.  If pagecache_lock cannot be taken immediately,
        		 * drop the lru lock and take the two in the order
        		 * pagecache_lock, then pagemap_lru_lock.
        		 */
454     		if (unlikely(!spin_trylock(&pagecache_lock))) {
455     			/* we hold the page lock so the page cannot go away from under us */
456     			spin_unlock(&pagemap_lru_lock);
457     
458     			spin_lock(&pagecache_lock);
459     			spin_lock(&pagemap_lru_lock);
460     		}
461     
462     		/*
463     		 * this is the non-racy check, it is critical to check
464     		 * PageDirty _after_ we made sure the page is freeable
465     		 * so not in use by anybody.
466     		 */
467     		if (!is_page_cache_freeable(page) || PageDirty(page)) {
468     			spin_unlock(&pagecache_lock);
469     			UnlockPage(page);
470     			continue;
471     		}
472     
473     		/* point of no return */
474     		if (likely(!PageSwapCache(page))) {
475     			swap.val = 0;
476     			__remove_inode_page(page);
477     		} else {
        			/* Remember the swap entry so it can be freed below. */
478     			swap.val = page->index;
479     			__delete_from_swap_cache(page);
480     		}
481     		spin_unlock(&pagecache_lock);
482     
483     		__lru_cache_del(page);
484     
485     		if (unlikely(swap.val != 0)) {
486     			/* must drop lru lock if getting swap_list lock */
487     			spin_unlock(&pagemap_lru_lock);
488     			swap_free(swap);
489     			spin_lock(&pagemap_lru_lock);
490     		}
491     
492     		UnlockPage(page);
493     
494     		/* effectively free the page here */
495     		page_cache_release(page);
496     
        		/* One page done; stop once the target is met. */
497     		if (--nr_pages)
498     			continue;
499     		break;
500     	}
501     	spin_unlock(&pagemap_lru_lock);
502     
503     	return nr_pages;
504     }
505     
506     /*
507      * This moves pages from the active list to
508      * the inactive list.
509      *
510      * We move them the other way when we see the
511      * reference bit on the page.
512      */
513     static void refill_inactive(int nr_pages)
514     {
515     	struct list_head * entry;
516     
517     	spin_lock(&pagemap_lru_lock);
518     	entry = active_list.prev;
519     	while (nr_pages-- && entry != &active_list) {
520     		struct page * page;
521     
522     		page = list_entry(entry, struct page, lru);
523     		entry = entry->prev;
524     		if (PageTestandClearReferenced(page)) {
525     			list_del(&page->lru);
526     			list_add(&page->lru, &active_list);
527     			continue;
528     		}
529     
530     		del_page_from_active_list(page);
531     		add_page_to_inactive_list(page);
532     	}
533     	spin_unlock(&pagemap_lru_lock);
534     }
535     
536     static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
537     static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
538     {
539     	int max_scan = nr_inactive_pages / priority;
540     
541     	nr_pages -= kmem_cache_reap(gfp_mask);
542     	if (nr_pages <= 0)
543     		return 0;
544     
545     	/* Do we want to age the active list? */
546     	if (nr_inactive_pages < nr_active_pages*2)
547     		refill_inactive(nr_pages);
548     
549     	nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
550     	if (nr_pages <= 0)
551     		return 0;
552     
553     	shrink_dcache_memory(priority, gfp_mask);
554     	shrink_icache_memory(priority, gfp_mask);
555     
556     	return nr_pages;
557     }
558     
559     int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
560     {
561     	int priority = DEF_PRIORITY;
562     	int ret = 0;
563     
564     	do {
565     		int nr_pages = SWAP_CLUSTER_MAX;
566     		nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
567     		if (nr_pages <= 0)
568     			return 1;
569     
570     		ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
571     	} while (--priority);
572     
573     	return ret;
574     }
575     
576     DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);	/* kswapd() queues itself here before it may sleep */
577     
578     static int check_classzone_need_balance(zone_t * classzone)
579     {
580     	zone_t * first_classzone;
581     
582     	first_classzone = classzone->zone_pgdat->node_zones;
583     	while (classzone >= first_classzone) {
584     		if (classzone->free_pages > classzone->pages_high)
585     			return 0;
586     		classzone--;
587     	}
588     	return 1;
589     }
590     
591     static int kswapd_balance_pgdat(pg_data_t * pgdat)
592     {
593     	int need_more_balance = 0, i;
594     	zone_t * zone;
595     
596     	for (i = pgdat->nr_zones-1; i >= 0; i--) {
597     		zone = pgdat->node_zones + i;
598     		if (unlikely(current->need_resched))
599     			schedule();
600     		if (!zone->need_balance)
601     			continue;
602     		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
603     			zone->need_balance = 0;
604     			__set_current_state(TASK_INTERRUPTIBLE);
605     			schedule_timeout(HZ*5);
606     			continue;
607     		}
608     		if (check_classzone_need_balance(zone))
609     			need_more_balance = 1;
610     		else
611     			zone->need_balance = 0;
612     	}
613     
614     	return need_more_balance;
615     }
616     
617     static void kswapd_balance(void)
618     {
619     	int need_more_balance;
620     	pg_data_t * pgdat;
621     
622     	do {
623     		need_more_balance = 0;
624     		pgdat = pgdat_list;
625     		do
626     			need_more_balance |= kswapd_balance_pgdat(pgdat);
627     		while ((pgdat = pgdat->node_next));
628     	} while (need_more_balance);
629     }
630     
631     static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
632     {
633     	zone_t * zone;
634     	int i;
635     
636     	for (i = pgdat->nr_zones-1; i >= 0; i--) {
637     		zone = pgdat->node_zones + i;
638     		if (!zone->need_balance)
639     			continue;
640     		return 0;
641     	}
642     
643     	return 1;
644     }
645     
646     static int kswapd_can_sleep(void)
647     {
648     	pg_data_t * pgdat;
649     
650     	pgdat = pgdat_list;
651     	do {
652     		if (kswapd_can_sleep_pgdat(pgdat))
653     			continue;
654     		return 0;
655     	} while ((pgdat = pgdat->node_next));
656     
657     	return 1;
658     }
659     
660     /*
661      * The background pageout daemon, started as a kernel thread
662      * from the init process. 
663      *
664      * This basically trickles out pages so that we have _some_
665      * free memory available even if there is no other activity
666      * that frees anything up. This is needed for things like routing
667      * etc, where we otherwise might have all activity going on in
668      * asynchronous contexts that cannot page things out.
669      *
670      * If there are applications that are active memory-allocators
671      * (most normal use), this basically shouldn't matter.
672      */
673     int kswapd(void *unused)
674     {
675     	struct task_struct *tsk = current;
676     	DECLARE_WAITQUEUE(wait, tsk);
677     
678     	daemonize();
679     	strcpy(tsk->comm, "kswapd");
680     	sigfillset(&tsk->blocked);
681     	
682     	/*
683     	 * Tell the memory management that we're a "memory allocator",
684     	 * and that if we need more memory we should get access to it
685     	 * regardless (see "__alloc_pages()"). "kswapd" should
686     	 * never get caught in the normal page freeing logic.
687     	 *
688     	 * (Kswapd normally doesn't need memory anyway, but sometimes
689     	 * you need a small amount of memory in order to be able to
690     	 * page out something else, and this flag essentially protects
691     	 * us from recursively trying to free more memory as we're
692     	 * trying to free the first piece of memory in the first place).
693     	 */
694     	tsk->flags |= PF_MEMALLOC;
695     
696     	/*
697     	 * Kswapd main loop.
698     	 */
699     	for (;;) {
700     		__set_current_state(TASK_INTERRUPTIBLE);
701     		add_wait_queue(&kswapd_wait, &wait);
702     
703     		mb();
704     		if (kswapd_can_sleep())
705     			schedule();
706     
707     		__set_current_state(TASK_RUNNING);
708     		remove_wait_queue(&kswapd_wait, &wait);
709     
710     		/*
711     		 * If we actually get into a low-memory situation,
712     		 * the processes needing more memory will wake us
713     		 * up on a more timely basis.
714     		 */
715     		kswapd_balance();
716     		run_task_queue(&tq_disk);
717     	}
718     }
719     
720     static int __init kswapd_init(void)
721     {
722     	printk("Starting kswapd\n");
723     	swap_setup();
724     	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
725     	return 0;
726     }
727     
728     module_init(kswapd_init)
729