File: /usr/src/linux/mm/highmem.c

1     /*
2      * High memory handling common code and variables.
3      *
4      * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
5      *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
6      *
7      *
8      * Redesigned the x86 32-bit VM architecture to deal with
9      * 64-bit physical space. With current x86 CPUs this
10      * means up to 64 Gigabytes physical RAM.
11      *
12      * Rewrote high memory support to move the page cache into
13      * high memory. Implemented permanent (schedulable) kmaps
14      * based on Linus' idea.
15      *
16      * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17      */
18     
19     #include <linux/mm.h>
20     #include <linux/pagemap.h>
21     #include <linux/highmem.h>
22     #include <linux/swap.h>
23     #include <linux/slab.h>
24     
25     /*
26      * Virtual_count is not a pure "count".
27      *  0 means that it is not mapped, and has not been mapped
28      *    since a TLB flush - it is usable.
29      *  1 means that there are no users, but it has been mapped
30      *    since the last TLB flush - so we can't use it.
31      *  n means that there are (n-1) current users of it.
32      */
33     static int pkmap_count[LAST_PKMAP];
34     static unsigned int last_pkmap_nr;
35     static spinlock_t kmap_lock = SPIN_LOCK_UNLOCKED;
36     
37     pte_t * pkmap_page_table;
38     
39     static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
40     
41     static void flush_all_zero_pkmaps(void)
42     {
43     	int i;
44     
45     	flush_cache_all();
46     
47     	for (i = 0; i < LAST_PKMAP; i++) {
48     		struct page *page;
49     
50     		/*
51     		 * zero means we don't have anything to do,
52     		 * >1 means that it is still in use. Only
53     		 * a count of 1 means that it is free but
54     		 * needs to be unmapped
55     		 */
56     		if (pkmap_count[i] != 1)
57     			continue;
58     		pkmap_count[i] = 0;
59     
60     		/* sanity check */
61     		if (pte_none(pkmap_page_table[i]))
62     			BUG();
63     
64     		/*
65     		 * Don't need an atomic fetch-and-clear op here;
66     		 * no-one has the page mapped, and cannot get at
67     		 * its virtual address (and hence PTE) without first
68     		 * getting the kmap_lock (which is held here).
69     		 * So no dangers, even with speculative execution.
70     		 */
71     		page = pte_page(pkmap_page_table[i]);
72     		pte_clear(&pkmap_page_table[i]);
73     
74     		page->virtual = NULL;
75     	}
76     	flush_tlb_all();
77     }
78     
79     static inline unsigned long map_new_virtual(struct page *page)
80     {
81     	unsigned long vaddr;
82     	int count;
83     
84     start:
85     	count = LAST_PKMAP;
86     	/* Find an empty entry */
87     	for (;;) {
88     		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
89     		if (!last_pkmap_nr) {
90     			flush_all_zero_pkmaps();
91     			count = LAST_PKMAP;
92     		}
93     		if (!pkmap_count[last_pkmap_nr])
94     			break;	/* Found a usable entry */
95     		if (--count)
96     			continue;
97     
98     		/*
99     		 * Sleep for somebody else to unmap their entries
100     		 */
101     		{
102     			DECLARE_WAITQUEUE(wait, current);
103     
104     			current->state = TASK_UNINTERRUPTIBLE;
105     			add_wait_queue(&pkmap_map_wait, &wait);
106     			spin_unlock(&kmap_lock);
107     			schedule();
108     			remove_wait_queue(&pkmap_map_wait, &wait);
109     			spin_lock(&kmap_lock);
110     
111     			/* Somebody else might have mapped it while we slept */
112     			if (page->virtual)
113     				return (unsigned long) page->virtual;
114     
115     			/* Re-start */
116     			goto start;
117     		}
118     	}
119     	vaddr = PKMAP_ADDR(last_pkmap_nr);
120     	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
121     
122     	pkmap_count[last_pkmap_nr] = 1;
123     	page->virtual = (void *) vaddr;
124     
125     	return vaddr;
126     }
127     
128     void *kmap_high(struct page *page)
129     {
130     	unsigned long vaddr;
131     
132     	/*
133     	 * For highmem pages, we can't trust "virtual" until
134     	 * after we have the lock.
135     	 *
136     	 * We cannot call this from interrupts, as it may block
137     	 */
138     	spin_lock(&kmap_lock);
139     	vaddr = (unsigned long) page->virtual;
140     	if (!vaddr)
141     		vaddr = map_new_virtual(page);
142     	pkmap_count[PKMAP_NR(vaddr)]++;
143     	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
144     		BUG();
145     	spin_unlock(&kmap_lock);
146     	return (void*) vaddr;
147     }
148     
149     void kunmap_high(struct page *page)
150     {
151     	unsigned long vaddr;
152     	unsigned long nr;
153     	int need_wakeup;
154     
155     	spin_lock(&kmap_lock);
156     	vaddr = (unsigned long) page->virtual;
157     	if (!vaddr)
158     		BUG();
159     	nr = PKMAP_NR(vaddr);
160     
161     	/*
162     	 * A count must never go down to zero
163     	 * without a TLB flush!
164     	 */
165     	need_wakeup = 0;
166     	switch (--pkmap_count[nr]) {
167     	case 0:
168     		BUG();
169     	case 1:
170     		/*
171     		 * Avoid an unnecessary wake_up() function call.
172     		 * The common case is pkmap_count[] == 1, but
173     		 * no waiters.
174     		 * The tasks queued in the wait-queue are guarded
175     		 * by both the lock in the wait-queue-head and by
176     		 * the kmap_lock.  As the kmap_lock is held here,
177     		 * no need for the wait-queue-head's lock.  Simply
178     		 * test if the queue is empty.
179     		 */
180     		need_wakeup = waitqueue_active(&pkmap_map_wait);
181     	}
182     	spin_unlock(&kmap_lock);
183     
184     	/* do wake-up, if needed, race-free outside of the spin lock */
185     	if (need_wakeup)
186     		wake_up(&pkmap_map_wait);
187     }
188     
189     #define POOL_SIZE 32
190     
191     /*
192      * This lock gets no contention at all, normally.
193      */
194     static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;
195     
196     int nr_emergency_pages;
197     static LIST_HEAD(emergency_pages);
198     
199     int nr_emergency_bhs;
200     static LIST_HEAD(emergency_bhs);
201     
202     /*
203      * Simple bounce buffer support for highmem pages.
204      * This will be moved to the block layer in 2.5.
205      */
206     
207     static inline void copy_from_high_bh (struct buffer_head *to,
208     			 struct buffer_head *from)
209     {
210     	struct page *p_from;
211     	char *vfrom;
212     
213     	p_from = from->b_page;
214     
215     	vfrom = kmap_atomic(p_from, KM_USER0);
216     	memcpy(to->b_data, vfrom + bh_offset(from), to->b_size);
217     	kunmap_atomic(vfrom, KM_USER0);
218     }
219     
220     static inline void copy_to_high_bh_irq (struct buffer_head *to,
221     			 struct buffer_head *from)
222     {
223     	struct page *p_to;
224     	char *vto;
225     	unsigned long flags;
226     
227     	p_to = to->b_page;
228     	__save_flags(flags);
229     	__cli();
230     	vto = kmap_atomic(p_to, KM_BOUNCE_READ);
231     	memcpy(vto + bh_offset(to), from->b_data, to->b_size);
232     	kunmap_atomic(vto, KM_BOUNCE_READ);
233     	__restore_flags(flags);
234     }
235     
236     static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
237     {
238     	struct page *page;
239     	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
240     	unsigned long flags;
241     
242     	bh_orig->b_end_io(bh_orig, uptodate);
243     
244     	page = bh->b_page;
245     
246     	spin_lock_irqsave(&emergency_lock, flags);
247     	if (nr_emergency_pages >= POOL_SIZE)
248     		__free_page(page);
249     	else {
250     		/*
251     		 * We are abusing page->list to manage
252     		 * the highmem emergency pool:
253     		 */
254     		list_add(&page->list, &emergency_pages);
255     		nr_emergency_pages++;
256     	}
257     	
258     	if (nr_emergency_bhs >= POOL_SIZE) {
259     #ifdef HIGHMEM_DEBUG
260     		/* Don't clobber the constructed slab cache */
261     		init_waitqueue_head(&bh->b_wait);
262     #endif
263     		kmem_cache_free(bh_cachep, bh);
264     	} else {
265     		/*
266     		 * Ditto in the bh case, here we abuse b_inode_buffers:
267     		 */
268     		list_add(&bh->b_inode_buffers, &emergency_bhs);
269     		nr_emergency_bhs++;
270     	}
271     	spin_unlock_irqrestore(&emergency_lock, flags);
272     }
273     
274     static __init int init_emergency_pool(void)
275     {
276     	struct sysinfo i;
277             si_meminfo(&i);
278             si_swapinfo(&i);
279             
280             if (!i.totalhigh)
281             	return 0;
282     
283     	spin_lock_irq(&emergency_lock);
284     	while (nr_emergency_pages < POOL_SIZE) {
285     		struct page * page = alloc_page(GFP_ATOMIC);
286     		if (!page) {
287     			printk("couldn't refill highmem emergency pages");
288     			break;
289     		}
290     		list_add(&page->list, &emergency_pages);
291     		nr_emergency_pages++;
292     	}
293     	while (nr_emergency_bhs < POOL_SIZE) {
294     		struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
295     		if (!bh) {
296     			printk("couldn't refill highmem emergency bhs");
297     			break;
298     		}
299     		list_add(&bh->b_inode_buffers, &emergency_bhs);
300     		nr_emergency_bhs++;
301     	}
302     	spin_unlock_irq(&emergency_lock);
303     	printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
304     	       nr_emergency_pages, nr_emergency_bhs);
305     
306     	return 0;
307     }
308     
309     __initcall(init_emergency_pool);
310     
/*
 * b_end_io handler installed on bounce buffer heads created for
 * writes.  Nothing has to be copied back, so this simply runs the
 * common completion path.
 */
static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
{
	bounce_end_io(bh, uptodate);
}
315     
316     static void bounce_end_io_read (struct buffer_head *bh, int uptodate)
317     {
318     	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
319     
320     	if (uptodate)
321     		copy_to_high_bh_irq(bh_orig, bh);
322     	bounce_end_io(bh, uptodate);
323     }
324     
325     struct page *alloc_bounce_page (void)
326     {
327     	struct list_head *tmp;
328     	struct page *page;
329     
330     repeat_alloc:
331     	page = alloc_page(GFP_NOHIGHIO);
332     	if (page)
333     		return page;
334     	/*
335     	 * No luck. First, kick the VM so it doesnt idle around while
336     	 * we are using up our emergency rations.
337     	 */
338     	wakeup_bdflush();
339     
340     	/*
341     	 * Try to allocate from the emergency pool.
342     	 */
343     	tmp = &emergency_pages;
344     	spin_lock_irq(&emergency_lock);
345     	if (!list_empty(tmp)) {
346     		page = list_entry(tmp->next, struct page, list);
347     		list_del(tmp->next);
348     		nr_emergency_pages--;
349     	}
350     	spin_unlock_irq(&emergency_lock);
351     	if (page)
352     		return page;
353     
354     	/* we need to wait I/O completion */
355     	run_task_queue(&tq_disk);
356     
357     	current->policy |= SCHED_YIELD;
358     	__set_current_state(TASK_RUNNING);
359     	schedule();
360     	goto repeat_alloc;
361     }
362     
363     struct buffer_head *alloc_bounce_bh (void)
364     {
365     	struct list_head *tmp;
366     	struct buffer_head *bh;
367     
368     repeat_alloc:
369     	bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO);
370     	if (bh)
371     		return bh;
372     	/*
373     	 * No luck. First, kick the VM so it doesnt idle around while
374     	 * we are using up our emergency rations.
375     	 */
376     	wakeup_bdflush();
377     
378     	/*
379     	 * Try to allocate from the emergency pool.
380     	 */
381     	tmp = &emergency_bhs;
382     	spin_lock_irq(&emergency_lock);
383     	if (!list_empty(tmp)) {
384     		bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
385     		list_del(tmp->next);
386     		nr_emergency_bhs--;
387     	}
388     	spin_unlock_irq(&emergency_lock);
389     	if (bh)
390     		return bh;
391     
392     	/* we need to wait I/O completion */
393     	run_task_queue(&tq_disk);
394     
395     	current->policy |= SCHED_YIELD;
396     	__set_current_state(TASK_RUNNING);
397     	schedule();
398     	goto repeat_alloc;
399     }
400     
401     struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
402     {
403     	struct page *page;
404     	struct buffer_head *bh;
405     
406     	if (!PageHighMem(bh_orig->b_page))
407     		return bh_orig;
408     
409     	bh = alloc_bounce_bh();
410     	/*
411     	 * This is wasteful for 1k buffers, but this is a stopgap measure
412     	 * and we are being ineffective anyway. This approach simplifies
413     	 * things immensly. On boxes with more than 4GB RAM this should
414     	 * not be an issue anyway.
415     	 */
416     	page = alloc_bounce_page();
417     
418     	set_bh_page(bh, page, 0);
419     
420     	bh->b_next = NULL;
421     	bh->b_blocknr = bh_orig->b_blocknr;
422     	bh->b_size = bh_orig->b_size;
423     	bh->b_list = -1;
424     	bh->b_dev = bh_orig->b_dev;
425     	bh->b_count = bh_orig->b_count;
426     	bh->b_rdev = bh_orig->b_rdev;
427     	bh->b_state = bh_orig->b_state;
428     #ifdef HIGHMEM_DEBUG
429     	bh->b_flushtime = jiffies;
430     	bh->b_next_free = NULL;
431     	bh->b_prev_free = NULL;
432     	/* bh->b_this_page */
433     	bh->b_reqnext = NULL;
434     	bh->b_pprev = NULL;
435     #endif
436     	/* bh->b_page */
437     	if (rw == WRITE) {
438     		bh->b_end_io = bounce_end_io_write;
439     		copy_from_high_bh(bh, bh_orig);
440     	} else
441     		bh->b_end_io = bounce_end_io_read;
442     	bh->b_private = (void *)bh_orig;
443     	bh->b_rsector = bh_orig->b_rsector;
444     #ifdef HIGHMEM_DEBUG
445     	memset(&bh->b_wait, -1, sizeof(bh->b_wait));
446     #endif
447     
448     	return bh;
449     }
450     
451