File: /usr/src/linux/arch/i386/mm/fault.c
1 /*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7 #include <linux/signal.h>
8 #include <linux/sched.h>
9 #include <linux/kernel.h>
10 #include <linux/errno.h>
11 #include <linux/string.h>
12 #include <linux/types.h>
13 #include <linux/ptrace.h>
14 #include <linux/mman.h>
15 #include <linux/mm.h>
16 #include <linux/smp.h>
17 #include <linux/smp_lock.h>
18 #include <linux/interrupt.h>
19 #include <linux/init.h>
20 #include <linux/vt_kern.h> /* For unblank_screen() */
21
22 #include <asm/system.h>
23 #include <asm/uaccess.h>
24 #include <asm/pgalloc.h>
25 #include <asm/hardirq.h>
26
/* Defined elsewhere; do_page_fault() calls it with "Oops" when a kernel fault cannot be fixed up. */
27 extern void die(const char *,struct pt_regs *,long);
28
/* Defined elsewhere; bust_spinlocks(0) saves, raises and restores this value around a printk(). */
29 extern int console_loglevel;
30
31 /*
32 * Ugly, ugly, but the goto's result in better assembly..
 *
 * __verify_write(): check that the 'size' bytes starting at 'addr' lie
 * in VM_WRITE areas of current->mm, and run handle_mm_fault() on every
 * page of that range with its last argument set to 1 (the position in
 * which do_page_fault() passes its 'write' flag).
 *
 * Returns 1 if size is 0 or the whole range was processed; returns 0
 * if some address is not covered by a suitable vma or if
 * handle_mm_fault() reports failure.
33 */
34 int __verify_write(const void * addr, unsigned long size)
35 {
36 struct vm_area_struct * vma;
37 unsigned long start = (unsigned long) addr;
38
39 if (!size)
40 return 1;
41
42 vma = find_vma(current->mm, start);
43 if (!vma)
44 goto bad_area;
/* The vma found begins above 'start': only acceptable if it can be grown down. */
45 if (vma->vm_start > start)
46 goto check_stack;
47
48 good_area:
49 if (!(vma->vm_flags & VM_WRITE))
50 goto bad_area;
/*
 * Convert the byte count into the number of pages still to visit after
 * the first one (offset of the last byte from the start of the first
 * page, shifted by PAGE_SHIFT), then mask 'start' with PAGE_MASK.
 */
51 size--;
52 size += start & ~PAGE_MASK;
53 size >>= PAGE_SHIFT;
54 start &= PAGE_MASK;
55
56 for (;;) {
/* 'survive' is also the re-entry point used by out_of_memory below. */
57 survive:
58 {
59 int fault = handle_mm_fault(current->mm, vma, start, 1);
/* 0 is treated as a plain failure, a negative value as out of memory. */
60 if (!fault)
61 goto bad_area;
62 if (fault < 0)
63 goto out_of_memory;
64 }
65 if (!size)
66 break;
67 size--;
68 start += PAGE_SIZE;
69 if (start < vma->vm_end)
70 continue;
/* Ran off the end of this vma: the next one must start exactly here and be writable. */
71 vma = vma->vm_next;
72 if (!vma || vma->vm_start != start)
73 goto bad_area;
74 if (!(vma->vm_flags & VM_WRITE))
/* NOTE(review): the second ';' below is an empty statement; harmless. */
75 goto bad_area;;
76 }
77 return 1;
78
79 check_stack:
80 if (!(vma->vm_flags & VM_GROWSDOWN))
81 goto bad_area;
/* expand_stack() returning 0 is taken as success; otherwise fall through to bad_area. */
82 if (expand_stack(vma, start) == 0)
83 goto good_area;
84
85 bad_area:
86 return 0;
87
88 out_of_memory:
/* The task with pid 1 is never failed here: it yields the CPU and retries the same page. */
89 if (current->pid == 1) {
90 current->policy |= SCHED_YIELD;
91 schedule();
92 goto survive;
93 }
94 goto bad_area;
95 }
96
97 extern spinlock_t timerlist_lock;
98
99 /*
100 * Unlock any spinlocks which will prevent us from getting the
101 * message out (timerlist_lock is acquired through the
102 * console unblank code)
 *
 * bust_spinlocks(yes):
 *   yes != 0 - called before printing: sets oops_in_progress and, on
 *              CONFIG_SMP builds, clears global_irq_lock.
 *   yes == 0 - called after printing: unblanks the screen (CONFIG_VT),
 *              clears oops_in_progress and issues one printk(" ") with
 *              console_loglevel temporarily set to 15.
 * timerlist_lock is re-initialised on every call, whatever 'yes' is.
103 */
104 void bust_spinlocks(int yes)
105 {
/* Unconditional: runs for both the "yes" and the "no" call. */
106 spin_lock_init(&timerlist_lock);
107 if (yes) {
108 oops_in_progress = 1;
109 #ifdef CONFIG_SMP
110 global_irq_lock = 0; /* Many serial drivers do __global_cli() */
111 #endif
112 } else {
/* Remember the current level so it can be put back after the printk() below. */
113 int loglevel_save = console_loglevel;
114 #ifdef CONFIG_VT
115 unblank_screen();
116 #endif
117 oops_in_progress = 0;
118 /*
119 * OK, the message is on the console. Now we call printk()
120 * without oops_in_progress set so that printk will give klogd
121 * a poke. Hold onto your hats...
122 */
123 console_loglevel = 15; /* NMI oopser may have shut the console up */
124 printk(" ");
125 console_loglevel = loglevel_save;
126 }
127 }
128
/*
 * do_BUG(): report a kernel BUG. Calls bust_spinlocks(1) first and then
 * prints the file name and line number supplied by the caller.
 * The function returns normally; nothing in it stops execution, and it
 * does not make the matching bust_spinlocks(0) call.
 */
129 void do_BUG(const char *file, int line)
130 {
131 bust_spinlocks(1);
132 printk("kernel BUG at %s:%d!\n", file, line);
133 }
134
/* Both are used only by the f00f workaround inside do_page_fault(). */
135 asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
136 extern unsigned long idt;
137
138 /*
139 * This routine handles page faults. It determines the address,
140 * and the problem, and then passes it off to one of the appropriate
141 * routines.
142 *
143 * error_code:
144 * bit 0 == 0 means no page found, 1 means protection fault
145 * bit 1 == 0 means read, 1 means write
146 * bit 2 == 0 means kernel, 1 means user-mode
 *
 * Outcomes, as implemented below:
 *  - fault resolved by handle_mm_fault(): return (min_flt/maj_flt counted)
 *  - kernel-space address, kernel mode, page not present: sync the
 *    current page directory with init_mm (vmalloc_fault)
 *  - user-mode access to a bad address: SIGSEGV via force_sig_info()
 *  - handle_mm_fault() returned 0: SIGBUS
 *  - kernel-mode fault with an exception-table entry: resume at the fixup
 *  - anything else in kernel mode: print an oops, die(), do_exit(SIGKILL)
147 */
148 asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
149 {
150 struct task_struct *tsk;
151 struct mm_struct *mm;
152 struct vm_area_struct * vma;
153 unsigned long address;
154 unsigned long page;
155 unsigned long fixup;
156 int write;
157 siginfo_t info;
158
159 /* get the address */
160 __asm__("movl %%cr2,%0":"=r" (address));
161
162 /* It's safe to allow irq's after cr2 has been saved */
163 if (regs->eflags & X86_EFLAGS_IF)
164 local_irq_enable();
165
166 tsk = current;
167
168 /*
169 * We fault-in kernel-space virtual memory on-demand. The
170 * 'reference' page table is init_mm.pgd.
171 *
172 * NOTE! We MUST NOT take any locks for this case. We may
173 * be in an interrupt or a critical region, and should
174 * only copy the information from the master page table,
175 * nothing more.
176 *
177 * This verifies that the fault happens in kernel space
178 * (error_code & 4) == 0, and that the fault was not a
179 * protection error (error_code & 1) == 0.
180 */
181 if (address >= TASK_SIZE && !(error_code & 5))
182 goto vmalloc_fault;
183
184 mm = tsk->mm;
/* Default signal code; replaced by SEGV_ACCERR once a covering vma is found. */
185 info.si_code = SEGV_MAPERR;
186
187 /*
188 * If we're in an interrupt or have no user
189 * context, we must not take the fault..
190 */
191 if (in_interrupt() || !mm)
192 goto no_context;
193
/* Held for reading until one of the up_read() calls below; every exit path after this point releases it. */
194 down_read(&mm->mmap_sem);
195
/*
 * The vma returned may begin above 'address'. That case is accepted
 * only for a VM_GROWSDOWN area that expand_stack() manages to extend.
 */
196 vma = find_vma(mm, address);
197 if (!vma)
198 goto bad_area;
199 if (vma->vm_start <= address)
200 goto good_area;
201 if (!(vma->vm_flags & VM_GROWSDOWN))
202 goto bad_area;
/* The %esp sanity check is applied to user-mode faults only. */
203 if (error_code & 4) {
204 /*
205 * accessing the stack below %esp is always a bug.
206 * The "+ 32" is there due to some instructions (like
207 * pusha) doing post-decrement on the stack and that
208 * doesn't show up until later..
209 */
210 if (address + 32 < regs->esp)
211 goto bad_area;
212 }
/* A non-zero return from expand_stack() is treated as failure. */
213 if (expand_stack(vma, address))
214 goto bad_area;
215 /*
216 * Ok, we have a good vm_area for this memory access, so
217 * we can handle it..
218 */
219 good_area:
220 info.si_code = SEGV_ACCERR;
221 write = 0;
/* Dispatch on the low two error_code bits (present / write, see the header comment). */
222 switch (error_code & 3) {
223 default: /* 3: write, present */
224 #ifdef TEST_VERIFY_AREA
225 if (regs->cs == KERNEL_CS)
226 printk("WP fault at %08lx\n", regs->eip);
227 #endif
228 /* fall through */
229 case 2: /* write, not present */
230 if (!(vma->vm_flags & VM_WRITE))
231 goto bad_area;
232 write++;
233 break;
/* A read that faulted on a present page is always reported as an access error. */
234 case 1: /* read, present */
235 goto bad_area;
236 case 0: /* read, not present */
237 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
238 goto bad_area;
239 }
240
/* Also the re-entry point for the pid 1 retry in out_of_memory; mmap_sem is held again when control arrives here. */
241 survive:
242 /*
243 * If for any reason at all we couldn't handle the fault,
244 * make sure we exit gracefully rather than endlessly redo
245 * the fault.
246 */
/* Return values handled: 1 counts as a minor fault, 2 as a major fault, 0 leads to SIGBUS, anything else to the OOM path. */
247 switch (handle_mm_fault(mm, vma, address, write)) {
248 case 1:
249 tsk->min_flt++;
250 break;
251 case 2:
252 tsk->maj_flt++;
253 break;
254 case 0:
255 goto do_sigbus;
256 default:
257 goto out_of_memory;
258 }
259
260 /*
261 * Did it hit the DOS screen memory VA from vm86 mode?
262 */
263 if (regs->eflags & VM_MASK) {
/* 'bit' is unsigned, so an address below 0xA0000 wraps to a large value and fails the "< 32" test. */
264 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
265 if (bit < 32)
266 tsk->thread.screen_bitmap |= 1 << bit;
267 }
268 up_read(&mm->mmap_sem);
269 return;
270
271 /*
272 * Something tried to access memory that isn't in our memory map..
273 * Fix it, but check if it's kernel or user first..
274 */
275 bad_area:
276 up_read(&mm->mmap_sem);
277
278 /* User mode accesses just cause a SIGSEGV */
279 if (error_code & 4) {
/* Record the fault details in the thread struct before raising the signal. */
280 tsk->thread.cr2 = address;
281 tsk->thread.error_code = error_code;
282 tsk->thread.trap_no = 14;
283 info.si_signo = SIGSEGV;
284 info.si_errno = 0;
285 /* info.si_code has been set above */
286 info.si_addr = (void *)address;
287 force_sig_info(SIGSEGV, &info, tsk);
288 return;
289 }
290
291 /*
292 * Pentium F0 0F C7 C8 bug workaround.
293 */
294 if (boot_cpu_data.f00f_bug) {
295 unsigned long nr;
296
/*
 * nr is the index of the 8-byte slot, counted from 'idt', that the
 * faulting address falls in. A fault on slot 6 is handed to
 * do_invalid_op() - presumably because slot 6 is the invalid-opcode
 * vector; confirm against the IDT setup code.
 */
297 nr = (address - idt) >> 3;
298
299 if (nr == 6) {
300 do_invalid_op(regs, 0);
301 return;
302 }
303 }
304
/* Reached by fall-through from bad_area (kernel mode) and by goto from several places; mmap_sem is not held here. */
305 no_context:
306 /* Are we prepared to handle this kernel fault? */
/* A non-zero result is used as the address to resume execution at. */
307 if ((fixup = search_exception_table(regs->eip)) != 0) {
308 regs->eip = fixup;
309 return;
310 }
311
312 /*
313 * Oops. The kernel tried to access some bad page. We'll have to
314 * terminate things with extreme prejudice.
315 */
316
317 bust_spinlocks(1);
318
319 if (address < PAGE_SIZE)
320 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
321 else
322 printk(KERN_ALERT "Unable to handle kernel paging request");
323 printk(" at virtual address %08lx\n",address);
324 printk(" printing eip:\n");
325 printk("%08lx\n", regs->eip);
/*
 * Dump the page-table entries for the faulting address: read cr3,
 * index the table it points to with the top bits of the address
 * (address >> 22), and if bit 0 of that entry is set, follow it one
 * level down. NOTE(review): the shift and the 0x003ff000 mask assume
 * a two-level layout with 1024 entries per table - confirm for other
 * paging configurations.
 */
326 asm("movl %%cr3,%0":"=r" (page));
327 page = ((unsigned long *) __va(page))[address >> 22];
328 printk(KERN_ALERT "*pde = %08lx\n", page);
329 if (page & 1) {
330 page &= PAGE_MASK;
331 address &= 0x003ff000;
332 page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
333 printk(KERN_ALERT "*pte = %08lx\n", page);
334 }
335 die("Oops", regs, error_code);
/* Only reached if die() returns. */
336 bust_spinlocks(0);
337 do_exit(SIGKILL);
338
339 /*
340 * We ran out of memory, or some other thing happened to us that made
341 * us unable to handle the page fault gracefully.
342 */
343 out_of_memory:
344 up_read(&mm->mmap_sem);
/* The task with pid 1 is never killed here: it yields, re-takes mmap_sem and retries the fault. */
345 if (tsk->pid == 1) {
346 tsk->policy |= SCHED_YIELD;
347 schedule();
348 down_read(&mm->mmap_sem);
349 goto survive;
350 }
351 printk("VM: killing process %s\n", tsk->comm);
/* User-mode fault: terminate the task. Kernel-mode fault: try the exception table, else oops. */
352 if (error_code & 4)
353 do_exit(SIGKILL);
354 goto no_context;
355
356 do_sigbus:
357 up_read(&mm->mmap_sem);
358
359 /*
360 * Send a sigbus, regardless of whether we were in kernel
361 * or user mode.
362 */
363 tsk->thread.cr2 = address;
364 tsk->thread.error_code = error_code;
365 tsk->thread.trap_no = 14;
366 info.si_signo = SIGBUS;
367 info.si_errno = 0;
368 info.si_code = BUS_ADRERR;
369 info.si_addr = (void *)address;
370 force_sig_info(SIGBUS, &info, tsk);
371
372 /* Kernel mode? Handle exceptions or die */
373 if (!(error_code & 4))
374 goto no_context;
375 return;
376
377 vmalloc_fault:
378 {
379 /*
380 * Synchronize this task's top level page-table
381 * with the 'reference' page table.
382 *
383 * Do _not_ use "tsk" here. We might be inside
384 * an interrupt in the middle of a task switch..
 *
 * The live page directory is taken from cr3 rather than from
 * the task. The pgd and pmd entries are copied from init_mm;
 * the pte is only checked for presence in the reference
 * table and is not copied. A missing entry at any level
 * sends the fault to no_context.
385 */
386 int offset = __pgd_offset(address);
387 pgd_t *pgd, *pgd_k;
388 pmd_t *pmd, *pmd_k;
389 pte_t *pte_k;
390
391 asm("movl %%cr3,%0":"=r" (pgd));
392 pgd = offset + (pgd_t *)__va(pgd);
393 pgd_k = init_mm.pgd + offset;
394
395 if (!pgd_present(*pgd_k))
396 goto no_context;
397 set_pgd(pgd, *pgd_k);
398
399 pmd = pmd_offset(pgd, address);
400 pmd_k = pmd_offset(pgd_k, address);
401 if (!pmd_present(*pmd_k))
402 goto no_context;
403 set_pmd(pmd, *pmd_k);
404
405 pte_k = pte_offset(pmd_k, address);
406 if (!pte_present(*pte_k))
407 goto no_context;
408 return;
409 }
410 }
411