File: /usr/src/linux/arch/i386/kernel/process.c

1     /*
2      *  linux/arch/i386/kernel/process.c
3      *
4      *  Copyright (C) 1995  Linus Torvalds
5      *
6      *  Pentium III FXSR, SSE support
7      *	Gareth Hughes <gareth@valinux.com>, May 2000
8      */
9     
10     /*
11      * This file handles the architecture-dependent parts of process handling..
12      */
13     
14     #define __KERNEL_SYSCALLS__
15     #include <stdarg.h>
16     
17     #include <linux/errno.h>
18     #include <linux/sched.h>
19     #include <linux/kernel.h>
20     #include <linux/mm.h>
21     #include <linux/smp.h>
22     #include <linux/smp_lock.h>
23     #include <linux/stddef.h>
24     #include <linux/unistd.h>
25     #include <linux/ptrace.h>
26     #include <linux/slab.h>
27     #include <linux/vmalloc.h>
28     #include <linux/user.h>
29     #include <linux/a.out.h>
30     #include <linux/interrupt.h>
31     #include <linux/config.h>
32     #include <linux/delay.h>
33     #include <linux/reboot.h>
34     #include <linux/init.h>
35     #include <linux/mc146818rtc.h>
36     
37     #include <asm/uaccess.h>
38     #include <asm/pgtable.h>
39     #include <asm/system.h>
40     #include <asm/io.h>
41     #include <asm/ldt.h>
42     #include <asm/processor.h>
43     #include <asm/i387.h>
44     #include <asm/desc.h>
45     #include <asm/mmu_context.h>
46     #ifdef CONFIG_MATH_EMULATION
47     #include <asm/math_emu.h>
48     #endif
49     
50     #include <linux/irq.h>
51     
/*
 * Assembly entry point, bound to the symbol "ret_from_fork".  copy_thread()
 * stores its address as the saved EIP of every newly created task.
 */
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
53     
/*
 * Nesting count of disable_hlt() calls.  default_idle() only halts
 * the CPU while this is zero.
 */
int hlt_counter;

/*
 * Optional power-management idle hook.  When it is set, cpu_idle()
 * calls it in place of default_idle().
 */
void (*pm_idle)(void);

/*
 * Optional power-off hook, called by machine_power_off() when set.
 */
void (*pm_power_off)(void);

/* Veto the use of "hlt" by the default idle routine; calls nest. */
void disable_hlt(void)
{
	hlt_counter += 1;
}

/* Undo one earlier disable_hlt(). */
void enable_hlt(void)
{
	hlt_counter -= 1;
}
75     
76     /*
77      * We use this if we don't have any better
78      * idle routine..
79      */
80     static void default_idle(void)
81     {
82     	if (current_cpu_data.hlt_works_ok && !hlt_counter) {
83     		__cli();
84     		if (!current->need_resched)
85     			safe_halt();
86     		else
87     			__sti();
88     	}
89     }
90     
91     /*
92      * On SMP it's slightly faster (but much more power-consuming!)
93      * to poll the ->need_resched flag instead of waiting for the
94      * cross-CPU IPI to arrive. Use this option with caution.
95      */
96     static void poll_idle (void)
97     {
98     	int oldval;
99     
100     	__sti();
101     
102     	/*
103     	 * Deal with another CPU just having chosen a thread to
104     	 * run here:
105     	 */
106     	oldval = xchg(&current->need_resched, -1);
107     
108     	if (!oldval)
109     		asm volatile(
110     			"2:"
111     			"cmpl $-1, %0;"
112     			"rep; nop;"
113     			"je 2b;"
114     				: :"m" (current->need_resched));
115     }
116     
117     /*
118      * The idle thread. There's no useful work to be
119      * done, so just try to conserve power and have a
120      * low exit latency (ie sit in a loop waiting for
121      * somebody to say that they'd like to reschedule)
122      */
123     void cpu_idle (void)
124     {
125     	/* endless idle loop with no priority at all */
126     	init_idle();
127     	current->nice = 20;
128     	current->counter = -100;
129     
130     	while (1) {
131     		void (*idle)(void) = pm_idle;
132     		if (!idle)
133     			idle = default_idle;
134     		while (!current->need_resched)
135     			idle();
136     		schedule();
137     		check_pgt_cache();
138     	}
139     }
140     
141     static int __init idle_setup (char *str)
142     {
143     	if (!strncmp(str, "poll", 4)) {
144     		printk("using polling idle threads.\n");
145     		pm_idle = poll_idle;
146     	}
147     
148     	return 1;
149     }
150     
151     __setup("idle=", idle_setup);
152     
/* Zero-filled operand for the "lidt" that machine_restart() uses to force a triple fault. */
static long no_idt[2];
/* Value stored at absolute address 0x472 before resetting: 0x1234 for reboot=w, 0 for reboot=c. */
static int reboot_mode;
/* Non-zero (reboot=b): machine_restart() reboots by jumping through the BIOS. */
int reboot_thru_bios;

#ifdef CONFIG_SMP
/* Set by reboot=s; machine_restart() clears it again once it has acted on it. */
int reboot_smp = 0;
/* CPU number given as reboot=s<n>, or -1 when none was given. */
static int reboot_cpu = -1;
/* shamelessly grabbed from lib/vsprintf.c for readability */
#define is_digit(c)	((c) >= '0' && (c) <= '9')
#endif
163     static int __init reboot_setup(char *str)
164     {
165     	while(1) {
166     		switch (*str) {
167     		case 'w': /* "warm" reboot (no memory testing etc) */
168     			reboot_mode = 0x1234;
169     			break;
170     		case 'c': /* "cold" reboot (with memory testing etc) */
171     			reboot_mode = 0x0;
172     			break;
173     		case 'b': /* "bios" reboot by jumping through the BIOS */
174     			reboot_thru_bios = 1;
175     			break;
176     		case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
177     			reboot_thru_bios = 0;
178     			break;
179     #ifdef CONFIG_SMP
180     		case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
181     			reboot_smp = 1;
182     			if (is_digit(*(str+1))) {
183     				reboot_cpu = (int) (*(str+1) - '0');
184     				if (is_digit(*(str+2))) 
185     					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
186     			}
187     				/* we will leave sorting out the final value 
188     				when we are ready to reboot, since we might not
189      				have set up boot_cpu_id or smp_num_cpu */
190     			break;
191     #endif
192     		}
193     		if((str = strchr(str,',')) != NULL)
194     			str++;
195     		else
196     			break;
197     	}
198     	return 1;
199     }
200     
201     __setup("reboot=", reboot_setup);
202     
/*
 * The code and data below reboot the machine by switching to real mode
 * and jumping to the BIOS reset entry point, as if the CPU had really
 * been reset.  The previous version asked the keyboard controller to
 * pulse the CPU reset line, which is more thorough, but doesn't work
 * with at least one type of 486 motherboard.  It is easy to stop this
 * code working; hence the copious comments.
 */

/* Descriptors used only while preparing the return to real mode. */
static unsigned long long real_mode_gdt_entries[3] = {
	0x0000000000000000ULL,	/* selector 0x00: null descriptor */
	0x00009a000000ffffULL,	/* selector 0x08: 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* selector 0x10: 16-bit real-mode 64k data at 0x00000100 */
};

/*
 * Operands for "lgdt" and "lidt": a 16-bit limit followed directly by
 * the base address, hence the packed members.
 */
static struct {
	unsigned short       size __attribute__ ((packed));
	unsigned long long * base __attribute__ ((packed));
}
	/* limit = table size - 1, base = the table above */
	real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
	/* limit 0x3ff, base 0 */
	real_mode_idt = { 0x3ff, 0 };
225     
/*
 * 16-bit protected-mode stub that disables paging and the cache and
 * switches to real mode.  machine_real_restart() copies it into low
 * memory with the caller's code (jump_to_bios, for a BIOS reboot)
 * placed directly behind it, so execution runs off the end of the stub
 * into that code.
 *
 * The instruction that switches to real mode by writing to CR0 must be
 * followed immediately by a far jump instruction, which sets CS to a
 * valid value for real mode and flushes the prefetch queue to avoid
 * running instructions that have already been decoded in protected
 * mode.
 *
 * The stub first masks CR0 down to PE and ET and sets CD and NW, which
 * clears PG (paging) and TS (task switch for coprocessor state save)
 * and disables the cache on a 486.  It flushes the TLB after paging has
 * been disabled, invalidates the cache if CD/NW read back as set, and
 * finally clears PE as well, leaving only ET in the low byte.  This is
 * more like the state of a 486 after reset.  I don't know if something
 * else should be done for other chips.
 *
 * More could be done here to set up the registers as if a CPU reset had
 * occurred; hopefully real BIOSs don't assume much.
 */
static unsigned char real_mode_switch [] =
{
	/*    movl  %cr0,%eax        */
	0x66, 0x0f, 0x20, 0xc0,
	/*    andl  $0x00000011,%eax */
	0x66, 0x83, 0xe0, 0x11,
	/*    orl   $0x60000000,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,
	/*    movl  %eax,%cr0        */
	0x66, 0x0f, 0x22, 0xc0,
	/*    movl  %eax,%cr3        */
	0x66, 0x0f, 0x22, 0xd8,
	/*    movl  %cr0,%ebx        */
	0x66, 0x0f, 0x20, 0xc3,
	/*    andl  $0x60000000,%ebx */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,
	/*    jz    f                */
	0x74, 0x02,
	/*    invd                   */
	0x0f, 0x08,
	/* f: andb  $0x10,%al        */
	0x24, 0x10,
	/*    movl  %eax,%cr0        */
	0x66, 0x0f, 0x22, 0xc0
};

/* Far jump to the BIOS reset entry point. */
static unsigned char jump_to_bios [] =
{
	/*    ljmp  $0xffff,$0x0000  */
	0xea, 0x00, 0x00, 0xff, 0xff
};
263     
/*
 * Poll I/O port 0x64 until bit 1 (mask 0x02) of the value read is
 * clear, giving up silently after 0x10000 reads.
 */
static inline void kb_wait(void)
{
	int tries = 0x10000;

	while (tries-- > 0) {
		if (!(inb_p(0x64) & 0x02))
			return;
	}
}
272     
/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
 *
 * code:   bytes to run once the CPU is in real mode.  They are copied
 *         to address 0x1000 - 100, directly behind the copy of
 *         real_mode_switch, so the stub runs straight into them.
 * length: number of bytes to copy from code.  It is NOT checked here;
 *         only 100 bytes are available below 0x1000.
 *
 * Control leaves through the far jump at the end of the function.
 */
void machine_real_restart(unsigned char *code, int length)
{
	unsigned long flags;

	cli();

	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
	 */

	spin_lock_irqsave(&rtc_lock, flags);
	CMOS_WRITE(0x00, 0x8f);
	spin_unlock_irqrestore(&rtc_lock, flags);

	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET. */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/* Make sure the first page is mapped to the start of physical memory.
	   It is normally not mapped, to trap kernel NULL pointer dereferences. */

	pg0[0] = _PAGE_RW | _PAGE_PRESENT;

	/*
	 * Use `swapper_pg_dir' as our page directory.
	 */
	asm volatile("movl %0,%%cr3": :"r" (__pa(swapper_pg_dir)));

	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
	   this on booting to tell it to "Bypass memory test (also warm
	   boot)".  This seems like a fairly standard thing that gets set by
	   REBOOT.COM programs, and the previous reset routine did this
	   too.  (The value actually written is reboot_mode, which is only
	   0x1234 when a warm reboot was requested.) */

	*((unsigned short *)0x472) = reboot_mode;

	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
		real_mode_switch, sizeof (real_mode_switch));
	memcpy ((void *) (0x1000 - 100), code, length);

	/* Set up the IDT for real mode. */

	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));

	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%eax,%%ds\n"
				"\tmovl %%eax,%%es\n"
				"\tmovl %%eax,%%fs\n"
				"\tmovl %%eax,%%gs\n"
				"\tmovl %%eax,%%ss" : : : "eax");

	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   entry point. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				:
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
364     
365     void machine_restart(char * __unused)
366     {
367     #if CONFIG_SMP
368     	int cpuid;
369     	
370     	cpuid = GET_APIC_ID(apic_read(APIC_ID));
371     
372     	if (reboot_smp) {
373     
374     		/* check to see if reboot_cpu is valid 
375     		   if its not, default to the BSP */
376     		if ((reboot_cpu == -1) ||  
377     		      (reboot_cpu > (NR_CPUS -1))  || 
378     		      !(phys_cpu_present_map & (1<<cpuid))) 
379     			reboot_cpu = boot_cpu_id;
380     
381     		reboot_smp = 0;  /* use this as a flag to only go through this once*/
382     		/* re-run this function on the other CPUs
383     		   it will fall though this section since we have 
384     		   cleared reboot_smp, and do the reboot if it is the
385     		   correct CPU, otherwise it halts. */
386     		if (reboot_cpu != cpuid)
387     			smp_call_function((void *)machine_restart , NULL, 1, 0);
388     	}
389     
390     	/* if reboot_cpu is still -1, then we want a tradional reboot, 
391     	   and if we are not running on the reboot_cpu,, halt */
392     	if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
393     		for (;;)
394     		__asm__ __volatile__ ("hlt");
395     	}
396     	/*
397     	 * Stop all CPUs and turn off local APICs and the IO-APIC, so
398     	 * other OSs see a clean IRQ state.
399     	 */
400     	smp_send_stop();
401     	disable_IO_APIC();
402     #endif
403     
404     	if(!reboot_thru_bios) {
405     		/* rebooting needs to touch the page at absolute addr 0 */
406     		*((unsigned short *)__va(0x472)) = reboot_mode;
407     		for (;;) {
408     			int i;
409     			for (i=0; i<100; i++) {
410     				kb_wait();
411     				udelay(50);
412     				outb(0xfe,0x64);         /* pulse reset low */
413     				udelay(50);
414     			}
415     			/* That didn't work - force a triple fault.. */
416     			__asm__ __volatile__("lidt %0": :"m" (no_idt));
417     			__asm__ __volatile__("int3");
418     		}
419     	}
420     
421     	machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
422     }
423     
/*
 * Halt the machine.  The implementation in this file does nothing.
 */
void machine_halt(void)
{
}
427     
428     void machine_power_off(void)
429     {
430     	if (pm_power_off)
431     		pm_power_off();
432     }
433     
434     extern void show_trace(unsigned long* esp);
435     
436     void show_regs(struct pt_regs * regs)
437     {
438     	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
439     
440     	printk("\n");
441     	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
442     	printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id());
443     	if (regs->xcs & 3)
444     		printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
445     	printk(" EFLAGS: %08lx\n",regs->eflags);
446     	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
447     		regs->eax,regs->ebx,regs->ecx,regs->edx);
448     	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
449     		regs->esi, regs->edi, regs->ebp);
450     	printk(" DS: %04x ES: %04x\n",
451     		0xffff & regs->xds,0xffff & regs->xes);
452     
453     	__asm__("movl %%cr0, %0": "=r" (cr0));
454     	__asm__("movl %%cr2, %0": "=r" (cr2));
455     	__asm__("movl %%cr3, %0": "=r" (cr3));
456     	/* This could fault if %cr4 does not exist */
457     	__asm__("1: movl %%cr4, %0		\n"
458     		"2:				\n"
459     		".section __ex_table,\"a\"	\n"
460     		".long 1b,2b			\n"
461     		".previous			\n"
462     		: "=r" (cr4): "0" (0));
463     	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
464     	show_trace(&regs->esp);
465     }
466     
467     /*
468      * No need to lock the MM as we are the last user
469      */
470     void release_segments(struct mm_struct *mm)
471     {
472     	void * ldt = mm->context.segments;
473     
474     	/*
475     	 * free the LDT
476     	 */
477     	if (ldt) {
478     		mm->context.segments = NULL;
479     		clear_LDT();
480     		vfree(ldt);
481     	}
482     }
483     
/*
 * Create a kernel thread
 *
 * Issues the clone system call with `flags | CLONE_VM'.  The side of
 * the clone whose stack pointer differs from the value saved before
 * the call is treated as the child: it calls fn(arg) and then issues
 * the exit system call.  The other side (the parent) returns whatever
 * the clone system call left in %eax.
 *
 * fn:    function the new thread runs
 * arg:   argument handed to fn
 * flags: clone flags; CLONE_VM is always added
 */
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
	long retval, d0;

	/*
	 * Operands: %0 = %eax (__NR_clone in, result out), %1 = %esi
	 * (scratch for the saved %esp), %3 = __NR_exit, %4 = arg,
	 * %5 = fn, and %ebx carries the clone flags.
	 */
	__asm__ __volatile__(
		"movl %%esp,%%esi\n\t"	/* remember the stack pointer from before the call */
		"int $0x80\n\t"		/* Linux/i386 system call */
		"cmpl %%esp,%%esi\n\t"	/* child or parent? */
		"je 1f\n\t"		/* parent - jump */
		/* Load the argument into eax, and push it.  That way, it does
		 * not matter whether the called function is compiled with
		 * -mregparm or not.  */
		"movl %4,%%eax\n\t"
		"pushl %%eax\n\t"		
		"call *%5\n\t"		/* call fn */
		"movl %3,%0\n\t"	/* exit */
		"int $0x80\n"
		"1:\t"
		:"=&a" (retval), "=&S" (d0)
		:"0" (__NR_clone), "i" (__NR_exit),
		 "r" (arg), "r" (fn),
		 "b" (flags | CLONE_VM)
		: "memory");
	return retval;
}
512     
/*
 * Free current thread data structures etc..
 *
 * The implementation in this file has nothing to release, so the
 * body is empty.
 */
void exit_thread(void)
{
	/* nothing to do ... */
}
520     
521     void flush_thread(void)
522     {
523     	struct task_struct *tsk = current;
524     
525     	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
526     	/*
527     	 * Forget coprocessor state..
528     	 */
529     	clear_fpu(tsk);
530     	tsk->used_math = 0;
531     }
532     
533     void release_thread(struct task_struct *dead_task)
534     {
535     	if (dead_task->mm) {
536     		void * ldt = dead_task->mm->context.segments;
537     
538     		// temporary debugging check
539     		if (ldt) {
540     			printk("WARNING: dead process %8s still has LDT? <%p>\n",
541     					dead_task->comm, ldt);
542     			BUG();
543     		}
544     	}
545     }
546     
547     /*
548      * we do not have to muck with descriptors here, that is
549      * done in switch_mm() as needed.
550      */
551     void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
552     {
553     	struct mm_struct * old_mm;
554     	void *old_ldt, *ldt;
555     
556     	ldt = NULL;
557     	old_mm = current->mm;
558     	if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
559     		/*
560     		 * Completely new LDT, we initialize it from the parent:
561     		 */
562     		ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
563     		if (!ldt)
564     			printk(KERN_WARNING "ldt allocation failed\n");
565     		else
566     			memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
567     	}
568     	new_mm->context.segments = ldt;
569     	new_mm->context.cpuvalid = ~0UL;	/* valid on all CPU's - they can't have stale data */
570     }
571     
/*
 * Save a segment.
 *
 * Stores the current contents of segment register `seg' (written
 * without the % sign, e.g. fs) into `value'.  The store goes through
 * an int-sized view of `value', so `value' must be an lvalue.
 */
#define savesegment(seg,value) \
	asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
577     
578     int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
579     	unsigned long unused,
580     	struct task_struct * p, struct pt_regs * regs)
581     {
582     	struct pt_regs * childregs;
583     
584     	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
585     	struct_cpy(childregs, regs);
586     	childregs->eax = 0;
587     	childregs->esp = esp;
588     
589     	p->thread.esp = (unsigned long) childregs;
590     	p->thread.esp0 = (unsigned long) (childregs+1);
591     
592     	p->thread.eip = (unsigned long) ret_from_fork;
593     
594     	savesegment(fs,p->thread.fs);
595     	savesegment(gs,p->thread.gs);
596     
597     	unlazy_fpu(current);
598     	struct_cpy(&p->thread.i387, &current->thread.i387);
599     
600     	return 0;
601     }
602     
603     /*
604      * fill in the user structure for a core dump..
605      */
606     void dump_thread(struct pt_regs * regs, struct user * dump)
607     {
608     	int i;
609     
610     /* changed the size calculations - should hopefully work better. lbt */
611     	dump->magic = CMAGIC;
612     	dump->start_code = 0;
613     	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
614     	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
615     	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
616     	dump->u_dsize -= dump->u_tsize;
617     	dump->u_ssize = 0;
618     	for (i = 0; i < 8; i++)
619     		dump->u_debugreg[i] = current->thread.debugreg[i];  
620     
621     	if (dump->start_stack < TASK_SIZE)
622     		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
623     
624     	dump->regs.ebx = regs->ebx;
625     	dump->regs.ecx = regs->ecx;
626     	dump->regs.edx = regs->edx;
627     	dump->regs.esi = regs->esi;
628     	dump->regs.edi = regs->edi;
629     	dump->regs.ebp = regs->ebp;
630     	dump->regs.eax = regs->eax;
631     	dump->regs.ds = regs->xds;
632     	dump->regs.es = regs->xes;
633     	savesegment(fs,dump->regs.fs);
634     	savesegment(gs,dump->regs.gs);
635     	dump->regs.orig_eax = regs->orig_eax;
636     	dump->regs.eip = regs->eip;
637     	dump->regs.cs = regs->xcs;
638     	dump->regs.eflags = regs->eflags;
639     	dump->regs.esp = regs->esp;
640     	dump->regs.ss = regs->xss;
641     
642     	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
643     }
644     
/*
 * This special macro can be used to load a debugging register.
 *
 * thread:   pointer to the thread_struct holding the saved values
 * register: literal register number; it is both pasted into the
 *           instruction (%dbN) and used as the index into debugreg[]
 *
 * `thread' is parenthesized so that any pointer expression may be
 * passed, not just a plain identifier.
 */
#define loaddebug(thread,register) \
		__asm__("movl %0,%%db" #register  \
			: /* no output */ \
			:"r" ((thread)->debugreg[register]))
652     
653     /*
654      *	switch_to(x,yn) should switch tasks from x to y.
655      *
656      * We fsave/fwait so that an exception goes off at the right time
657      * (as a call from the fsave or fwait in effect) rather than to
658      * the wrong process. Lazy FP saving no longer makes any sense
659      * with modern CPU's, and this simplifies a lot of things (SMP
660      * and UP become the same).
661      *
662      * NOTE! We used to use the x86 hardware context switching. The
663      * reason for not using it any more becomes apparent when you
664      * try to recover gracefully from saved state that is no longer
665      * valid (stale segment register values in particular). With the
666      * hardware task-switch, there is no way to fix up bad state in
667      * a reasonable manner.
668      *
669      * The fact that Intel documents the hardware task-switching to
670      * be slow is a fairly red herring - this code is not noticeably
671      * faster. However, there _is_ some room for improvement here,
672      * so the performance issues may eventually be a valid point.
673      * More important, however, is the fact that this allows us much
674      * more flexibility.
675      */
676     void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
677     {
678     	struct thread_struct *prev = &prev_p->thread,
679     				 *next = &next_p->thread;
680     	struct tss_struct *tss = init_tss + smp_processor_id();
681     
682     	unlazy_fpu(prev_p);
683     
684     	/*
685     	 * Reload esp0, LDT and the page table pointer:
686     	 */
687     	tss->esp0 = next->esp0;
688     
689     	/*
690     	 * Save away %fs and %gs. No need to save %es and %ds, as
691     	 * those are always kernel segments while inside the kernel.
692     	 */
693     	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
694     	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
695     
696     	/*
697     	 * Restore %fs and %gs.
698     	 */
699     	loadsegment(fs, next->fs);
700     	loadsegment(gs, next->gs);
701     
702     	/*
703     	 * Now maybe reload the debug registers
704     	 */
705     	if (next->debugreg[7]){
706     		loaddebug(next, 0);
707     		loaddebug(next, 1);
708     		loaddebug(next, 2);
709     		loaddebug(next, 3);
710     		/* no 4 and 5 */
711     		loaddebug(next, 6);
712     		loaddebug(next, 7);
713     	}
714     
715     	if (prev->ioperm || next->ioperm) {
716     		if (next->ioperm) {
717     			/*
718     			 * 4 cachelines copy ... not good, but not that
719     			 * bad either. Anyone got something better?
720     			 * This only affects processes which use ioperm().
721     			 * [Putting the TSSs into 4k-tlb mapped regions
722     			 * and playing VM tricks to switch the IO bitmap
723     			 * is not really acceptable.]
724     			 */
725     			memcpy(tss->io_bitmap, next->io_bitmap,
726     				 IO_BITMAP_SIZE*sizeof(unsigned long));
727     			tss->bitmap = IO_BITMAP_OFFSET;
728     		} else
729     			/*
730     			 * a bitmap offset pointing outside of the TSS limit
731     			 * causes a nicely controllable SIGSEGV if a process
732     			 * tries to use a port IO instruction. The first
733     			 * sys_ioperm() call sets up the bitmap properly.
734     			 */
735     			tss->bitmap = INVALID_IO_BITMAP_OFFSET;
736     	}
737     }
738     
739     asmlinkage int sys_fork(struct pt_regs regs)
740     {
741     	return do_fork(SIGCHLD, regs.esp, &regs, 0);
742     }
743     
744     asmlinkage int sys_clone(struct pt_regs regs)
745     {
746     	unsigned long clone_flags;
747     	unsigned long newsp;
748     
749     	clone_flags = regs.ebx;
750     	newsp = regs.ecx;
751     	if (!newsp)
752     		newsp = regs.esp;
753     	return do_fork(clone_flags, newsp, &regs, 0);
754     }
755     
756     /*
757      * This is trivial, and on the face of it looks like it
758      * could equally well be done in user mode.
759      *
760      * Not so, for quite unobvious reasons - register pressure.
761      * In user mode vfork() cannot have a stack frame, and if
762      * done by calling the "clone()" system call directly, you
763      * do not have enough call-clobbered registers to hold all
764      * the information you need.
765      */
766     asmlinkage int sys_vfork(struct pt_regs regs)
767     {
768     	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
769     }
770     
771     /*
772      * sys_execve() executes a new program.
773      */
774     asmlinkage int sys_execve(struct pt_regs regs)
775     {
776     	int error;
777     	char * filename;
778     
779     	filename = getname((char *) regs.ebx);
780     	error = PTR_ERR(filename);
781     	if (IS_ERR(filename))
782     		goto out;
783     	error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, &regs);
784     	if (error == 0)
785     		current->ptrace &= ~PT_DTRACE;
786     	putname(filename);
787     out:
788     	return error;
789     }
790     
791     /*
792      * These bracket the sleeping functions..
793      */
794     extern void scheduling_functions_start_here(void);
795     extern void scheduling_functions_end_here(void);
796     #define first_sched	((unsigned long) scheduling_functions_start_here)
797     #define last_sched	((unsigned long) scheduling_functions_end_here)
798     
799     unsigned long get_wchan(struct task_struct *p)
800     {
801     	unsigned long ebp, esp, eip;
802     	unsigned long stack_page;
803     	int count = 0;
804     	if (!p || p == current || p->state == TASK_RUNNING)
805     		return 0;
806     	stack_page = (unsigned long)p;
807     	esp = p->thread.esp;
808     	if (!stack_page || esp < stack_page || esp > 8188+stack_page)
809     		return 0;
810     	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
811     	ebp = *(unsigned long *) esp;
812     	do {
813     		if (ebp < stack_page || ebp > 8184+stack_page)
814     			return 0;
815     		eip = *(unsigned long *) (ebp+4);
816     		if (eip < first_sched || eip >= last_sched)
817     			return eip;
818     		ebp = *(unsigned long *) ebp;
819     	} while (count++ < 16);
820     	return 0;
821     }
822     #undef last_sched
823     #undef first_sched
824