File: /usr/src/linux/kernel/sched.c

1     /*
2      *  linux/kernel/sched.c
3      *
4      *  Kernel scheduler and related syscalls
5      *
6      *  Copyright (C) 1991, 1992  Linus Torvalds
7      *
8      *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9      *              make semaphores SMP safe
10      *  1998-11-19	Implemented schedule_timeout() and related stuff
11      *		by Andrea Arcangeli
12      *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
13      */
14     
15     /*
16      * 'sched.c' is the main kernel file. It contains scheduling primitives
17      * (sleep_on, wakeup, schedule etc) as well as a number of simple system
18      * call functions (type getpid()), which just extract a field from
19      * current-task
20      */
21     
22     #include <linux/config.h>
23     #include <linux/mm.h>
24     #include <linux/init.h>
25     #include <linux/smp_lock.h>
26     #include <linux/nmi.h>
27     #include <linux/interrupt.h>
28     #include <linux/kernel_stat.h>
29     #include <linux/completion.h>
30     #include <linux/prefetch.h>
31     
32     #include <asm/uaccess.h>
33     #include <asm/mmu_context.h>
34     
35     extern void timer_bh(void);
36     extern void tqueue_bh(void);
37     extern void immediate_bh(void);
38     
39     /*
40      * scheduler variables
41      */
42     
43     unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
44     
45     extern void mem_use(void);
46     
47     /*
48      * Scheduling quanta.
49      *
50      * NOTE! The unix "nice" value influences how long a process
51      * gets. The nice value ranges from -20 to +19, where a -20
52      * is a "high-priority" task, and a "+10" is a low-priority
53      * task.
54      *
55      * We want the time-slice to be around 50ms or so, so this
56      * calculation depends on the value of HZ.
57      */
58     #if HZ < 200
59     #define TICK_SCALE(x)	((x) >> 2)
60     #elif HZ < 400
61     #define TICK_SCALE(x)	((x) >> 1)
62     #elif HZ < 800
63     #define TICK_SCALE(x)	(x)
64     #elif HZ < 1600
65     #define TICK_SCALE(x)	((x) << 1)
66     #else
67     #define TICK_SCALE(x)	((x) << 2)
68     #endif
69     
70     #define NICE_TO_TICKS(nice)	(TICK_SCALE(20-(nice))+1)
71     
72     
73     /*
74      *	Init task must be ok at boot for the ix86 as we will check its signals
75      *	via the SMP irq return path.
76      */
77      
78     struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
79     
80     /*
81      * The tasklist_lock protects the linked list of processes.
82      *
83      * The runqueue_lock locks the parts that actually access
84      * and change the run-queues, and have to be interrupt-safe.
85      *
86      * If both locks are to be concurrently held, the runqueue_lock
87      * nests inside the tasklist_lock.
88      *
89      * task->alloc_lock nests inside tasklist_lock.
90      */
91     spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
92     rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
93     
94     static LIST_HEAD(runqueue_head);
95     
96     /*
97      * We align per-CPU scheduling data on cacheline boundaries,
98      * to prevent cacheline ping-pong.
99      */
100     static union {
101     	struct schedule_data {
102     		struct task_struct * curr;
103     		cycles_t last_schedule;
104     	} schedule_data;
105     	char __pad [SMP_CACHE_BYTES];
106     } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
107     
108     #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
109     #define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
110     
111     struct kernel_stat kstat;
112     extern struct task_struct *child_reaper;
113     
114     #ifdef CONFIG_SMP
115     
116     #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
117     #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
118     				((p)->cpus_allowed & (1 << cpu)))
119     
120     #else
121     
122     #define idle_task(cpu) (&init_task)
123     #define can_schedule(p,cpu) (1)
124     
125     #endif
126     
127     void scheduling_functions_start_here(void) { }
128     
129     /*
130      * This is the function that decides how desirable a process is..
131      * You can weigh different processes against each other depending
132      * on what CPU they've run on lately etc to try to handle cache
133      * and TLB miss penalties.
134      *
135      * Return values:
136      *	 -1000: never select this
137      *	     0: out of time, recalculate counters (but it might still be
138      *		selected)
139      *	   +ve: "goodness" value (the larger, the better)
140      *	 +1000: realtime process, select this.
141      */
142     
143     static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
144     {
145     	int weight;
146     
147     	/*
148     	 * select the current process after every other
149     	 * runnable process, but before the idle thread.
150     	 * Also, dont trigger a counter recalculation.
151     	 */
152     	weight = -1;
153     	if (p->policy & SCHED_YIELD)
154     		goto out;
155     
156     	/*
157     	 * Non-RT process - normal case first.
158     	 */
159     	if (p->policy == SCHED_OTHER) {
160     		/*
161     		 * Give the process a first-approximation goodness value
162     		 * according to the number of clock-ticks it has left.
163     		 *
164     		 * Don't do any other calculations if the time slice is
165     		 * over..
166     		 */
167     		weight = p->counter;
168     		if (!weight)
169     			goto out;
170     			
171     #ifdef CONFIG_SMP
172     		/* Give a largish advantage to the same processor...   */
173     		/* (this is equivalent to penalizing other processors) */
174     		if (p->processor == this_cpu)
175     			weight += PROC_CHANGE_PENALTY;
176     #endif
177     
178     		/* .. and a slight advantage to the current MM */
179     		if (p->mm == this_mm || !p->mm)
180     			weight += 1;
181     		weight += 20 - p->nice;
182     		goto out;
183     	}
184     
185     	/*
186     	 * Realtime process, select the first one on the
187     	 * runqueue (taking priorities within processes
188     	 * into account).
189     	 */
190     	weight = 1000 + p->rt_priority;
191     out:
192     	return weight;
193     }
194     
195     /*
196      * the 'goodness value' of replacing a process on a given CPU.
197      * positive value means 'replace', zero or negative means 'dont'.
198      */
199     static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
200     {
201     	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
202     }
203     
204     /*
205      * This is ugly, but reschedule_idle() is very timing-critical.
206      * We are called with the runqueue spinlock held and we must
207      * not claim the tasklist_lock.
208      */
209     static FASTCALL(void reschedule_idle(struct task_struct * p));
210     
211     static void reschedule_idle(struct task_struct * p)
212     {
213     #ifdef CONFIG_SMP
214     	int this_cpu = smp_processor_id();
215     	struct task_struct *tsk, *target_tsk;
216     	int cpu, best_cpu, i, max_prio;
217     	cycles_t oldest_idle;
218     
219     	/*
220     	 * shortcut if the woken up task's last CPU is
221     	 * idle now.
222     	 */
223     	best_cpu = p->processor;
224     	if (can_schedule(p, best_cpu)) {
225     		tsk = idle_task(best_cpu);
226     		if (cpu_curr(best_cpu) == tsk) {
227     			int need_resched;
228     send_now_idle:
229     			/*
230     			 * If need_resched == -1 then we can skip sending
231     			 * the IPI altogether, tsk->need_resched is
232     			 * actively watched by the idle thread.
233     			 */
234     			need_resched = tsk->need_resched;
235     			tsk->need_resched = 1;
236     			if ((best_cpu != this_cpu) && !need_resched)
237     				smp_send_reschedule(best_cpu);
238     			return;
239     		}
240     	}
241     
242     	/*
243     	 * We know that the preferred CPU has a cache-affine current
244     	 * process, lets try to find a new idle CPU for the woken-up
245     	 * process. Select the least recently active idle CPU. (that
246     	 * one will have the least active cache context.) Also find
247     	 * the executing process which has the least priority.
248     	 */
249     	oldest_idle = (cycles_t) -1;
250     	target_tsk = NULL;
251     	max_prio = 0;
252     
253     	for (i = 0; i < smp_num_cpus; i++) {
254     		cpu = cpu_logical_map(i);
255     		if (!can_schedule(p, cpu))
256     			continue;
257     		tsk = cpu_curr(cpu);
258     		/*
259     		 * We use the first available idle CPU. This creates
260     		 * a priority list between idle CPUs, but this is not
261     		 * a problem.
262     		 */
263     		if (tsk == idle_task(cpu)) {
264     			if (last_schedule(cpu) < oldest_idle) {
265     				oldest_idle = last_schedule(cpu);
266     				target_tsk = tsk;
267     			}
268     		} else {
269     			if (oldest_idle == -1ULL) {
270     				int prio = preemption_goodness(tsk, p, cpu);
271     
272     				if (prio > max_prio) {
273     					max_prio = prio;
274     					target_tsk = tsk;
275     				}
276     			}
277     		}
278     	}
279     	tsk = target_tsk;
280     	if (tsk) {
281     		if (oldest_idle != -1ULL) {
282     			best_cpu = tsk->processor;
283     			goto send_now_idle;
284     		}
285     		tsk->need_resched = 1;
286     		if (tsk->processor != this_cpu)
287     			smp_send_reschedule(tsk->processor);
288     	}
289     	return;
290     		
291     
292     #else /* UP */
293     	int this_cpu = smp_processor_id();
294     	struct task_struct *tsk;
295     
296     	tsk = cpu_curr(this_cpu);
297     	if (preemption_goodness(tsk, p, this_cpu) > 0)
298     		tsk->need_resched = 1;
299     #endif
300     }
301     
302     /*
303      * Careful!
304      *
305      * This has to add the process to the _beginning_ of the
306      * run-queue, not the end. See the comment about "This is
307      * subtle" in the scheduler proper..
308      */
309     static inline void add_to_runqueue(struct task_struct * p)
310     {
311     	list_add(&p->run_list, &runqueue_head);
312     	nr_running++;
313     }
314     
315     static inline void move_last_runqueue(struct task_struct * p)
316     {
317     	list_del(&p->run_list);
318     	list_add_tail(&p->run_list, &runqueue_head);
319     }
320     
321     static inline void move_first_runqueue(struct task_struct * p)
322     {
323     	list_del(&p->run_list);
324     	list_add(&p->run_list, &runqueue_head);
325     }
326     
327     /*
328      * Wake up a process. Put it on the run-queue if it's not
329      * already there.  The "current" process is always on the
330      * run-queue (except when the actual re-schedule is in
331      * progress), and as such you're allowed to do the simpler
332      * "current->state = TASK_RUNNING" to mark yourself runnable
333      * without the overhead of this.
334      */
335     static inline int try_to_wake_up(struct task_struct * p, int synchronous)
336     {
337     	unsigned long flags;
338     	int success = 0;
339     
340     	/*
341     	 * We want the common case fall through straight, thus the goto.
342     	 */
343     	spin_lock_irqsave(&runqueue_lock, flags);
344     	p->state = TASK_RUNNING;
345     	if (task_on_runqueue(p))
346     		goto out;
347     	add_to_runqueue(p);
348     	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
349     		reschedule_idle(p);
350     	success = 1;
351     out:
352     	spin_unlock_irqrestore(&runqueue_lock, flags);
353     	return success;
354     }
355     
356     inline int wake_up_process(struct task_struct * p)
357     {
358     	return try_to_wake_up(p, 0);
359     }
360     
361     static void process_timeout(unsigned long __data)
362     {
363     	struct task_struct * p = (struct task_struct *) __data;
364     
365     	wake_up_process(p);
366     }
367     
368     /**
369      * schedule_timeout - sleep until timeout
370      * @timeout: timeout value in jiffies
371      *
372      * Make the current task sleep until @timeout jiffies have
373      * elapsed. The routine will return immediately unless
374      * the current task state has been set (see set_current_state()).
375      *
376      * You can set the task state as follows -
377      *
378      * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
379      * pass before the routine returns. The routine will return 0
380      *
381      * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
382      * delivered to the current task. In this case the remaining time
383      * in jiffies will be returned, or 0 if the timer expired in time
384      *
385      * The current task state is guaranteed to be TASK_RUNNING when this 
386      * routine returns.
387      *
388      * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
389      * the CPU away without a bound on the timeout. In this case the return
390      * value will be %MAX_SCHEDULE_TIMEOUT.
391      *
392      * In all cases the return value is guaranteed to be non-negative.
393      */
394     signed long schedule_timeout(signed long timeout)
395     {
396     	struct timer_list timer;
397     	unsigned long expire;
398     
399     	switch (timeout)
400     	{
401     	case MAX_SCHEDULE_TIMEOUT:
402     		/*
403     		 * These two special cases are useful to be comfortable
404     		 * in the caller. Nothing more. We could take
405     		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
406     		 * but I' d like to return a valid offset (>=0) to allow
407     		 * the caller to do everything it want with the retval.
408     		 */
409     		schedule();
410     		goto out;
411     	default:
412     		/*
413     		 * Another bit of PARANOID. Note that the retval will be
414     		 * 0 since no piece of kernel is supposed to do a check
415     		 * for a negative retval of schedule_timeout() (since it
416     		 * should never happens anyway). You just have the printk()
417     		 * that will tell you if something is gone wrong and where.
418     		 */
419     		if (timeout < 0)
420     		{
421     			printk(KERN_ERR "schedule_timeout: wrong timeout "
422     			       "value %lx from %p\n", timeout,
423     			       __builtin_return_address(0));
424     			current->state = TASK_RUNNING;
425     			goto out;
426     		}
427     	}
428     
429     	expire = timeout + jiffies;
430     
431     	init_timer(&timer);
432     	timer.expires = expire;
433     	timer.data = (unsigned long) current;
434     	timer.function = process_timeout;
435     
436     	add_timer(&timer);
437     	schedule();
438     	del_timer_sync(&timer);
439     
440     	timeout = expire - jiffies;
441     
442      out:
443     	return timeout < 0 ? 0 : timeout;
444     }
445     
446     /*
447      * schedule_tail() is getting called from the fork return path. This
448      * cleans up all remaining scheduler things, without impacting the
449      * common case.
450      */
451     static inline void __schedule_tail(struct task_struct *prev)
452     {
453     #ifdef CONFIG_SMP
454     	int policy;
455     
456     	/*
457     	 * prev->policy can be written from here only before `prev'
458     	 * can be scheduled (before setting prev->has_cpu to zero).
459     	 * Of course it must also be read before allowing prev
460     	 * to be rescheduled, but since the write depends on the read
461     	 * to complete, wmb() is enough. (the spin_lock() acquired
462     	 * before setting has_cpu is not enough because the spin_lock()
463     	 * common code semantics allows code outside the critical section
464     	 * to enter inside the critical section)
465     	 */
466     	policy = prev->policy;
467     	prev->policy = policy & ~SCHED_YIELD;
468     	wmb();
469     
470     	/*
471     	 * fast path falls through. We have to clear has_cpu before
472     	 * checking prev->state to avoid a wakeup race - thus we
473     	 * also have to protect against the task exiting early.
474     	 */
475     	task_lock(prev);
476     	prev->has_cpu = 0;
477     	mb();
478     	if (prev->state == TASK_RUNNING)
479     		goto needs_resched;
480     
481     out_unlock:
482     	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
483     	return;
484     
485     	/*
486     	 * Slow path - we 'push' the previous process and
487     	 * reschedule_idle() will attempt to find a new
488     	 * processor for it. (but it might preempt the
489     	 * current process as well.) We must take the runqueue
490     	 * lock and re-check prev->state to be correct. It might
491     	 * still happen that this process has a preemption
492     	 * 'in progress' already - but this is not a problem and
493     	 * might happen in other circumstances as well.
494     	 */
495     needs_resched:
496     	{
497     		unsigned long flags;
498     
499     		/*
500     		 * Avoid taking the runqueue lock in cases where
501     		 * no preemption-check is necessery:
502     		 */
503     		if ((prev == idle_task(smp_processor_id())) ||
504     						(policy & SCHED_YIELD))
505     			goto out_unlock;
506     
507     		spin_lock_irqsave(&runqueue_lock, flags);
508     		if ((prev->state == TASK_RUNNING) && !prev->has_cpu)
509     			reschedule_idle(prev);
510     		spin_unlock_irqrestore(&runqueue_lock, flags);
511     		goto out_unlock;
512     	}
513     #else
514     	prev->policy &= ~SCHED_YIELD;
515     #endif /* CONFIG_SMP */
516     }
517     
518     void schedule_tail(struct task_struct *prev)
519     {
520     	__schedule_tail(prev);
521     }
522     
523     /*
524      *  'schedule()' is the scheduler function. It's a very simple and nice
525      * scheduler: it's not perfect, but certainly works for most things.
526      *
527      * The goto is "interesting".
528      *
529      *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
530      * tasks can run. It can not be killed, and it cannot sleep. The 'state'
531      * information in task[0] is never used.
532      */
533     asmlinkage void schedule(void)
534     {
535     	struct schedule_data * sched_data;
536     	struct task_struct *prev, *next, *p;
537     	struct list_head *tmp;
538     	int this_cpu, c;
539     
540     
541     	spin_lock_prefetch(&runqueue_lock);
542     
543     	if (!current->active_mm) BUG();
544     need_resched_back:
545     	prev = current;
546     	this_cpu = prev->processor;
547     
548     	if (in_interrupt())
549     		goto scheduling_in_interrupt;
550     
551     	release_kernel_lock(prev, this_cpu);
552     
553     	/*
554     	 * 'sched_data' is protected by the fact that we can run
555     	 * only one process per CPU.
556     	 */
557     	sched_data = & aligned_data[this_cpu].schedule_data;
558     
559     	spin_lock_irq(&runqueue_lock);
560     
561     	/* move an exhausted RR process to be last.. */
562     	if (prev->policy == SCHED_RR)
563     		goto move_rr_last;
564     move_rr_back:
565     
566     	switch (prev->state) {
567     		case TASK_INTERRUPTIBLE:
568     			if (signal_pending(prev)) {
569     				prev->state = TASK_RUNNING;
570     				break;
571     			}
572     		default:
573     			del_from_runqueue(prev);
574     		case TASK_RUNNING:;
575     	}
576     	prev->need_resched = 0;
577     
578     	/*
579     	 * this is the scheduler proper:
580     	 */
581     
582     repeat_schedule:
583     	/*
584     	 * Default process to select..
585     	 */
586     	next = idle_task(this_cpu);
587     	c = -1000;
588     	if (prev->state == TASK_RUNNING)
589     		goto still_running;
590     
591     still_running_back:
592     	list_for_each(tmp, &runqueue_head) {
593     		p = list_entry(tmp, struct task_struct, run_list);
594     		if (can_schedule(p, this_cpu)) {
595     			int weight = goodness(p, this_cpu, prev->active_mm);
596     			if (weight > c)
597     				c = weight, next = p;
598     		}
599     	}
600     
601     	/* Do we need to re-calculate counters? */
602     	if (!c)
603     		goto recalculate;
604     	/*
605     	 * from this point on nothing can prevent us from
606     	 * switching to the next task, save this fact in
607     	 * sched_data.
608     	 */
609     	sched_data->curr = next;
610     #ifdef CONFIG_SMP
611      	next->has_cpu = 1;
612     	next->processor = this_cpu;
613     #endif
614     	spin_unlock_irq(&runqueue_lock);
615     
616     	if (prev == next) {
617     		/* We won't go through the normal tail, so do this by hand */
618     		prev->policy &= ~SCHED_YIELD;
619     		goto same_process;
620     	}
621     
622     #ifdef CONFIG_SMP
623      	/*
624      	 * maintain the per-process 'last schedule' value.
625      	 * (this has to be recalculated even if we reschedule to
626      	 * the same process) Currently this is only used on SMP,
627     	 * and it's approximate, so we do not have to maintain
628     	 * it while holding the runqueue spinlock.
629      	 */
630      	sched_data->last_schedule = get_cycles();
631     
632     	/*
633     	 * We drop the scheduler lock early (it's a global spinlock),
634     	 * thus we have to lock the previous process from getting
635     	 * rescheduled during switch_to().
636     	 */
637     
638     #endif /* CONFIG_SMP */
639     
640     	kstat.context_swtch++;
641     	/*
642     	 * there are 3 processes which are affected by a context switch:
643     	 *
644     	 * prev == .... ==> (last => next)
645     	 *
646     	 * It's the 'much more previous' 'prev' that is on next's stack,
647     	 * but prev is set to (the just run) 'last' process by switch_to().
648     	 * This might sound slightly confusing but makes tons of sense.
649     	 */
650     	prepare_to_switch();
651     	{
652     		struct mm_struct *mm = next->mm;
653     		struct mm_struct *oldmm = prev->active_mm;
654     		if (!mm) {
655     			if (next->active_mm) BUG();
656     			next->active_mm = oldmm;
657     			atomic_inc(&oldmm->mm_count);
658     			enter_lazy_tlb(oldmm, next, this_cpu);
659     		} else {
660     			if (next->active_mm != mm) BUG();
661     			switch_mm(oldmm, mm, next, this_cpu);
662     		}
663     
664     		if (!prev->mm) {
665     			prev->active_mm = NULL;
666     			mmdrop(oldmm);
667     		}
668     	}
669     
670     	/*
671     	 * This just switches the register state and the
672     	 * stack.
673     	 */
674     	switch_to(prev, next, prev);
675     	__schedule_tail(prev);
676     
677     same_process:
678     	reacquire_kernel_lock(current);
679     	if (current->need_resched)
680     		goto need_resched_back;
681     
682     	return;
683     
684     recalculate:
685     	{
686     		struct task_struct *p;
687     		spin_unlock_irq(&runqueue_lock);
688     		read_lock(&tasklist_lock);
689     		for_each_task(p)
690     			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
691     		read_unlock(&tasklist_lock);
692     		spin_lock_irq(&runqueue_lock);
693     	}
694     	goto repeat_schedule;
695     
696     still_running:
697     	if (!(prev->cpus_allowed & (1UL << this_cpu)))
698     		goto still_running_back;
699     	c = goodness(prev, this_cpu, prev->active_mm);
700     	next = prev;
701     	goto still_running_back;
702     
703     move_rr_last:
704     	if (!prev->counter) {
705     		prev->counter = NICE_TO_TICKS(prev->nice);
706     		move_last_runqueue(prev);
707     	}
708     	goto move_rr_back;
709     
710     scheduling_in_interrupt:
711     	printk("Scheduling in interrupt\n");
712     	BUG();
713     	return;
714     }
715     
716     /*
717      * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just wake everything
718      * up.  If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
719      * non-exclusive tasks and one exclusive task.
720      *
721      * There are circumstances in which we can try to wake a task which has already
722      * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns zero
723      * in this (rare) case, and we handle it by contonuing to scan the queue.
724      */
725     static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
726     			 	     int nr_exclusive, const int sync)
727     {
728     	struct list_head *tmp;
729     	struct task_struct *p;
730     
731     	CHECK_MAGIC_WQHEAD(q);
732     	WQ_CHECK_LIST_HEAD(&q->task_list);
733     	
734     	list_for_each(tmp,&q->task_list) {
735     		unsigned int state;
736                     wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
737     
738     		CHECK_MAGIC(curr->__magic);
739     		p = curr->task;
740     		state = p->state;
741     		if (state & mode) {
742     			WQ_NOTE_WAKER(curr);
743     			if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
744     				break;
745     		}
746     	}
747     }
748     
749     void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr)
750     {
751     	if (q) {
752     		unsigned long flags;
753     		wq_read_lock_irqsave(&q->lock, flags);
754     		__wake_up_common(q, mode, nr, 0);
755     		wq_read_unlock_irqrestore(&q->lock, flags);
756     	}
757     }
758     
759     void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)
760     {
761     	if (q) {
762     		unsigned long flags;
763     		wq_read_lock_irqsave(&q->lock, flags);
764     		__wake_up_common(q, mode, nr, 1);
765     		wq_read_unlock_irqrestore(&q->lock, flags);
766     	}
767     }
768     
769     void complete(struct completion *x)
770     {
771     	unsigned long flags;
772     
773     	spin_lock_irqsave(&x->wait.lock, flags);
774     	x->done++;
775     	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
776     	spin_unlock_irqrestore(&x->wait.lock, flags);
777     }
778     
779     void wait_for_completion(struct completion *x)
780     {
781     	spin_lock_irq(&x->wait.lock);
782     	if (!x->done) {
783     		DECLARE_WAITQUEUE(wait, current);
784     
785     		wait.flags |= WQ_FLAG_EXCLUSIVE;
786     		__add_wait_queue_tail(&x->wait, &wait);
787     		do {
788     			__set_current_state(TASK_UNINTERRUPTIBLE);
789     			spin_unlock_irq(&x->wait.lock);
790     			schedule();
791     			spin_lock_irq(&x->wait.lock);
792     		} while (!x->done);
793     		__remove_wait_queue(&x->wait, &wait);
794     	}
795     	x->done--;
796     	spin_unlock_irq(&x->wait.lock);
797     }
798     
799     #define	SLEEP_ON_VAR				\
800     	unsigned long flags;			\
801     	wait_queue_t wait;			\
802     	init_waitqueue_entry(&wait, current);
803     
804     #define	SLEEP_ON_HEAD					\
805     	wq_write_lock_irqsave(&q->lock,flags);		\
806     	__add_wait_queue(q, &wait);			\
807     	wq_write_unlock(&q->lock);
808     
809     #define	SLEEP_ON_TAIL						\
810     	wq_write_lock_irq(&q->lock);				\
811     	__remove_wait_queue(q, &wait);				\
812     	wq_write_unlock_irqrestore(&q->lock,flags);
813     
814     void interruptible_sleep_on(wait_queue_head_t *q)
815     {
816     	SLEEP_ON_VAR
817     
818     	current->state = TASK_INTERRUPTIBLE;
819     
820     	SLEEP_ON_HEAD
821     	schedule();
822     	SLEEP_ON_TAIL
823     }
824     
825     long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
826     {
827     	SLEEP_ON_VAR
828     
829     	current->state = TASK_INTERRUPTIBLE;
830     
831     	SLEEP_ON_HEAD
832     	timeout = schedule_timeout(timeout);
833     	SLEEP_ON_TAIL
834     
835     	return timeout;
836     }
837     
838     void sleep_on(wait_queue_head_t *q)
839     {
840     	SLEEP_ON_VAR
841     	
842     	current->state = TASK_UNINTERRUPTIBLE;
843     
844     	SLEEP_ON_HEAD
845     	schedule();
846     	SLEEP_ON_TAIL
847     }
848     
849     long sleep_on_timeout(wait_queue_head_t *q, long timeout)
850     {
851     	SLEEP_ON_VAR
852     	
853     	current->state = TASK_UNINTERRUPTIBLE;
854     
855     	SLEEP_ON_HEAD
856     	timeout = schedule_timeout(timeout);
857     	SLEEP_ON_TAIL
858     
859     	return timeout;
860     }
861     
862     void scheduling_functions_end_here(void) { }
863     
864     #ifndef __alpha__
865     
866     /*
867      * This has been replaced by sys_setpriority.  Maybe it should be
868      * moved into the arch dependent tree for those ports that require
869      * it for backward compatibility?
870      */
871     
872     asmlinkage long sys_nice(int increment)
873     {
874     	long newprio;
875     
876     	/*
877     	 *	Setpriority might change our priority at the same moment.
878     	 *	We don't have to worry. Conceptually one call occurs first
879     	 *	and we have a single winner.
880     	 */
881     	if (increment < 0) {
882     		if (!capable(CAP_SYS_NICE))
883     			return -EPERM;
884     		if (increment < -40)
885     			increment = -40;
886     	}
887     	if (increment > 40)
888     		increment = 40;
889     
890     	newprio = current->nice + increment;
891     	if (newprio < -20)
892     		newprio = -20;
893     	if (newprio > 19)
894     		newprio = 19;
895     	current->nice = newprio;
896     	return 0;
897     }
898     
899     #endif
900     
901     static inline struct task_struct *find_process_by_pid(pid_t pid)
902     {
903     	struct task_struct *tsk = current;
904     
905     	if (pid)
906     		tsk = find_task_by_pid(pid);
907     	return tsk;
908     }
909     
910     static int setscheduler(pid_t pid, int policy, 
911     			struct sched_param *param)
912     {
913     	struct sched_param lp;
914     	struct task_struct *p;
915     	int retval;
916     
917     	retval = -EINVAL;
918     	if (!param || pid < 0)
919     		goto out_nounlock;
920     
921     	retval = -EFAULT;
922     	if (copy_from_user(&lp, param, sizeof(struct sched_param)))
923     		goto out_nounlock;
924     
925     	/*
926     	 * We play safe to avoid deadlocks.
927     	 */
928     	read_lock_irq(&tasklist_lock);
929     	spin_lock(&runqueue_lock);
930     
931     	p = find_process_by_pid(pid);
932     
933     	retval = -ESRCH;
934     	if (!p)
935     		goto out_unlock;
936     			
937     	if (policy < 0)
938     		policy = p->policy;
939     	else {
940     		retval = -EINVAL;
941     		if (policy != SCHED_FIFO && policy != SCHED_RR &&
942     				policy != SCHED_OTHER)
943     			goto out_unlock;
944     	}
945     	
946     	/*
947     	 * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
948     	 * priority for SCHED_OTHER is 0.
949     	 */
950     	retval = -EINVAL;
951     	if (lp.sched_priority < 0 || lp.sched_priority > 99)
952     		goto out_unlock;
953     	if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
954     		goto out_unlock;
955     
956     	retval = -EPERM;
957     	if ((policy == SCHED_FIFO || policy == SCHED_RR) && 
958     	    !capable(CAP_SYS_NICE))
959     		goto out_unlock;
960     	if ((current->euid != p->euid) && (current->euid != p->uid) &&
961     	    !capable(CAP_SYS_NICE))
962     		goto out_unlock;
963     
964     	retval = 0;
965     	p->policy = policy;
966     	p->rt_priority = lp.sched_priority;
967     	if (task_on_runqueue(p))
968     		move_first_runqueue(p);
969     
970     	current->need_resched = 1;
971     
972     out_unlock:
973     	spin_unlock(&runqueue_lock);
974     	read_unlock_irq(&tasklist_lock);
975     
976     out_nounlock:
977     	return retval;
978     }
979     
980     asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 
981     				      struct sched_param *param)
982     {
983     	return setscheduler(pid, policy, param);
984     }
985     
986     asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
987     {
988     	return setscheduler(pid, -1, param);
989     }
990     
991     asmlinkage long sys_sched_getscheduler(pid_t pid)
992     {
993     	struct task_struct *p;
994     	int retval;
995     
996     	retval = -EINVAL;
997     	if (pid < 0)
998     		goto out_nounlock;
999     
1000     	retval = -ESRCH;
1001     	read_lock(&tasklist_lock);
1002     	p = find_process_by_pid(pid);
1003     	if (p)
1004     		retval = p->policy & ~SCHED_YIELD;
1005     	read_unlock(&tasklist_lock);
1006     
1007     out_nounlock:
1008     	return retval;
1009     }
1010     
1011     asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1012     {
1013     	struct task_struct *p;
1014     	struct sched_param lp;
1015     	int retval;
1016     
1017     	retval = -EINVAL;
1018     	if (!param || pid < 0)
1019     		goto out_nounlock;
1020     
1021     	read_lock(&tasklist_lock);
1022     	p = find_process_by_pid(pid);
1023     	retval = -ESRCH;
1024     	if (!p)
1025     		goto out_unlock;
1026     	lp.sched_priority = p->rt_priority;
1027     	read_unlock(&tasklist_lock);
1028     
1029     	/*
1030     	 * This one might sleep, we cannot do it with a spinlock held ...
1031     	 */
1032     	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1033     
1034     out_nounlock:
1035     	return retval;
1036     
1037     out_unlock:
1038     	read_unlock(&tasklist_lock);
1039     	return retval;
1040     }
1041     
1042     asmlinkage long sys_sched_yield(void)
1043     {
1044     	/*
1045     	 * Trick. sched_yield() first counts the number of truly 
1046     	 * 'pending' runnable processes, then returns if it's
1047     	 * only the current processes. (This test does not have
1048     	 * to be atomic.) In threaded applications this optimization
1049     	 * gets triggered quite often.
1050     	 */
1051     
1052     	int nr_pending = nr_running;
1053     
1054     #if CONFIG_SMP
1055     	int i;
1056     
1057     	// Subtract non-idle processes running on other CPUs.
1058     	for (i = 0; i < smp_num_cpus; i++) {
1059     		int cpu = cpu_logical_map(i);
1060     		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
1061     			nr_pending--;
1062     	}
1063     #else
1064     	// on UP this process is on the runqueue as well
1065     	nr_pending--;
1066     #endif
1067     	if (nr_pending) {
1068     		/*
1069     		 * This process can only be rescheduled by us,
1070     		 * so this is safe without any locking.
1071     		 */
1072     		if (current->policy == SCHED_OTHER)
1073     			current->policy |= SCHED_YIELD;
1074     		current->need_resched = 1;
1075     	}
1076     	return 0;
1077     }
1078     
1079     asmlinkage long sys_sched_get_priority_max(int policy)
1080     {
1081     	int ret = -EINVAL;
1082     
1083     	switch (policy) {
1084     	case SCHED_FIFO:
1085     	case SCHED_RR:
1086     		ret = 99;
1087     		break;
1088     	case SCHED_OTHER:
1089     		ret = 0;
1090     		break;
1091     	}
1092     	return ret;
1093     }
1094     
1095     asmlinkage long sys_sched_get_priority_min(int policy)
1096     {
1097     	int ret = -EINVAL;
1098     
1099     	switch (policy) {
1100     	case SCHED_FIFO:
1101     	case SCHED_RR:
1102     		ret = 1;
1103     		break;
1104     	case SCHED_OTHER:
1105     		ret = 0;
1106     	}
1107     	return ret;
1108     }
1109     
1110     asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1111     {
1112     	struct timespec t;
1113     	struct task_struct *p;
1114     	int retval = -EINVAL;
1115     
1116     	if (pid < 0)
1117     		goto out_nounlock;
1118     
1119     	retval = -ESRCH;
1120     	read_lock(&tasklist_lock);
1121     	p = find_process_by_pid(pid);
1122     	if (p)
1123     		jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
1124     				    &t);
1125     	read_unlock(&tasklist_lock);
1126     	if (p)
1127     		retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1128     out_nounlock:
1129     	return retval;
1130     }
1131     
1132     static void show_task(struct task_struct * p)
1133     {
1134     	unsigned long free = 0;
1135     	int state;
1136     	static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1137     
1138     	printk("%-13.13s ", p->comm);
1139     	state = p->state ? ffz(~p->state) + 1 : 0;
1140     	if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1141     		printk(stat_nam[state]);
1142     	else
1143     		printk(" ");
1144     #if (BITS_PER_LONG == 32)
1145     	if (p == current)
1146     		printk(" current  ");
1147     	else
1148     		printk(" %08lX ", thread_saved_pc(&p->thread));
1149     #else
1150     	if (p == current)
1151     		printk("   current task   ");
1152     	else
1153     		printk(" %016lx ", thread_saved_pc(&p->thread));
1154     #endif
1155     	{
1156     		unsigned long * n = (unsigned long *) (p+1);
1157     		while (!*n)
1158     			n++;
1159     		free = (unsigned long) n - (unsigned long)(p+1);
1160     	}
1161     	printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1162     	if (p->p_cptr)
1163     		printk("%5d ", p->p_cptr->pid);
1164     	else
1165     		printk("      ");
1166     	if (p->p_ysptr)
1167     		printk("%7d", p->p_ysptr->pid);
1168     	else
1169     		printk("       ");
1170     	if (p->p_osptr)
1171     		printk(" %5d", p->p_osptr->pid);
1172     	else
1173     		printk("      ");
1174     	if (!p->mm)
1175     		printk(" (L-TLB)\n");
1176     	else
1177     		printk(" (NOTLB)\n");
1178     
1179     #if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_ARM) || defined(CONFIG_ALPHA)
1180     /* This is very useful, but only works on ARM, x86 and sparc64 right now */
1181     	{
1182     		extern void show_trace_task(struct task_struct *tsk);
1183     		show_trace_task(p);
1184     	}
1185     #endif
1186     }
1187     
1188     char * render_sigset_t(sigset_t *set, char *buffer)
1189     {
1190     	int i = _NSIG, x;
1191     	do {
1192     		i -= 4, x = 0;
1193     		if (sigismember(set, i+1)) x |= 1;
1194     		if (sigismember(set, i+2)) x |= 2;
1195     		if (sigismember(set, i+3)) x |= 4;
1196     		if (sigismember(set, i+4)) x |= 8;
1197     		*buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1198     	} while (i >= 4);
1199     	*buffer = 0;
1200     	return buffer;
1201     }
1202     
1203     void show_state(void)
1204     {
1205     	struct task_struct *p;
1206     
1207     #if (BITS_PER_LONG == 32)
1208     	printk("\n"
1209     	       "                         free                        sibling\n");
1210     	printk("  task             PC    stack   pid father child younger older\n");
1211     #else
1212     	printk("\n"
1213     	       "                                 free                        sibling\n");
1214     	printk("  task                 PC        stack   pid father child younger older\n");
1215     #endif
1216     	read_lock(&tasklist_lock);
1217     	for_each_task(p) {
1218     		/*
1219     		 * reset the NMI-timeout, listing all files on a slow
1220     		 * console might take alot of time:
1221     		 */
1222     		touch_nmi_watchdog();
1223     		show_task(p);
1224     	}
1225     	read_unlock(&tasklist_lock);
1226     }
1227     
1228     /**
1229      * reparent_to_init() - Reparent the calling kernel thread to the init task.
1230      *
1231      * If a kernel thread is launched as a result of a system call, or if
1232      * it ever exits, it should generally reparent itself to init so that
1233      * it is correctly cleaned up on exit.
1234      *
1235      * The various task state such as scheduling policy and priority may have
1236      * been inherited fro a user process, so we reset them to sane values here.
1237      *
1238      * NOTE that reparent_to_init() gives the caller full capabilities.
1239      */
1240     void reparent_to_init(void)
1241     {
1242     	struct task_struct *this_task = current;
1243     
1244     	write_lock_irq(&tasklist_lock);
1245     
1246     	/* Reparent to init */
1247     	REMOVE_LINKS(this_task);
1248     	this_task->p_pptr = child_reaper;
1249     	this_task->p_opptr = child_reaper;
1250     	SET_LINKS(this_task);
1251     
1252     	/* Set the exit signal to SIGCHLD so we signal init on exit */
1253     	if (this_task->exit_signal != 0) {
1254     		printk(KERN_ERR "task `%s' exit_signal %d in "
1255     				__FUNCTION__ "\n",
1256     			this_task->comm, this_task->exit_signal);
1257     	}
1258     	this_task->exit_signal = SIGCHLD;
1259     
1260     	/* We also take the runqueue_lock while altering task fields
1261     	 * which affect scheduling decisions */
1262     	spin_lock(&runqueue_lock);
1263     
1264     	this_task->ptrace = 0;
1265     	this_task->nice = DEF_NICE;
1266     	this_task->policy = SCHED_OTHER;
1267     	/* cpus_allowed? */
1268     	/* rt_priority? */
1269     	/* signals? */
1270     	this_task->cap_effective = CAP_INIT_EFF_SET;
1271     	this_task->cap_inheritable = CAP_INIT_INH_SET;
1272     	this_task->cap_permitted = CAP_FULL_SET;
1273     	this_task->keep_capabilities = 0;
1274     	memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
1275     	this_task->user = INIT_USER;
1276     
1277     	spin_unlock(&runqueue_lock);
1278     	write_unlock_irq(&tasklist_lock);
1279     }
1280     
1281     /*
1282      *	Put all the gunge required to become a kernel thread without
1283      *	attached user resources in one place where it belongs.
1284      */
1285     
1286     void daemonize(void)
1287     {
1288     	struct fs_struct *fs;
1289     
1290     
1291     	/*
1292     	 * If we were started as result of loading a module, close all of the
1293     	 * user space pages.  We don't need them, and if we didn't close them
1294     	 * they would be locked into memory.
1295     	 */
1296     	exit_mm(current);
1297     
1298     	current->session = 1;
1299     	current->pgrp = 1;
1300     
1301     	/* Become as one with the init task */
1302     
1303     	exit_fs(current);	/* current->fs->count--; */
1304     	fs = init_task.fs;
1305     	current->fs = fs;
1306     	atomic_inc(&fs->count);
1307      	exit_files(current);
1308     	current->files = init_task.files;
1309     	atomic_inc(&current->files->count);
1310     }
1311     
1312     void __init init_idle(void)
1313     {
1314     	struct schedule_data * sched_data;
1315     	sched_data = &aligned_data[smp_processor_id()].schedule_data;
1316     
1317     	if (current != &init_task && task_on_runqueue(current)) {
1318     		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
1319     			smp_processor_id(), current->pid);
1320     		del_from_runqueue(current);
1321     	}
1322     	sched_data->curr = current;
1323     	sched_data->last_schedule = get_cycles();
1324     }
1325     
1326     extern void init_timervecs (void);
1327     
1328     void __init sched_init(void)
1329     {
1330     	/*
1331     	 * We have to do a little magic to get the first
1332     	 * process right in SMP mode.
1333     	 */
1334     	int cpu = smp_processor_id();
1335     	int nr;
1336     
1337     	init_task.processor = cpu;
1338     
1339     	for(nr = 0; nr < PIDHASH_SZ; nr++)
1340     		pidhash[nr] = NULL;
1341     
1342     	init_timervecs();
1343     
1344     	init_bh(TIMER_BH, timer_bh);
1345     	init_bh(TQUEUE_BH, tqueue_bh);
1346     	init_bh(IMMEDIATE_BH, immediate_bh);
1347     
1348     	/*
1349     	 * The boot idle thread does lazy MMU switching as well:
1350     	 */
1351     	atomic_inc(&init_mm.mm_count);
1352     	enter_lazy_tlb(&init_mm, current, cpu);
1353     }
1354