diff -Nru linux-2.5.2-pre9.vanilla/Makefile linux-2.5.2-pre9.xs2/Makefile
--- linux-2.5.2-pre9.vanilla/Makefile	Sat Jan  5 19:46:25 2002
+++ linux-2.5.2-pre9.xs2/Makefile	Sun Jan  6 15:04:22 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 5
 SUBLEVEL = 2
-EXTRAVERSION =-pre9
+EXTRAVERSION = -pre9-xs2
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -Nru linux-2.5.2-pre9.vanilla/arch/i386/kernel/process.c linux-2.5.2-pre9.xs2/arch/i386/kernel/process.c
--- linux-2.5.2-pre9.vanilla/arch/i386/kernel/process.c	Sat Jan  5 19:46:25 2002
+++ linux-2.5.2-pre9.xs2/arch/i386/kernel/process.c	Sun Jan  6 15:04:05 2002
@@ -122,10 +122,6 @@
  */
 void cpu_idle (void)
 {
-	/* endless idle loop with no priority at all */
-	init_idle();
-	current->nice = 20;
-
 	while (1) {
 		void (*idle)(void) = pm_idle;
 		if (!idle)
diff -Nru linux-2.5.2-pre9.vanilla/arch/i386/kernel/smpboot.c linux-2.5.2-pre9.xs2/arch/i386/kernel/smpboot.c
--- linux-2.5.2-pre9.vanilla/arch/i386/kernel/smpboot.c	Sat Jan  5 19:46:25 2002
+++ linux-2.5.2-pre9.xs2/arch/i386/kernel/smpboot.c	Sun Jan  6 15:04:05 2002
@@ -471,6 +471,7 @@
 	 */
 	local_flush_tlb();
 
+	init_idle();
 	return cpu_idle();
 }
 
@@ -803,15 +804,13 @@
 	if (!idle)
 		panic("No idle process for CPU %d", cpu);
 
-	idle->processor = cpu;
-	idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
-
 	map_cpu_to_boot_apicid(cpu, apicid);
 
-	idle->thread.eip = (unsigned long) start_secondary;
-
 	del_from_runqueue(idle);
 	unhash_process(idle);
+	idle->processor = cpu;
+	idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
+	idle->thread.eip = (unsigned long) start_secondary;
 	init_tasks[cpu] = idle;
 
 	/* start_eip had better be page-aligned! */
diff -Nru linux-2.5.2-pre9.vanilla/include/linux/sched.h linux-2.5.2-pre9.xs2/include/linux/sched.h
--- linux-2.5.2-pre9.vanilla/include/linux/sched.h	Sat Jan  5 19:46:31 2002
+++ linux-2.5.2-pre9.xs2/include/linux/sched.h	Sun Jan  6 15:13:35 2002
@@ -15,6 +15,7 @@
 #include <linux/rbtree.h>
 
 #include <asm/system.h>
+#include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -72,7 +73,11 @@
 #define CT_TO_SECS(x)	((x) / HZ)
 #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
 
-extern int nr_running, nr_threads;
+extern int nr_task_running(void);
+
+#define nr_running	nr_task_running()
+
+extern int nr_threads;
 extern int last_pid;
 
 #include <linux/fs.h>
@@ -121,6 +126,7 @@
  * yield the CPU for one re-schedule..
  */
 #define SCHED_YIELD		0x10
+#define SCHED_RTLOCAL	0x20
 
 struct sched_param {
 	int sched_priority;
@@ -139,9 +145,9 @@
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
 extern spinlock_t mmlist_lock;
 
+extern void sched_cpudmap_init(void);
 extern void sched_init(void);
 extern void init_idle(void);
 extern void show_state(void);
@@ -321,9 +327,11 @@
 	 * that's just fine.)
 	 */
 	struct list_head run_list;
+	int task_qid;
 	long time_slice;
 	/* recalculation loop checkpoint */
 	unsigned long rcl_last;
+	unsigned long run_jtime;
 
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
@@ -407,6 +415,9 @@
 	int (*notifier)(void *priv);
 	void *notifier_data;
 	sigset_t *notifier_mask;
+
+/* per cpu proc list */
+	struct list_head proclist_cpu;
 	
 /* Thread group tracking */
    	u32 parent_exec_id;
@@ -482,11 +493,13 @@
     active_mm:		&init_mm,					\
     cpus_runnable:	-1,						\
     cpus_allowed:	-1,						\
+    processor:		0,					\
     run_list:		{ NULL, NULL },			\
     rcl_last:		0,					\
     time_slice:		DEF_TSLICE,					\
     next_task:		&tsk,						\
     prev_task:		&tsk,						\
+    pid:		0,					\
     p_opptr:		&tsk,						\
     p_pptr:		&tsk,						\
     thread_group:	LIST_HEAD_INIT(tsk.thread_group),		\
@@ -510,6 +523,9 @@
     blocked:		{{0}},						\
     alloc_lock:		SPIN_LOCK_UNLOCKED,				\
     journal_info:	NULL,						\
+	task_qid:       0,                      \
+	proclist_cpu:   LIST_HEAD_INIT(tsk.proclist_cpu),           \
+	run_jtime:		0,						\
 }
 
 
@@ -802,6 +818,16 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+extern void del_from_runqueue(struct task_struct * p);
+extern void add_to_proclist(struct task_struct * p);
+extern void del_from_proclist(struct task_struct * p);
+extern void sched_wake_idle(void);
+extern int move_to_cpu(struct task_struct * p, int cpu, int stick);
+extern int task_cpu_place(struct task_struct *p);
+extern void runqueue_spin_lock(struct task_struct * p);
+extern void runqueue_spin_unlock(struct task_struct * p);
+
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	wait_queue_t __wait;						\
@@ -855,6 +881,7 @@
 })
 
 #define REMOVE_LINKS(p) do { \
+	del_from_proclist(p); \
 	(p)->next_task->prev_task = (p)->prev_task; \
 	(p)->prev_task->next_task = (p)->next_task; \
 	if ((p)->p_osptr) \
@@ -866,6 +893,7 @@
 	} while (0)
 
 #define SET_LINKS(p) do { \
+	add_to_proclist(p); \
 	(p)->next_task = &init_task; \
 	(p)->prev_task = init_task.prev_task; \
 	init_task.prev_task->next_task = (p); \
@@ -882,16 +910,19 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
+static inline int task_on_runqueue(struct task_struct *p)
+{
+	return (p->run_list.next != NULL);
+}
+
+static inline int task_on_proclist(struct task_struct *p)
 {
-	nr_running--;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
+	return (p->proclist_cpu.next != NULL);
 }
 
-static inline int task_on_runqueue(struct task_struct *p)
+static inline int task_realtime(struct task_struct *p)
 {
-	return (p->run_list.next != NULL);
+	return ((p->policy & ~SCHED_YIELD) != SCHED_OTHER);
 }
 
 static inline void unhash_process(struct task_struct *p)
diff -Nru linux-2.5.2-pre9.vanilla/init/main.c linux-2.5.2-pre9.xs2/init/main.c
--- linux-2.5.2-pre9.vanilla/init/main.c	Fri Dec  7 16:24:52 2001
+++ linux-2.5.2-pre9.xs2/init/main.c	Sun Jan  6 15:04:05 2002
@@ -316,14 +316,6 @@
 
 	smp_threads_ready=1;
 	smp_commence();
-
-	/* Wait for the other cpus to set up their idle processes */
-	printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
-	while (wait_init_idle) {
-		cpu_relax();
-		barrier();
-	}
-	printk("All processors have done init_idle\n");
 }
 
 #endif
@@ -337,6 +329,7 @@
 
 static void rest_init(void)
 {
+	init_idle();
 	kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 	unlock_kernel();
 	current->need_resched = 1;
@@ -427,6 +420,10 @@
 	 *	make syscalls (and thus be locked).
 	 */
 	smp_init();
+	/*
+	 * after smp initialization we can finally setup the cpu distance map
+	 */
+	sched_cpudmap_init();
 	rest_init();
 }
 
diff -Nru linux-2.5.2-pre9.vanilla/kernel/fork.c linux-2.5.2-pre9.xs2/kernel/fork.c
--- linux-2.5.2-pre9.vanilla/kernel/fork.c	Sat Jan  5 19:46:31 2002
+++ linux-2.5.2-pre9.xs2/kernel/fork.c	Sun Jan  6 15:04:05 2002
@@ -22,6 +22,7 @@
 #include <linux/namespace.h>
 #include <linux/personality.h>
 
+#include <asm/atomic.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -29,7 +30,6 @@
 
 /* The idle threads do not count.. */
 int nr_threads;
-int nr_running;
 
 int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
@@ -620,6 +620,9 @@
 	p->run_list.next = NULL;
 	p->run_list.prev = NULL;
 
+	p->proclist_cpu.next = NULL;
+	p->proclist_cpu.prev = NULL;
+
 	p->p_cptr = NULL;
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
@@ -645,7 +648,11 @@
 	{
 		int i;
 		p->cpus_runnable = ~0UL;
-		p->processor = current->processor;
+		/*
+		 * if it's a real time task we leave it on the same processor/task_qid
+		 */
+		if (!task_realtime(p) && !(clone_flags & CLONE_PID))
+			task_cpu_place(p);
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < smp_num_cpus; i++)
 			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
diff -Nru linux-2.5.2-pre9.vanilla/kernel/ksyms.c linux-2.5.2-pre9.xs2/kernel/ksyms.c
--- linux-2.5.2-pre9.vanilla/kernel/ksyms.c	Sat Jan  5 19:46:31 2002
+++ linux-2.5.2-pre9.xs2/kernel/ksyms.c	Sun Jan  6 15:04:05 2002
@@ -451,7 +451,6 @@
 #endif
 
 EXPORT_SYMBOL(kstat);
-EXPORT_SYMBOL(nr_running);
 
 /* misc */
 EXPORT_SYMBOL(panic);
diff -Nru linux-2.5.2-pre9.vanilla/kernel/sched.c linux-2.5.2-pre9.xs2/kernel/sched.c
--- linux-2.5.2-pre9.vanilla/kernel/sched.c	Sat Jan  5 19:46:31 2002
+++ linux-2.5.2-pre9.xs2/kernel/sched.c	Mon Jan  7 11:03:24 2002
@@ -72,54 +72,159 @@
 /*
  * The tasklist_lock protects the linked list of processes.
  *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
+ * This is the lock order :
+ * 1) tasklist_lock
+ * 2) RT_QID
+ * 3) lock(0)
+ * ...
+ * M) lock(N)
  *
- * If both locks are to be concurrently held, the runqueue_lock
- * nests inside the tasklist_lock.
+ * This does not mean that if a lock(3) is needed we've to lock the
+ * whole chain down to lock(3) but it states that if RT_QID and lock(3)
+ * are needed, RT_QID must be locked first.
+ * The lock patterns are tasklist_lock+lock(i) when the task is moved to
+ * a different runqueue, RT_QID+lock(i) inside reschedule_idle() of a global
+ * RT task ( only if the best CPU of the RT task is running another RT task ),
+ * RT_QID+lock(0)+..+lock(N) inside reschedule_idle() of a global RT task
+ * ( when all CPUs are running RT tasks ).
+ * No other patterns are used; in particular lock(i)+lock(j) is never taken.
  *
  * task->alloc_lock nests inside tasklist_lock.
  */
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
+/*
+ * this is the distance map ( move cost ) between cpus.
+ * the move cost from cpu I to cpu J is : cpus_dmap[I][J]
+ * this value can be seen as the number of milliseconds we can
+ * tolerate to have an idle cpu before grabbing a remote task
+ * to run on the idle cpu
+ */
+#define DEF_CPU_DIST_MS	10
+#define FAR_CPU_DIST_MS	20
+#define MS_TO_DIST(t)	(((t) * HZ) / 1000)
+#define DEF_CPU_DIST	MS_TO_DIST(DEF_CPU_DIST_MS)
+#define FAR_CPU_DIST	MS_TO_DIST(FAR_CPU_DIST_MS)
+
+#define cpu_distance(i, j)	((unsigned int) cpus_dmap[i][j])
+
+/*
+ * this is a bonus that we give to cpus that have previously run
+ * an affine mm struct. the bonus value is in milliseconds
+ */
+#define MOVE_MM_BONUS_MS	20
+#define MOVE_MM_BONUS	MS_TO_DIST(MOVE_MM_BONUS_MS)
 
-static unsigned long rcl_curr;
+/*
+ * this is the cpu distance map that should be compiled by the architecture
+ * dependent code or by the common code using a provided abstract topology
+ * interface
+ */
+unsigned char cpus_dmap[NR_CPUS][NR_CPUS];
+
+/*
+ * this is the minimum run queue length that triggers balancing decisions
+ */
+int min_mov_rqlen = 2;
+
+/*
+ * this is the weight ( in milliseconds ) that a remote process has and is
+ * used together with the cpu distance ( metric ) map to build a uniform
+ * cost of move
+ */
+int mvtsk_cost = DEF_CPU_DIST_MS / 2 - 1;
+
+/*
+ * this is used for global real time tasks checkpointing. to avoid the global
+ * real time task selection every time that there's a global real time task
+ * running, this variable is incremented at every global real time task wakeup
+ * and when the first global real time task queue pickup fails for a cpu, its
+ * cpu-local variable is aligned to this one avoiding subsequent failing list
+ * lookup.
+ */
+static volatile unsigned long grt_chkp = 0;
 
 /*
  * We align per-CPU scheduling data on cacheline boundaries,
  * to prevent cacheline ping-pong.
  */
-static union {
-	struct schedule_data {
-		struct task_struct * curr;
-		cycles_t last_schedule;
-	} schedule_data;
-	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+struct cpu_sched_data {
+	int qnr_processes;	/* number of tasks linked on proclist_head */
+	int qnr_running;	/* number of tasks linked on runqueue_head */
+	struct list_head proclist_head;	/* tasks owned by this queue ( task->proclist_cpu ) */
+	struct list_head runqueue_head;	/* runnable tasks of this queue ( task->run_list ) */
+	struct task_struct *curr;	/* task currently running here, see cpu_curr() */
+	unsigned long hit_cpus;	/* bitmask of remote cpus last seen with >= min_mov_rqlen running tasks */
+	unsigned char ldhits[NR_CPUS];	/* per remote cpu count of consecutive such sightings */
+	unsigned long rtt_chkp;	/* presumably the local copy of grt_chkp - its use is outside this chunk, confirm */
+	unsigned long rcl_curr;	/* recalculation counter, its delta is added to dyn_prio on enqueue */
+	spinlock_t runqueue_lock ____cacheline_aligned;	/* taken through rq_lock()/lock_task_rq() */
+};
+
+static struct cpu_sched_data aligned_data[NR_CPUS + 1] __cacheline_aligned;	/* last slot ( RT_QID ) is the global RT queue */
+
+#define RT_QID	NR_CPUS
+#define global_rttask(p)	((p)->task_qid == RT_QID)
+#define task_foreign(p)	(cpu_number_map((p)->processor) != (p)->task_qid)
+#define cpu_next(cpu)	(((cpu) + 1) < smp_num_cpus ? (cpu) + 1: 0)
+
+#define cpu_curr(cpu) aligned_data[(cpu)].curr
+#define rcl_curr(cpu) aligned_data[(cpu)].rcl_curr
+#define rtt_chkp(cpu) aligned_data[(cpu)].rtt_chkp
+#define hit_cpus(cpu) aligned_data[(cpu)].hit_cpus
+#define ldhits(cpu, i) aligned_data[(cpu)].ldhits[i]
+#define qnr_processes(cpu) aligned_data[(cpu)].qnr_processes
+#define qnr_running(cpu) aligned_data[(cpu)].qnr_running
+#define proclist_head(cpu) aligned_data[(cpu)].proclist_head
+#define runqueue_head(cpu) aligned_data[(cpu)].runqueue_head
+#define runqueue_lock(cpu) aligned_data[(cpu)].runqueue_lock
+
+
+#define rq_lock(p)	lock_task_rq(p)
+#define rq_unlock(p)	spin_unlock(&runqueue_lock((p)->task_qid))
+#define rq_lock_irq(p)	do { local_irq_disable(); lock_task_rq(p); } while (0)
+#define rq_unlock_irq(p)	do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_enable(); } while (0)
+#define rq_lock_irqsave(p, f)	do { local_irq_save(f); lock_task_rq(p); } while (0)
+#define rq_unlock_irqrestore(p, f)	do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_restore(f); } while (0)
 
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
 
+
 #ifdef CONFIG_SMP
 
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
-#define can_schedule(p,cpu) \
-	((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
+#define can_schedule(p, cpu)	/* cpu must be set in both cpus_runnable and cpus_allowed */ \
+	((p)->cpus_runnable & (p)->cpus_allowed & (1UL << (cpu)))
+#define can_move(p, cpu)	/* cpus_runnable all ones ( the value fork assigns ) and cpu allowed */ \
+	((p)->cpus_runnable == ~0UL && ((p)->cpus_allowed & (1UL << (cpu))))
+#define run_allowed(p, cpu)	((p)->cpus_allowed & (1UL << (cpu)))	/* 1UL: the masks are unsigned long */
 
 #else
 
 #define idle_task(cpu) (&init_task)
-#define can_schedule(p,cpu) (1)
+#define can_schedule(p, cpu) (1)
+#define can_move(p, cpu) (1)
+#define run_allowed(p, cpu)	(1)
 
 #endif
 
+
 void scheduling_functions_start_here(void) { }
 
+static inline void lock_task_rq(struct task_struct *p)	/* lock the runqueue that p->task_qid names */
+{
+	int rqn = p->task_qid;	/* unlocked snapshot, validated below */
+
+	spin_lock(&runqueue_lock(rqn));
+	while (p->task_qid != rqn) {	/* task_qid changed before we got the lock */
+		spin_unlock(&runqueue_lock(rqn));
+		rqn = p->task_qid;	/* retry with the queue the task names now */
+		spin_lock(&runqueue_lock(rqn));
+	}
+}
+
 /*
  * This is the function that decides how desirable a process is..
  * You can weigh different processes against each other depending
@@ -134,7 +239,7 @@
  *	 +1000: realtime process, select this.
  */
 
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+static inline int goodness(struct task_struct * p, struct mm_struct *this_mm)
 {
 	int weight;
 
@@ -152,24 +257,13 @@
 	 */
 	if (p->policy == SCHED_OTHER) {
 		/*
-		 * Give the process a first-approximation goodness value
-		 * according to the number of clock-ticks it has left.
-		 *
-		 * Don't do any other calculations if the time slice is
-		 * over..
+		 * if the task is expired return a zero goodness ...
 		 */
 		if (!p->time_slice)
 			return 0;
 
 		weight = p->dyn_prio + 1;
 
-#ifdef CONFIG_SMP
-		/* Give a largish advantage to the same processor...   */
-		/* (this is equivalent to penalizing other processors) */
-		if (p->processor == this_cpu)
-			weight += PROC_CHANGE_PENALTY;
-#endif
-
 		/* .. and a slight advantage to the current MM */
 		if (p->mm == this_mm || !p->mm)
 			weight += MM_AFFINITY_BONUS;
@@ -187,158 +281,468 @@
 	return weight;
 }
 
+static inline int rt_goodness(struct task_struct * p)
+{
+	return p->policy & SCHED_YIELD ? -1: 1000 + p->rt_priority;
+}
+
 /*
  * the 'goodness value' of replacing a process on a given CPU.
  * positive value means 'replace', zero or negative means 'dont'.
  */
-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p)
+{
+	return goodness(p, prev->active_mm) - goodness(prev, prev->active_mm);
+}
+
+#ifdef CONFIG_SMP
+
+static inline void lock_queues(void)
+{
+	int cpu;
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
+		spin_lock(&runqueue_lock(cpu));
+}
+
+static inline void unlock_queues(void)
 {
-	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+	int cpu;
+	for (cpu = smp_num_cpus - 1; cpu >= 0; cpu--)
+		spin_unlock(&runqueue_lock(cpu));
 }
 
 /*
- * This is ugly, but reschedule_idle() is very timing-critical.
- * We are called with the runqueue spinlock held and we must
- * not claim the tasklist_lock.
+ * this is used to try to find a place to run the global rt task.
+ * it's called with the RT_QID lock held and with local irq disabled.
  */
-static FASTCALL(void reschedule_idle(struct task_struct * p));
-
-static void reschedule_idle(struct task_struct * p)
+static inline void rtt_reschedule_idle(struct task_struct * p)
 {
-#ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
-
-	/*
-	 * shortcut if the woken up task's last CPU is
-	 * idle now.
-	 */
-	best_cpu = p->processor;
-	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
-send_now_idle:
-			/*
-			 * If need_resched == -1 then we can skip sending
-			 * the IPI altogether, tsk->need_resched is
-			 * actively watched by the idle thread.
-			 */
+	int cpu, best_cpu = cpu_number_map(p->processor),
+		this_cpu = cpu_number_map(smp_processor_id()), need_resched, maxpg = 0, pg;
+	struct task_struct *tsk, *ttsk = NULL;
+
+	/*
+	 * if the best cpu for the global rt task is not currently running
+	 * another rt task, that's the choice.
+	 */
+	if (can_schedule(p, cpu_logical_map(best_cpu))) {
+		spin_lock(&runqueue_lock(best_cpu));
+		tsk = cpu_curr(best_cpu);
+		if (!task_realtime(tsk)) {
 			need_resched = tsk->need_resched;
 			tsk->need_resched = 1;
-			if ((best_cpu != this_cpu) && !need_resched)
-				smp_send_reschedule(best_cpu);
+			if (best_cpu != this_cpu &&
+				(!need_resched || tsk != idle_task(cpu_logical_map(best_cpu))))
+				smp_send_reschedule(cpu_logical_map(best_cpu));
+			spin_unlock(&runqueue_lock(best_cpu));
 			return;
 		}
+		spin_unlock(&runqueue_lock(best_cpu));
 	}
-
 	/*
-	 * We know that the preferred CPU has a cache-affine current
-	 * process, lets try to find a new idle CPU for the woken-up
-	 * process. Select the least recently active idle CPU. (that
-	 * one will have the least active cache context.) Also find
-	 * the executing process which has the least priority.
-	 */
-	oldest_idle = (cycles_t) -1;
-	target_tsk = NULL;
-	max_prio = 0;
-
-	for (i = 0; i < smp_num_cpus; i++) {
-		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
-			continue;
-		tsk = cpu_curr(cpu);
-		/*
-		 * We use the first available idle CPU. This creates
-		 * a priority list between idle CPUs, but this is not
-		 * a problem.
-		 */
-		if (tsk == idle_task(cpu)) {
-#if defined(__i386__) && defined(CONFIG_SMP)
-                        /*
-			 * Check if two siblings are idle in the same
-			 * physical package. Use them if found.
-			 */
-			if (smp_num_siblings == 2) {
-				if (cpu_curr(cpu_sibling_map[cpu]) == 
-			            idle_task(cpu_sibling_map[cpu])) {
-					oldest_idle = last_schedule(cpu);
-					target_tsk = tsk;
-					break;
-				}
-				
-                        }
-#endif		
-			if (last_schedule(cpu) < oldest_idle) {
-				oldest_idle = last_schedule(cpu);
-				target_tsk = tsk;
-			}
-		} else {
-			if (oldest_idle == -1ULL) {
-				int prio = preemption_goodness(tsk, p, cpu);
-
-				if (prio > max_prio) {
-					max_prio = prio;
-					target_tsk = tsk;
-				}
+	 * the best cpu for the global rt task is running another rt task.
+	 * instead of using preemption_goodness() to try to schedule on that cpu
+	 * we try to find a cpu that is not running another rt task.
+	 */
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		if (can_schedule(p, cpu_logical_map(cpu))) {
+			spin_lock(&runqueue_lock(cpu));
+			tsk = cpu_curr(cpu);
+			if (!task_realtime(tsk)) {
+				need_resched = tsk->need_resched;
+				tsk->need_resched = 1;
+				if (cpu != this_cpu &&
+					(!need_resched || tsk != idle_task(cpu_logical_map(cpu))))
+					smp_send_reschedule(cpu_logical_map(cpu));
+				spin_unlock(&runqueue_lock(cpu));
+				return;
 			}
+			spin_unlock(&runqueue_lock(cpu));
 		}
 	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
+	/*
+	 * it's not our lucky day ..., all the cpus are running rt tasks and
+	 * a preemption_goodness() loop is needed to ensure that the global
+	 * priority is respected among rt tasks.
+	 */
+	lock_queues();
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		if (can_schedule(p, cpu_logical_map(cpu))) {
+			tsk = cpu_curr(cpu);
+			if ((pg = preemption_goodness(tsk, p)) > maxpg) {
+				ttsk = tsk;
+				maxpg = pg;
+				if (tsk == idle_task(cpu_logical_map(cpu)))
+					break;
+			}
 		}
+	}
+	if (ttsk) {
+		need_resched = ttsk->need_resched;
+		ttsk->need_resched = 1;
+		if (ttsk->processor != smp_processor_id() && !need_resched)
+			smp_send_reschedule(ttsk->processor);
+	}
+	unlock_queues();
+}
+
+static inline void std_reschedule_idle(struct task_struct * p)	/* wakeup path for tasks not on RT_QID */
+{
+	int best_cpu = p->task_qid, this_cpu = cpu_number_map(smp_processor_id());
+	struct task_struct *tsk;
+
+	tsk = cpu_curr(best_cpu);	/* whatever runs now on the cpu of p's queue */
+	if (tsk == idle_task(cpu_logical_map(best_cpu))) {
+		/*
+		 * If need_resched == -1 then we can skip sending
+		 * the IPI altogether, tsk->need_resched is
+		 * actively watched by the idle thread.
+		 */
+		int need_resched = tsk->need_resched;
+		tsk->need_resched = 1;
+		if ((best_cpu != this_cpu) && !need_resched)
+			smp_send_reschedule(cpu_logical_map(best_cpu));
+	} else if (tsk != p && preemption_goodness(tsk, p) > 0) {
 		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+		if (best_cpu != this_cpu)	/* tsk runs on best_cpu; tsk->task_qid may differ ( eg: RT_QID ) */
+			smp_send_reschedule(cpu_logical_map(best_cpu));
 	}
-	return;
+}
 		
+#endif	/* #ifdef CONFIG_SMP */
+
+/*
+ * This is ugly, but reschedule_idle() is very timing-critical.
+ * We are called with the runqueue spinlock held and we must
+ * not claim the tasklist_lock.
+ */
+static FASTCALL(void reschedule_idle(struct task_struct * p));
 
-#else /* UP */
-	int this_cpu = smp_processor_id();
+static void reschedule_idle(struct task_struct * p)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * it's better to fork the path here instead of having complex if()s
+	 * inside the function itself. rt tasks really have different wakeup
+	 * methods compared with local cpu ones
+	 */
+	if (!global_rttask(p))
+		std_reschedule_idle(p);
+	else
+		rtt_reschedule_idle(p);
+
+#else	/* #ifdef CONFIG_SMP */
 	struct task_struct *tsk;
 
-	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
+	tsk = cpu_curr(smp_processor_id());
+	if (preemption_goodness(tsk, p) > 0)
 		tsk->need_resched = 1;
-#endif
+#endif	/* #ifdef CONFIG_SMP */
+}
+
+int nr_task_running(void)	/* sum of qnr_running over every queue, no lock taken */
+{
+	int total = 0, qid;
+
+	for (qid = smp_num_cpus - 1; qid >= 0; qid--)
+		total += qnr_running(qid);	/* the per cpu queues */
+	return total + qnr_running(RT_QID);	/* plus the global RT queue */
+}
 
 /*
- * Careful!
- *
- * This has to add the process to the _beginning_ of the
- * run-queue, not the end. See the comment about "This is
- * subtle" in the scheduler proper..
+ * if it's a standard task its priority bonus is calculated and merged to
+ * its dynamic priority. for global real time tasks the checkpoint counter
+ * is incremented to force cpu's schedulers to try a global rt queue lookup
  */
-static inline void add_to_runqueue(struct task_struct * p)
+static inline void __add_to_runqueue(struct task_struct * p, int task_qid)
 {
-	p->dyn_prio += rcl_curr - p->rcl_last;
-	p->rcl_last = rcl_curr;
-	if (p->dyn_prio > MAX_DYNPRIO)
-		p->dyn_prio = MAX_DYNPRIO;
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	if (task_qid != RT_QID) {
+		p->dyn_prio += rcl_curr(task_qid) - p->rcl_last;
+		p->rcl_last = rcl_curr(task_qid);
+		if (p->dyn_prio > MAX_DYNPRIO) p->dyn_prio = MAX_DYNPRIO;
+	} else
+		grt_chkp++;
+	list_add(&p->run_list, &runqueue_head(task_qid));
+	qnr_running(task_qid)++;
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+static inline void __del_from_runqueue(struct task_struct * p, int task_qid)
 {
+	qnr_running(task_qid)--;
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	p->run_list.next = NULL;
+	p->rcl_last = rcl_curr(task_qid);
+}
+
+void del_from_runqueue(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__del_from_runqueue(p, p->task_qid);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __add_to_proclist(struct task_struct * p, int task_qid)
+{
+	list_add(&p->proclist_cpu, &proclist_head(task_qid));
+	qnr_processes(task_qid)++;
 }
 
-static inline void move_first_runqueue(struct task_struct * p)
+void add_to_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__add_to_proclist(p, p->task_qid);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __del_from_proclist(struct task_struct * p, int task_qid)
+{
+	list_del(&p->proclist_cpu);
+	qnr_processes(task_qid)--;
+	p->proclist_cpu.next = NULL;
+}
+
+void del_from_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__del_from_proclist(p, p->task_qid);
+	rq_unlock_irqrestore(p, flags);
+}
+
+void runqueue_spin_lock(struct task_struct * p)
+{
+	rq_lock(p);
+}
+
+void runqueue_spin_unlock(struct task_struct * p)
+{
+	rq_unlock(p);
+}
+
+static inline void __move_last_runqueue(struct task_struct * p)
+{
+	list_del(&p->run_list);
+	list_add_tail(&p->run_list, &runqueue_head(p->task_qid));
+}
+
+static inline void __move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &runqueue_head(p->task_qid));
 }
 
 /*
+ * move_to_rqn() must be called with 1) local irq disabled
+ * 2) tasklist_lock write-locked 3) task locked
+ */
+static int move_to_rqn(struct task_struct * p, int rqn, int stick)
+{
+	int task_cpu, onpslist = 0, onrqlist = 0;
+	unsigned long cpus_allowed;
+
+	if (p->task_qid == rqn) {
+		if (stick)
+			p->cpus_allowed = (1 << cpu_logical_map(rqn));
+		return rqn;
+	}
+	if (task_on_runqueue(p))
+		__del_from_runqueue(p, p->task_qid), onrqlist++;
+	if (task_on_proclist(p))
+		__del_from_proclist(p, p->task_qid), onpslist++;
+	cpus_allowed = stick ? (1 << cpu_logical_map(rqn)): p->cpus_allowed;
+	p->cpus_allowed = 0;
+	task_cpu = p->task_qid;
+	p->task_qid = rqn;
+	spin_unlock(&runqueue_lock(task_cpu));
+
+	rq_lock(p);
+	p->rcl_last = rcl_curr(rqn);
+	if (onpslist)
+		__add_to_proclist(p, p->task_qid);
+	if (onrqlist)
+		__add_to_runqueue(p, p->task_qid);
+	p->cpus_allowed = cpus_allowed;
+	return task_cpu;
+}
+
+/*
+ * this is only called by softirq.c::ksoftirqd() and is used to place
+ * ksoftirqd tasks over different cpus.
+ */
+int move_to_cpu(struct task_struct * p, int cpu, int stick)
+{
+#ifdef CONFIG_SMP
+	unsigned long flags;
+
+	write_lock_irqsave(&tasklist_lock, flags);
+	rq_lock(p);
+	move_to_rqn(p, cpu_number_map(cpu), stick);
+	rq_unlock(p);
+	write_unlock_irqrestore(&tasklist_lock, flags);
+	return cpu;
+#else	/* #ifdef CONFIG_SMP */
+	return 0;
+#endif	/* #ifdef CONFIG_SMP */
+}
+
+/*
+ * this function gets called inside kernel/timer.c when the timer
+ * tick hits the idle task. architectures with a huge HZ might
+ * not want to wake up the idle task at every timer tick
+ */
+void sched_wake_idle(void)
+{
+	if (smp_num_cpus > 1)
+		current->need_resched = 1;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * the runtime cpu distance is the sum of the base cpu distance plus the
+ * load on the remote cpu
+ */
+static inline long rt_cpu_dist(int src_cpu, int dst_cpu)
+{
+	return (cpu_distance(src_cpu, dst_cpu) << 4) +
+		(qnr_running(src_cpu) * mvtsk_cost  * (HZ << 4)) / 1000;
+}
+
+/*
+ * try to find the best cpu to run a fresh new process, no locks are held
+ * during this function. it gets called by do_fork() in SMP mode
+ */
+int task_cpu_place(struct task_struct *p)
+{
+	int i, best_cpu, this_cpu = cpu_number_map(smp_processor_id());
+	long cdist, min_cdist;
+
+	best_cpu = this_cpu;	/* NOTE(review): the current cpu is the fallback and is never tested with run_allowed() */
+	min_cdist = rt_cpu_dist(this_cpu, this_cpu);
+	for (i = 0; i < smp_num_cpus; i++) {
+		if (i == this_cpu || !run_allowed(p, cpu_logical_map(i))) continue;
+		if ((cdist = rt_cpu_dist(i, this_cpu)) < min_cdist) {	/* strictly lower wins, ties keep the earlier pick */
+			min_cdist = cdist;
+			best_cpu = i;
+		}
+	}
+	p->rcl_last = rcl_curr(best_cpu);	/* start from the chosen queue's recalculation counter */
+	p->processor = cpu_logical_map(best_cpu);
+	p->task_qid = best_cpu;	/* queue index; p->processor holds cpu_logical_map() of it */
+	return p->processor;
+}
+
+static inline long move_goodness(struct task_struct *p, struct mm_struct *this_mm)
+{
+	long mgds = (long) (jiffies - p->run_jtime);
+
+	if (p->mm == this_mm || !p->mm)
+		mgds += MOVE_MM_BONUS;
+	return mgds;
+}
+
+static inline struct task_struct *try_steal_task(int src_cpu, int dst_cpu)
+{
+	int ldst_cpu = cpu_logical_map(dst_cpu);
+	long mgdns = -1, mvg;
+	struct mm_struct *this_mm = current->active_mm;
+	struct task_struct *tsk, *mvtsk = NULL;
+	struct list_head *head, *tmp;
+
+	spin_lock_irq(&runqueue_lock(src_cpu));
+	head = &runqueue_head(src_cpu);
+	list_for_each(tmp, head) {
+		tsk = list_entry(tmp, struct task_struct, run_list);
+		if (can_move(tsk, ldst_cpu) && !task_foreign(tsk) &&
+			(mvg = move_goodness(tsk, this_mm)) > mgdns) {
+			mvtsk = tsk;
+			mgdns = mvg;
+		}
+	}
+	if (mvtsk) {
+		unsigned long cpus_allowed = mvtsk->cpus_allowed;
+
+		mvtsk->cpus_allowed = 0;
+		__del_from_runqueue(mvtsk, src_cpu);
+		spin_unlock(&runqueue_lock(src_cpu));
+		write_lock(&tasklist_lock);
+		spin_lock(&runqueue_lock(src_cpu));
+		__del_from_proclist(mvtsk, src_cpu);
+		spin_unlock(&runqueue_lock(src_cpu));
+		spin_lock(&runqueue_lock(dst_cpu));
+		mvtsk->rcl_last = rcl_curr(dst_cpu);
+		__add_to_runqueue(mvtsk, dst_cpu);
+		__add_to_proclist(mvtsk, dst_cpu);
+		mvtsk->cpus_allowed = cpus_allowed;
+		mvtsk->task_qid = dst_cpu;
+		spin_unlock(&runqueue_lock(dst_cpu));
+		write_unlock_irq(&tasklist_lock);
+	} else
+		spin_unlock_irq(&runqueue_lock(src_cpu));
+	return mvtsk;
+}
+
+/*
+ * the move cost is the cpu distance minus the run queue load on the remote
+ * ( src ) cpu. both terms are scaled by a factor 16 ( << 4 ) and the cost for
+ * each remote cpu task depends on mvtsk_cost. signed math: load may win
+ */
+static inline long move_cost(int src_cpu, int dst_cpu)
+{
+	return (long) (cpu_distance(src_cpu, dst_cpu) << 4) -
+		(long) (qnr_running(src_cpu) * mvtsk_cost  * (HZ << 4)) / 1000;
+}
+
+static inline struct task_struct *get_remote_task(int this_cpu)	/* returns a task stolen from a loaded remote queue, or NULL */
+{
+	int i, max_cpu;
+	unsigned long hcpus = 0;	/* remote cpus eligible for stealing on this call */
+	long ccost, min_cost;
+	struct task_struct *rtask;
+
+	this_cpu = cpu_number_map(this_cpu);	/* from here on this_cpu indexes aligned_data[] */
+	for (i = 0; i < smp_num_cpus; i++) {
+		if (i == this_cpu) continue;
+		if (qnr_running(i) >= min_mov_rqlen) {
+			if (hit_cpus(this_cpu) & (1UL << i))	/* 1UL: the masks are unsigned long */
+				ldhits(this_cpu, i)++;
+			else {
+				hit_cpus(this_cpu) |= (1UL << i);
+				ldhits(this_cpu, i) = 1;
+			}
+			if (ldhits(this_cpu, i) >= cpu_distance(this_cpu, i))	/* seen loaded often enough for its distance */
+				hcpus |= (1UL << i);
+		} else
+			hit_cpus(this_cpu) &= ~(1UL << i);	/* below the threshold: forget the streak */
+	}
+	while (hcpus) {
+		max_cpu = -1;
+		min_cost = 1000;	/* candidates costing 1000 or more are never picked */
+		for (i = 0; i < smp_num_cpus; i++) {
+			if (!(hcpus & (1UL << i))) continue;
+			if ((ccost = move_cost(i, this_cpu)) < min_cost) {
+				min_cost = ccost;
+				max_cpu = i;
+			}
+		}
+		if (max_cpu < 0) break;
+		if ((rtask = try_steal_task(max_cpu, this_cpu))) {
+			hit_cpus(this_cpu) = 0;	/* restart every streak after a successful steal */
+			return rtask;
+		}
+		hcpus &= ~(1UL << max_cpu);	/* nothing movable there, try the next cheapest */
+	}
+	return NULL;
+}
+
+#endif	/* #ifdef CONFIG_SMP */
+
+/*
  * Wake up a process. Put it on the run-queue if it's not
  * already there.  The "current" process is always on the
  * run-queue (except when the actual re-schedule is in
@@ -354,16 +758,21 @@
 	/*
 	 * We want the common case fall through straight, thus the goto.
 	 */
-	spin_lock_irqsave(&runqueue_lock, flags);
+	rq_lock_irqsave(p, flags);
 	p->state = TASK_RUNNING;
-	if (task_on_runqueue(p))
+	/*
+	 * cpus_allowed is cleared while a task is being moved from one cpu
+	 * to another; testing it here keeps us from queueing the task while
+	 * the mover is switching run queue locks.
+	 */
+	if (task_on_runqueue(p) || !p->cpus_allowed)
 		goto out;
-	add_to_runqueue(p);
-	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
+	__add_to_runqueue(p, p->task_qid);
+	if (!synchronous || p->task_qid != cpu_number_map(smp_processor_id()))
 		reschedule_idle(p);
 	success = 1;
 out:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
+	rq_unlock_irqrestore(p, flags);
 	return success;
 }
 
@@ -487,41 +896,7 @@
 	task_lock(prev);
 	task_release_cpu(prev);
 	mb();
-	if (prev->state == TASK_RUNNING)
-		goto needs_resched;
-
-out_unlock:
 	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
-	return;
-
-	/*
-	 * Slow path - we 'push' the previous process and
-	 * reschedule_idle() will attempt to find a new
-	 * processor for it. (but it might preempt the
-	 * current process as well.) We must take the runqueue
-	 * lock and re-check prev->state to be correct. It might
-	 * still happen that this process has a preemption
-	 * 'in progress' already - but this is not a problem and
-	 * might happen in other circumstances as well.
-	 */
-needs_resched:
-	{
-		unsigned long flags;
-
-		/*
-		 * Avoid taking the runqueue lock in cases where
-		 * no preemption-check is necessery:
-		 */
-		if ((prev == idle_task(smp_processor_id())) ||
-						(policy & SCHED_YIELD))
-			goto out_unlock;
-
-		spin_lock_irqsave(&runqueue_lock, flags);
-		if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
-			reschedule_idle(prev);
-		spin_unlock_irqrestore(&runqueue_lock, flags);
-		goto out_unlock;
-	}
 #else
 	prev->policy &= ~SCHED_YIELD;
 #endif /* CONFIG_SMP */
@@ -534,62 +909,83 @@
 
 void expire_task(struct task_struct *p)
 {
-	if (unlikely(!p->time_slice))
-		goto need_resched;
-
-	if (!--p->time_slice) {
+	if (--p->time_slice <= 0) {	/* slice just ran out ( or was already 0 ) */
 		if (p->dyn_prio)
 			p->dyn_prio--;
-need_resched:
+		p->time_slice = 0;	/* undo a decrement below zero */
 		p->need_resched = 1;
 	}
 }
 
-/*
- *  'schedule()' is the scheduler function. It's a very simple and nice
- * scheduler: it's not perfect, but certainly works for most things.
- *
- * The goto is "interesting".
- *
- *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
- * tasks can run. It can not be killed, and it cannot sleep. The 'state'
- * information in task[0] is never used.
- */
-asmlinkage void schedule(void)
+static inline void set_task_running(struct task_struct *p, int cpu)
 {
-	struct schedule_data * sched_data;
-	struct task_struct *prev, *next, *p;
-	struct list_head *tmp;
-	int this_cpu, c;
+#ifdef CONFIG_SMP
+	if (p != idle_task(cpu) && hit_cpus(cpu_number_map(cpu)))
+		hit_cpus(cpu_number_map(cpu)) = 0;	/* a non-idle task runs here: clear the hit bitmap */
+	p->run_jtime = jiffies;	/* jiffies value at the time p was selected */
+#endif	/* #ifdef CONFIG_SMP */
+	cpu_curr(cpu_number_map(cpu)) = p;	/* record p as the task current on this cpu */
+	task_set_cpu(p, cpu);
+}
 
+static inline void switch_tasks(struct task_struct *prev, struct task_struct *next)
+{
+	/*
+	 * there are 3 processes which are affected by a context switch:
+	 *
+	 * prev == .... ==> (last => next)
+	 *
+	 * It's the 'much more previous' 'prev' that is on next's stack,
+	 * but prev is set to (the just run) 'last' process by switch_to().
+	 * This might sound slightly confusing but makes tons of sense.
+	 */
+	prepare_to_switch();
+	{
+		struct mm_struct *mm = next->mm;
+		struct mm_struct *oldmm = prev->active_mm;
+		if (!mm) {	/* next has no mm of its own: it borrows prev's active_mm */
+			if (next->active_mm) BUG();
+			next->active_mm = oldmm;
+			atomic_inc(&oldmm->mm_count);	/* take a reference for the borrowed mm */
+			enter_lazy_tlb(oldmm, next, prev->processor);
+		} else {
+			if (next->active_mm != mm) BUG();
+			switch_mm(oldmm, mm, next, prev->processor);
+		}
+
+		if (!prev->mm) {	/* prev was itself borrowing oldmm: give it back */
+			prev->active_mm = NULL;
+			mmdrop(oldmm);
+		}
+	}
+
+	kstat.context_swtch++;
+	switch_to(prev, next, prev);
+	__schedule_tail(prev);
+}
 
-	spin_lock_prefetch(&runqueue_lock);
+asmlinkage void schedule(void)
+{
+	struct task_struct *prev, *next, *p;
+	struct list_head *head, *tmp;
+	int this_cpu, c, weight;
 
-	if (!current->active_mm) BUG();
 need_resched_back:
 	prev = current;
 	this_cpu = prev->processor;
 
-	if (unlikely(in_interrupt())) {
-		printk("Scheduling in interrupt\n");
+	if (unlikely(in_interrupt()))
 		BUG();
-	}
 
 	release_kernel_lock(prev, this_cpu);
 
-	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
-	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
-
-	spin_lock_irq(&runqueue_lock);
+	rq_lock_irq(prev);
 
 	/* move an exhausted RR process to be last.. */
 	if (unlikely(prev->policy == SCHED_RR))
 		if (!prev->time_slice) {
 			prev->time_slice = TASK_TIMESLICE(prev);
-			move_last_runqueue(prev);
+			__move_last_runqueue(prev);
 		}
 
 	switch (prev->state) {
@@ -599,49 +995,55 @@
 			break;
 		}
 	default:
-		del_from_runqueue(prev);
+		if (task_on_runqueue(prev))
+			__del_from_runqueue(prev, prev->task_qid);
 	case TASK_RUNNING:;
 	}
 	prev->need_resched = 0;
 
 	/*
-	 * this is the scheduler proper:
+	 * check the global rt queue first without holding its lock and, if it's
+	 * not empty, try to pick up an rt task first. the rt selection code is
+	 * kept out of line ( see rt_queue_select ) despite the "unlikely" hint.
 	 */
+	if (grt_chkp != rtt_chkp(cpu_number_map(this_cpu)) &&
+		!list_empty(&runqueue_head(RT_QID)))
+		goto rt_queue_select;
 
-repeat_schedule:
 	/*
-	 * Default process to select..
+	 * this is true for running tasks moved with move_to_rqn() ( the first time
+	 * they call schedule() ) and for global RT tasks.
 	 */
+	if (unlikely(task_foreign(prev))) {
+		rq_unlock(prev);
+		spin_lock(&runqueue_lock(cpu_number_map(this_cpu)));
+	}
+
+repeat_schedule:
 	next = idle_task(this_cpu);
 	c = -1000;
-	list_for_each(tmp, &runqueue_head) {
+	head = &runqueue_head(cpu_number_map(this_cpu));
+	list_for_each(tmp, head) {
 		p = list_entry(tmp, struct task_struct, run_list);
-		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
-			if (weight > c)
-				c = weight, next = p;
-		}
+		if ((weight = goodness(p, prev->active_mm)) > c)
+			c = weight, next = p;
 	}
 
-	/* Do we need to re-calculate counters? */
+	/* Do we need to re-calculate time slices? */
 	if (unlikely(!c)) {
-		rcl_curr++;
-		list_for_each(tmp, &runqueue_head) {
+		rcl_curr(cpu_number_map(this_cpu))++;
+		head = &runqueue_head(cpu_number_map(this_cpu));
+		list_for_each(tmp, head) {
 			p = list_entry(tmp, struct task_struct, run_list);
 			p->time_slice = TASK_TIMESLICE(p);
-			p->rcl_last = rcl_curr;
 		}
 		goto repeat_schedule;
 	}
 
-	/*
-	 * from this point on nothing can prevent us from
-	 * switching to the next task, save this fact in
-	 * sched_data.
-	 */
-	sched_data->curr = next;
-	task_set_cpu(next, this_cpu);
-	spin_unlock_irq(&runqueue_lock);
+	set_task_running(next, this_cpu);
+	spin_unlock_irq(&runqueue_lock(cpu_number_map(this_cpu)));
+
+rt_task_selected:
 
 	if (unlikely(prev == next)) {
 		/* We won't go through the normal tail, so do this by hand */
@@ -649,66 +1051,59 @@
 		goto same_process;
 	}
 
-#ifdef CONFIG_SMP
- 	/*
- 	 * maintain the per-process 'last schedule' value.
- 	 * (this has to be recalculated even if we reschedule to
- 	 * the same process) Currently this is only used on SMP,
-	 * and it's approximate, so we do not have to maintain
-	 * it while holding the runqueue spinlock.
- 	 */
- 	sched_data->last_schedule = get_cycles();
+	switch_tasks(prev, next);
 
-	/*
-	 * We drop the scheduler lock early (it's a global spinlock),
-	 * thus we have to lock the previous process from getting
-	 * rescheduled during switch_to().
-	 */
+same_process:
+	reacquire_kernel_lock(current);
 
-#endif /* CONFIG_SMP */
+#ifdef CONFIG_SMP
+	if (unlikely(current == idle_task(this_cpu)))
+		if (get_remote_task(this_cpu))
+			goto need_resched_back;
+#endif	/* #ifdef CONFIG_SMP */
 
-	kstat.context_swtch++;
-	/*
-	 * there are 3 processes which are affected by a context switch:
-	 *
-	 * prev == .... ==> (last => next)
-	 *
-	 * It's the 'much more previous' 'prev' that is on next's stack,
-	 * but prev is set to (the just run) 'last' process by switch_to().
-	 * This might sound slightly confusing but makes tons of sense.
-	 */
-	prepare_to_switch();
-	{
-		struct mm_struct *mm = next->mm;
-		struct mm_struct *oldmm = prev->active_mm;
-		if (!mm) {
-			if (next->active_mm) BUG();
-			next->active_mm = oldmm;
-			atomic_inc(&oldmm->mm_count);
-			enter_lazy_tlb(oldmm, next, this_cpu);
-		} else {
-			if (next->active_mm != mm) BUG();
-			switch_mm(oldmm, mm, next, this_cpu);
-		}
+	if (current->need_resched)
+		goto need_resched_back;
+	return;
 
-		if (!prev->mm) {
-			prev->active_mm = NULL;
-			mmdrop(oldmm);
+rt_queue_select:
+	/*
+	 * the fast lockless check reported that there might be a task to
+	 * pick up in the global rt queue, so we try here. this section is
+	 * entered with prev's run queue locked. if "prev" is not a global rt
+	 * task, that lock is dropped and the RT_QID lock is acquired instead.
+	 */
+	if (!global_rttask(prev)) {
+		rq_unlock(prev);
+		spin_lock(&runqueue_lock(RT_QID));
+	}
+	c = 0;
+	head = &runqueue_head(RT_QID);
+	list_for_each(tmp, head) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		if (can_schedule(p, this_cpu)) {
+			if ((weight = rt_goodness(p)) > c)
+				c = weight, next = p;
 		}
 	}
-
+	if (!c) {
+		/*
+		 * the fast test reported a false positive so we go back to
+		 * the local CPU runqueue selection. we also update the checkpoint
+		 * to avoid future false lookups.
+		 */
+		rtt_chkp(cpu_number_map(this_cpu)) = grt_chkp;
+		spin_unlock(&runqueue_lock(RT_QID));
+		spin_lock(&runqueue_lock(cpu_number_map(this_cpu)));
+		goto repeat_schedule;
+	}
 	/*
-	 * This just switches the register state and the
-	 * stack.
+	 * the global rt task has been selected and final setup is needed.
 	 */
-	switch_to(prev, next, prev);
-	__schedule_tail(prev);
+	set_task_running(next, this_cpu);
+	spin_unlock_irq(&runqueue_lock(RT_QID));
+	goto rt_task_selected;
 
-same_process:
-	reacquire_kernel_lock(current);
-	if (current->need_resched)
-		goto need_resched_back;
-	return;
 }
 
 /*
@@ -900,9 +1295,9 @@
 static int setscheduler(pid_t pid, int policy, 
 			struct sched_param *param)
 {
+	int retval, grt, pgrt, rqn;
 	struct sched_param lp;
 	struct task_struct *p;
-	int retval;
 
 	retval = -EINVAL;
 	if (!param || pid < 0)
@@ -914,22 +1309,31 @@
 
 	/*
 	 * We play safe to avoid deadlocks.
+	 * We may need the write lock to move the task in/out of the RT_QID
+	 * run queue, so instead of taking a read lock and having to release
+	 * it and write-lock again, we take the write lock directly.
 	 */
-	read_lock_irq(&tasklist_lock);
-	spin_lock(&runqueue_lock);
+	write_lock_irq(&tasklist_lock);
 
 	p = find_process_by_pid(pid);
 
 	retval = -ESRCH;
 	if (!p)
-		goto out_unlock;
+		goto out_unlock_tkll;
 			
-	if (policy < 0)
+	rq_lock(p);
+	if (policy < 0) {
 		policy = p->policy;
-	else {
+		grt = pgrt = global_rttask(p);
+	} else {
+		grt = (policy & SCHED_RTLOCAL) == 0;
+		policy &= ~SCHED_RTLOCAL;
+		grt = grt && (policy == SCHED_FIFO || policy == SCHED_RR);
+		pgrt = global_rttask(p);
+
 		retval = -EINVAL;
 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
-				policy != SCHED_OTHER)
+			policy != SCHED_OTHER)
 			goto out_unlock;
 	}
 	
@@ -954,14 +1358,28 @@
 	retval = 0;
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
-	if (task_on_runqueue(p))
-		move_first_runqueue(p);
+	if (pgrt == grt) {
+		rqn = p->task_qid;
+		if (task_on_runqueue(p))
+			__move_first_runqueue(p);
+	} else {
+		rqn = cpu_number_map(p->processor);
+		move_to_rqn(p, grt ? RT_QID: rqn, 0);
+		if (grt) grt_chkp++;
+	}
 
-	current->need_resched = 1;
+	if (grt || rqn == cpu_number_map(smp_processor_id()))
+		current->need_resched = 1;
+	else {
+#ifdef CONFIG_SMP
+		smp_send_reschedule(cpu_logical_map(rqn));
+#endif	/* #ifdef CONFIG_SMP */
+	}
 
 out_unlock:
-	spin_unlock(&runqueue_lock);
-	read_unlock_irq(&tasklist_lock);
+	rq_unlock(p);
+out_unlock_tkll:
+	write_unlock_irq(&tasklist_lock);
 
 out_nounlock:
 	return retval;
@@ -1031,41 +1449,18 @@
 
 asmlinkage long sys_sched_yield(void)
 {
-	/*
-	 * Trick. sched_yield() first counts the number of truly 
-	 * 'pending' runnable processes, then returns if it's
-	 * only the current processes. (This test does not have
-	 * to be atomic.) In threaded applications this optimization
-	 * gets triggered quite often.
-	 */
-
-	int nr_pending = nr_running;
-
-#if CONFIG_SMP
-	int i;
-
-	// Subtract non-idle processes running on other CPUs.
-	for (i = 0; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
-			nr_pending--;
-	}
-#else
-	// on UP this process is on the runqueue as well
-	nr_pending--;
-#endif
-	if (nr_pending) {
+	if (qnr_running(current->task_qid) > 1) {	/* more than one task counted on our run queue */
 		/*
 		 * This process can only be rescheduled by us,
 		 * so this is safe without any locking.
 		 */
 		if (current->policy == SCHED_OTHER)
 			current->policy |= SCHED_YIELD;
-		current->need_resched = 1;
-
+		local_irq_disable();	/* NOTE(review): presumably keeps expire_task() from racing on these fields - confirm */
 		current->time_slice = 0;
-		if (++current->dyn_prio > MAX_DYNPRIO)
-			current->dyn_prio = MAX_DYNPRIO;
+		if (++current->dyn_prio > MAX_DYNPRIO) current->dyn_prio = MAX_DYNPRIO;
+		local_irq_enable();
+		current->need_resched = 1;
 	}
 	return 0;
 }
@@ -1243,7 +1638,7 @@
 
 	/* We also take the runqueue_lock while altering task fields
 	 * which affect scheduling decisions */
-	spin_lock(&runqueue_lock);
+	rq_lock(current);
 
 	current->ptrace = 0;
 	current->nice = DEF_NICE;
@@ -1258,7 +1653,7 @@
 	memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
 	current->user = INIT_USER;
 
-	spin_unlock(&runqueue_lock);
+	rq_unlock(current);
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -1298,18 +1693,24 @@
 
 void __init init_idle(void)
 {
-	struct schedule_data * sched_data;
-	sched_data = &aligned_data[smp_processor_id()].schedule_data;
-
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
-			   smp_processor_id(), current->pid);
-		del_from_runqueue(current);
+			smp_processor_id(), current->pid);
+		__del_from_runqueue(current, current->task_qid);
 	}
-	current->dyn_prio = 0;
-	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
+	current->task_qid = cpu_number_map(current->processor);	/* queue id derived from our own cpu */
+	current->nice = 20;	/* moved here from cpu_idle() */
+	current->dyn_prio = -100;
+	current->time_slice = 0;
+	cpu_curr(cpu_number_map(smp_processor_id())) = current;	/* the idle task is what runs here now */
 	clear_bit(current->processor, &wait_init_idle);
+
+	/* Wait for the other cpus to set up their idle processes */
+	printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
+	while (wait_init_idle) {	/* spin until every bit has been cleared */
+		cpu_relax();
+		barrier();
+	}
 }
 
 extern void init_timervecs (void);
@@ -1326,19 +1727,55 @@
 	}
 }
 
+void __init sched_cpudmap_init(void)
+{
+	int i, j;
+
+	/*
+	 * this should use ( if provided ) a topology api to set up the
+	 * distance map. right now cpus in the same aligned group of 4 get
+	 * DEF_CPU_DIST and cpus in different groups get FAR_CPU_DIST
+	 */
+	for (i = 0; i < NR_CPUS; i++)
+		for (j = 0; j <= i; j++)	/* one pass fills both [i][j] and [j][i] */
+			if ((i & ~0x03) == (j & ~0x03))
+				cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST;
+			else
+				cpus_dmap[i][j] = cpus_dmap[j][i] = FAR_CPU_DIST;
+
+}
+
 void __init sched_init(void)
 {
 	/*
 	 * We have to do a little magic to get the first
 	 * process right in SMP mode.
 	 */
-	int cpu = smp_processor_id();
-	int nr;
+	int i, j, cpu = smp_processor_id();
+
+	for (i = 0; i <= NR_CPUS; i++) {	/* NOTE(review): <= initialises NR_CPUS + 1 queues; presumably the extra slot is RT_QID - confirm */
+		qnr_processes(i) = 0;
+		qnr_running(i) = 0;
+		cpu_curr(i) = &init_task;
+		rtt_chkp(i) = grt_chkp;
+		hit_cpus(i) = 0;
+		rcl_curr(i) = 0;
+		INIT_LIST_HEAD(&runqueue_head(i));
+		INIT_LIST_HEAD(&proclist_head(i));
+		runqueue_lock(i) = SPIN_LOCK_UNLOCKED;
+	}
+
+	/*
+	 * give a first initialization to the cpu distance map
+	 */
+	for (i = 0; i < NR_CPUS; i++)
+		for (j = 0; j <= i; j++)	/* one pass fills both [i][j] and [j][i] */
+			cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST;
 
 	init_task.processor = cpu;
 
-	for(nr = 0; nr < PIDHASH_SZ; nr++)
-		pidhash[nr] = NULL;
+	for(i = 0; i < PIDHASH_SZ; i++)
+		pidhash[i] = NULL;
 
 	fill_tslice_map();
 
@@ -1354,3 +1791,4 @@
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current, cpu);
 }
+
diff -Nru linux-2.5.2-pre9.vanilla/kernel/signal.c linux-2.5.2-pre9.xs2/kernel/signal.c
--- linux-2.5.2-pre9.vanilla/kernel/signal.c	Sat Jan  5 19:46:31 2002
+++ linux-2.5.2-pre9.xs2/kernel/signal.c	Sun Jan  6 15:04:05 2002
@@ -478,10 +478,10 @@
 	 * process of changing - but no harm is done by that
 	 * other than doing an extra (lightweight) IPI interrupt.
 	 */
-	spin_lock(&runqueue_lock);
+	runqueue_spin_lock(t);
 	if (task_has_cpu(t) && t->processor != smp_processor_id())
 		smp_send_reschedule(t->processor);
-	spin_unlock(&runqueue_lock);
+	runqueue_spin_unlock(t);
 #endif /* CONFIG_SMP */
 
 	if (t->state & TASK_INTERRUPTIBLE) {
diff -Nru linux-2.5.2-pre9.vanilla/kernel/softirq.c linux-2.5.2-pre9.xs2/kernel/softirq.c
--- linux-2.5.2-pre9.vanilla/kernel/softirq.c	Fri Nov 30 15:53:28 2001
+++ linux-2.5.2-pre9.xs2/kernel/softirq.c	Sun Jan  6 15:04:05 2002
@@ -369,7 +369,7 @@
 	sigfillset(&current->blocked);
 
 	/* Migrate to the right CPU */
-	current->cpus_allowed = 1UL << cpu;
+	if (move_to_cpu(current, cpu, 1) < 0) BUG();
 	while (smp_processor_id() != cpu)
 		schedule();
 
diff -Nru linux-2.5.2-pre9.vanilla/kernel/timer.c linux-2.5.2-pre9.xs2/kernel/timer.c
--- linux-2.5.2-pre9.vanilla/kernel/timer.c	Sat Jan  5 19:46:31 2002
+++ linux-2.5.2-pre9.xs2/kernel/timer.c	Sun Jan  6 15:04:05 2002
@@ -589,8 +589,11 @@
 		else
 			kstat.per_cpu_user[cpu] += user_tick;
 		kstat.per_cpu_system[cpu] += system;
-	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
-		kstat.per_cpu_system[cpu] += system;
+	} else {
+		sched_wake_idle();
+		if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+			kstat.per_cpu_system[cpu] += system;
+	}
 }
 
 /*