diff -Nru linux-2.5.1-pre11.vanilla/Makefile linux-2.5.1-pre11.xs2/Makefile
--- linux-2.5.1-pre11.vanilla/Makefile	Thu Dec 13 11:05:02 2001
+++ linux-2.5.1-pre11.xs2/Makefile	Thu Dec 13 11:03:41 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 5
 SUBLEVEL = 1
-EXTRAVERSION =-pre11
+EXTRAVERSION = -pre11-xs2
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -Nru linux-2.5.1-pre11.vanilla/arch/i386/kernel/smpboot.c linux-2.5.1-pre11.xs2/arch/i386/kernel/smpboot.c
--- linux-2.5.1-pre11.vanilla/arch/i386/kernel/smpboot.c	Wed Nov 21 10:35:48 2001
+++ linux-2.5.1-pre11.xs2/arch/i386/kernel/smpboot.c	Thu Dec 13 11:02:58 2001
@@ -799,15 +799,13 @@
 	if (!idle)
 		panic("No idle process for CPU %d", cpu);
 
-	idle->processor = cpu;
-	idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
-
 	map_cpu_to_boot_apicid(cpu, apicid);
 
-	idle->thread.eip = (unsigned long) start_secondary;
-
 	del_from_runqueue(idle);
 	unhash_process(idle);
+	idle->processor = cpu;
+	idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
+	idle->thread.eip = (unsigned long) start_secondary;
 	init_tasks[cpu] = idle;
 
 	/* start_eip had better be page-aligned! */
diff -Nru linux-2.5.1-pre11.vanilla/drivers/net/slip.c linux-2.5.1-pre11.xs2/drivers/net/slip.c
--- linux-2.5.1-pre11.vanilla/drivers/net/slip.c	Sun Sep 30 12:26:07 2001
+++ linux-2.5.1-pre11.xs2/drivers/net/slip.c	Thu Dec 13 11:02:58 2001
@@ -1395,6 +1395,7 @@
 		do {
 			if (busy) {
 				current->counter = 0;
+				current->timer_ticks = 0;
 				schedule();
 			}
 
diff -Nru linux-2.5.1-pre11.vanilla/include/linux/sched.h linux-2.5.1-pre11.xs2/include/linux/sched.h
--- linux-2.5.1-pre11.vanilla/include/linux/sched.h	Thu Dec 13 14:28:40 2001
+++ linux-2.5.1-pre11.xs2/include/linux/sched.h	Thu Dec 20 15:41:57 2001
@@ -15,6 +15,7 @@
 #include <linux/rbtree.h>
 
 #include <asm/system.h>
+#include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -71,7 +72,10 @@
 #define CT_TO_SECS(x)	((x) / HZ)
 #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
 
-extern int nr_running, nr_threads;
+#define nr_running	atomic_read(&gnr_running)
+
+extern atomic_t gnr_running;
+extern int nr_threads;
 extern int last_pid;
 
 #include <linux/fs.h>
@@ -120,6 +124,7 @@
  * yield the CPU for one re-schedule..
  */
 #define SCHED_YIELD		0x10
+#define SCHED_RTLOCAL	0x20
 
 struct sched_param {
 	int sched_priority;
@@ -138,9 +143,9 @@
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
 extern spinlock_t mmlist_lock;
 
+extern void sched_cpudmap_init(void);
 extern void sched_init(void);
 extern void init_idle(void);
 extern void show_state(void);
@@ -318,7 +323,10 @@
 	 * that's just fine.)
 	 */
 	struct list_head run_list;
+	int task_qid;
 	unsigned long sleep_time;
+	long timer_ticks;
+	unsigned long run_jtime;
 
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
@@ -400,6 +408,9 @@
 	int (*notifier)(void *priv);
 	void *notifier_data;
 	sigset_t *notifier_mask;
+
+/* per cpu proc list */
+	struct list_head proclist_cpu;
 	
 /* Thread group tracking */
    	u32 parent_exec_id;
@@ -448,6 +459,10 @@
 #define MAX_COUNTER	(20*HZ/100)
 #define DEF_NICE	(0)
 
+/*
+ * see sched.c comment on this variable
+ */
+extern int decay_ticks;
 
 /*
  * The default (Linux) execution domain.
@@ -467,15 +482,17 @@
     exec_domain:	&default_exec_domain,				\
     lock_depth:		-1,						\
     counter:		DEF_COUNTER,					\
+	timer_ticks:		0,						\
     nice:		DEF_NICE,					\
     policy:		SCHED_OTHER,					\
     mm:			NULL,						\
     active_mm:		&init_mm,					\
     cpus_runnable:	-1,						\
     cpus_allowed:	-1,						\
-    run_list:		LIST_HEAD_INIT(tsk.run_list),			\
+    run_list:		{ NULL, NULL },			\
     next_task:		&tsk,						\
     prev_task:		&tsk,						\
+	pid:			0,						\
     p_opptr:		&tsk,						\
     p_pptr:		&tsk,						\
     thread_group:	LIST_HEAD_INIT(tsk.thread_group),		\
@@ -499,6 +516,9 @@
     blocked:		{{0}},						\
     alloc_lock:		SPIN_LOCK_UNLOCKED,				\
     journal_info:	NULL,						\
+	task_qid:       0,                      \
+	proclist_cpu:   LIST_HEAD_INIT(tsk.proclist_cpu),           \
+	run_jtime:		0,						\
 }
 
 
@@ -791,6 +811,16 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+extern void del_from_runqueue(struct task_struct * p);
+extern void add_to_proclist(struct task_struct * p);
+extern void del_from_proclist(struct task_struct * p);
+extern void sched_wake_idle(void);
+extern int move_to_cpu(struct task_struct * p, int cpu, int stick);
+extern int get_best_cpu(struct task_struct *p);
+extern void runqueue_spin_lock(struct task_struct * p);
+extern void runqueue_spin_unlock(struct task_struct * p);
+
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	wait_queue_t __wait;						\
@@ -844,6 +874,7 @@
 })
 
 #define REMOVE_LINKS(p) do { \
+	del_from_proclist(p); \
 	(p)->next_task->prev_task = (p)->prev_task; \
 	(p)->prev_task->next_task = (p)->next_task; \
 	if ((p)->p_osptr) \
@@ -855,6 +886,7 @@
 	} while (0)
 
 #define SET_LINKS(p) do { \
+	add_to_proclist(p); \
 	(p)->next_task = &init_task; \
 	(p)->prev_task = init_task.prev_task; \
 	init_task.prev_task->next_task = (p); \
@@ -871,17 +903,20 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
 
 static inline int task_on_runqueue(struct task_struct *p)
 {
 	return (p->run_list.next != NULL);
+}
+
+static inline int task_on_proclist(struct task_struct *p)
+{
+	return (p->proclist_cpu.next != NULL);
+}
+
+static inline int task_realtime(struct task_struct *p)
+{
+	return ((p->policy & ~SCHED_YIELD) != SCHED_OTHER);
 }
 
 static inline void unhash_process(struct task_struct *p)
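
The new SCHED_RTLOCAL modifier above is consumed by setscheduler() in kernel/sched.c below: the bit is masked off before the policy is stored, and its absence is what sends a real-time task to the global RT_QID queue. A minimal userspace sketch of how a caller would select a cpu-local real-time policy under this patch (not part of the patch itself; the 0x20 value is taken from the hunk above, priority 10 is arbitrary, and root/CAP_SYS_NICE is needed):

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RTLOCAL
#define SCHED_RTLOCAL	0x20	/* value from the patched <linux/sched.h> */
#endif

int main(void)
{
	struct sched_param sp;

	sp.sched_priority = 10;	/* arbitrary example priority */

	/*
	 * SCHED_FIFO alone puts the task on the global RT queue (RT_QID);
	 * OR-ing in SCHED_RTLOCAL keeps it on its per-cpu runqueue.
	 */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RTLOCAL, &sp) < 0) {
		perror("sched_setscheduler");
		return 1;
	}
	printf("now running as a cpu-local SCHED_FIFO task\n");
	return 0;
}
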
diff -Nru linux-2.5.1-pre11.vanilla/init/main.c linux-2.5.1-pre11.xs2/init/main.c
--- linux-2.5.1-pre11.vanilla/init/main.c	Thu Dec 13 11:05:12 2001
+++ linux-2.5.1-pre11.xs2/init/main.c	Thu Dec 20 12:30:38 2001
@@ -427,6 +427,10 @@
 	 *	make syscalls (and thus be locked).
 	 */
 	smp_init();
+	/*
+	 * after smp initialization we can finally set up the cpu distance map
+	 */
+	sched_cpudmap_init();
 	rest_init();
 }
 
diff -Nru linux-2.5.1-pre11.vanilla/kernel/exit.c linux-2.5.1-pre11.xs2/kernel/exit.c
--- linux-2.5.1-pre11.vanilla/kernel/exit.c	Thu Dec 13 11:05:12 2001
+++ linux-2.5.1-pre11.xs2/kernel/exit.c	Thu Dec 13 11:02:58 2001
@@ -63,8 +63,14 @@
 		 * was given away by the parent in the first place.)
 		 */
 		current->counter += p->counter;
-		if (current->counter >= MAX_COUNTER)
+		if (current->counter >= MAX_COUNTER) {
 			current->counter = MAX_COUNTER;
+			if (current->timer_ticks >= current->counter) {
+				current->counter = 0;
+				current->timer_ticks = 0;
+				current->need_resched = 1;
+			}
+		}
 		p->pid = 0;
 		free_task_struct(p);
 	} else {
diff -Nru linux-2.5.1-pre11.vanilla/kernel/fork.c linux-2.5.1-pre11.xs2/kernel/fork.c
--- linux-2.5.1-pre11.vanilla/kernel/fork.c	Thu Dec 13 11:05:12 2001
+++ linux-2.5.1-pre11.xs2/kernel/fork.c	Thu Dec 13 11:02:58 2001
@@ -21,6 +21,7 @@
 #include <linux/completion.h>
 #include <linux/personality.h>
 
+#include <asm/atomic.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -28,7 +29,7 @@
 
 /* The idle threads do not count.. */
 int nr_threads;
-int nr_running;
+atomic_t gnr_running = ATOMIC_INIT(0);
 
 int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
@@ -614,6 +615,9 @@
 	p->run_list.next = NULL;
 	p->run_list.prev = NULL;
 
+	p->proclist_cpu.next = NULL;
+	p->proclist_cpu.prev = NULL;
+
 	p->p_cptr = NULL;
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
@@ -639,7 +643,13 @@
 	{
 		int i;
 		p->cpus_runnable = ~0UL;
-		p->processor = current->processor;
+		/*
+		 * real time ( and CLONE_PID ) tasks are left on the parent's processor/task_qid
+		 */
+		if (!task_realtime(p) && !(clone_flags & CLONE_PID)) {
+			p->processor = get_best_cpu(p);
+			p->task_qid = cpu_number_map(p->processor);
+		}
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < smp_num_cpus; i++)
 			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
@@ -684,8 +694,14 @@
 	 */
 	p->counter = (current->counter + 1) >> 1;
 	current->counter >>= 1;
-	if (!current->counter)
+	p->timer_ticks = (current->timer_ticks + 1) >> 1;
+	current->timer_ticks >>= 1;
+	if (!current->counter) {
 		current->need_resched = 1;
+		current->timer_ticks = 0;
+	}
+
+	p->run_jtime = 0;
 
 	/*
 	 * Ok, add it to the run-queues and make it
diff -Nru linux-2.5.1-pre11.vanilla/kernel/ksyms.c linux-2.5.1-pre11.xs2/kernel/ksyms.c
--- linux-2.5.1-pre11.vanilla/kernel/ksyms.c	Thu Dec 13 11:05:12 2001
+++ linux-2.5.1-pre11.xs2/kernel/ksyms.c	Thu Dec 13 11:02:58 2001
@@ -447,7 +447,6 @@
 #endif
 
 EXPORT_SYMBOL(kstat);
-EXPORT_SYMBOL(nr_running);
 
 /* misc */
 EXPORT_SYMBOL(panic);
diff -Nru linux-2.5.1-pre11.vanilla/kernel/sched.c linux-2.5.1-pre11.xs2/kernel/sched.c
--- linux-2.5.1-pre11.vanilla/kernel/sched.c	Wed Nov 21 16:25:48 2001
+++ linux-2.5.1-pre11.xs2/kernel/sched.c	Sat Dec 22 13:10:47 2001
@@ -81,18 +81,85 @@
 /*
  * The tasklist_lock protects the linked list of processes.
  *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
+ * This is the lock order :
+ * 1) tasklist_lock
+ * 2) RT_QID
+ * 3) lock(0)
+ * ...
+ * M) lock(N)
  *
- * If both locks are to be concurrently held, the runqueue_lock
- * nests inside the tasklist_lock.
+ * This does not mean that if lock(3) is needed we have to take the
+ * whole chain down to lock(3); it only states that if both RT_QID and
+ * lock(3) are needed, RT_QID must be locked first.
+ * The lock patterns in use are: tasklist_lock+lock(i) when a task is moved
+ * to a different runqueue, RT_QID+lock(i) inside reschedule_idle() of a
+ * global RT task ( only if the best CPU of the RT task is running another
+ * RT task ), and RT_QID+lock(0)+..+lock(N) inside reschedule_idle() of a
+ * global RT task when all CPUs are running RT tasks.
+ * No other patterns are used, i.e. lock(i)+lock(j) never happens.
  *
  * task->alloc_lock nests inside tasklist_lock.
  */
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
+/*
+ * dynamic priority ( counter ) decay limit.
+ * at each timer tick, if counter > decay_ticks the counter is decremented
+ * by 1, while if counter <= decay_ticks the ticks accumulate in timer_ticks
+ * and the task is preempted when timer_ticks reaches counter
+ */
+int decay_ticks = NICE_TO_TICKS(0);
+
+/*
+ * this is the distance map ( move cost ) between cpus.
+ * the move cost from cpu I to cpu J is : cpus_dmap[I][J]
+ * this value can be seen as the number of milliseconds we can
+ * tolerate having an idle cpu before grabbing a remote task
+ * to run on it
+ */
+#define DEF_CPU_DIST_MS	10
+#define FAR_CPU_DIST_MS	20
+#define MS_TO_DIST(t)	(((t) * HZ) / 1000)
+#define DEF_CPU_DIST	MS_TO_DIST(DEF_CPU_DIST_MS)
+#define FAR_CPU_DIST	MS_TO_DIST(FAR_CPU_DIST_MS)
+
+#define cpu_distance(i, j)	((unsigned int) cpus_dmap[i][j])
+
+/*
+ * this is a bonus that we give to cpus that have previously run
+ * an affine mm struct. the bonus value is in milliseconds
+ */
+#define MOVE_MM_BONUS_MS	20
+#define MOVE_MM_BONUS	MS_TO_DIST(MOVE_MM_BONUS_MS)
+
+/*
+ * this is the cpu distance map that should be compiled by the architecture
+ * dependent code or by the common code using a provided abstract topology
+ * interface
+ */
+unsigned char cpus_dmap[NR_CPUS][NR_CPUS];
+
+/*
+ * this is the minimum run queue length that triggers balancing decisions
+ */
+int min_mov_rqlen = 2;
+
+/*
+ * this is the weight ( in milliseconds ) of each remote runnable process;
+ * it is used together with the cpu distance ( metric ) map to build a
+ * uniform move cost
+ */
+int mvtsk_cost = DEF_CPU_DIST_MS / 2 - 1;
+
+/*
+ * this is used to checkpoint global real time tasks. to avoid scanning the
+ * global real time queue every time a global real time task is running,
+ * this variable is incremented at every global real time task wakeup, and
+ * when a pickup from the global real time queue fails for a cpu, that cpu's
+ * local checkpoint is aligned to this one so that subsequent failing list
+ * lookups are avoided.
+ */
+static volatile unsigned long grt_chkp = 0;
 
 /*
  * We align per-CPU scheduling data on cacheline boundaries,
@@ -100,33 +167,80 @@
  */
 static union {
 	struct schedule_data {
-		struct task_struct * curr;
-		cycles_t last_schedule;
+		int qnr_processes;
+		int qnr_running;
+		struct list_head proclist_head;
+		struct list_head runqueue_head;
+		struct task_struct *curr;
+		unsigned long hit_cpus;
+		unsigned char ldhits[NR_CPUS];
+		unsigned long rtt_chkp;
+		spinlock_t runqueue_lock ____cacheline_aligned;
 	} schedule_data;
 	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+} aligned_data [NR_CPUS + 1] __cacheline_aligned;
+
+#define RT_QID	NR_CPUS
+#define global_rttask(p)	((p)->task_qid == RT_QID)
+#define task_foreign(p)	(cpu_number_map((p)->processor) != (p)->task_qid)
+#define cpu_next(cpu)	(((cpu) + 1) < smp_num_cpus ? (cpu) + 1: 0)
 
 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define rtt_chkp(cpu) aligned_data[(cpu)].schedule_data.rtt_chkp
+#define hit_cpus(cpu) aligned_data[(cpu)].schedule_data.hit_cpus
+#define ldhits(cpu, i) aligned_data[(cpu)].schedule_data.ldhits[i]
+#define qnr_processes(cpu) aligned_data[(cpu)].schedule_data.qnr_processes
+#define qnr_running(cpu) aligned_data[(cpu)].schedule_data.qnr_running
+#define proclist_head(cpu) aligned_data[(cpu)].schedule_data.proclist_head
+#define runqueue_head(cpu) aligned_data[(cpu)].schedule_data.runqueue_head
+#define runqueue_lock(cpu) aligned_data[(cpu)].schedule_data.runqueue_lock
+
+
+#define rq_lock(p)	lock_task_rq(p)
+#define rq_unlock(p)	spin_unlock(&runqueue_lock((p)->task_qid))
+#define rq_lock_irq(p)	do { local_irq_disable(); lock_task_rq(p); } while (0)
+#define rq_unlock_irq(p)	do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_enable(); } while (0)
+#define rq_lock_irqsave(p, f)	do { local_irq_save(f); lock_task_rq(p); } while (0)
+#define rq_unlock_irqrestore(p, f)	do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_restore(f); } while (0)
+
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
 
+
 #ifdef CONFIG_SMP
 
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
-#define can_schedule(p,cpu) \
-	((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
+#define can_schedule(p, cpu) \
+	((p)->cpus_runnable & (p)->cpus_allowed & (1 << (cpu)))
+#define can_move(p, cpu) \
+	((p)->cpus_runnable == ~0L && (p)->cpus_allowed & (1 << (cpu)))
+#define run_allowed(p, cpu)	((p)->cpus_allowed & (1 << (cpu)))
 
 #else
 
 #define idle_task(cpu) (&init_task)
-#define can_schedule(p,cpu) (1)
+#define can_schedule(p, cpu) (1)
+#define can_move(p, cpu) (1)
+#define run_allowed(p, cpu)	(1)
 
 #endif
 
+
 void scheduling_functions_start_here(void) { }
 
+static inline void lock_task_rq(struct task_struct *p)
+{
+	int rqn = p->task_qid;
+
+	spin_lock(&runqueue_lock(rqn));
+	while (p->task_qid != rqn) {
+		spin_unlock(&runqueue_lock(rqn));
+		rqn = p->task_qid;
+		spin_lock(&runqueue_lock(rqn));
+	}
+}
+
 /*
  * This is the function that decides how desirable a process is..
  * You can weigh different processes against each other depending
@@ -141,7 +255,7 @@
  *	 +1000: realtime process, select this.
  */
 
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+static inline int goodness(struct task_struct * p, struct mm_struct *this_mm)
 {
 	int weight;
 
@@ -169,13 +283,6 @@
 		if (!weight)
 			goto out;
 			
-#ifdef CONFIG_SMP
-		/* Give a largish advantage to the same processor...   */
-		/* (this is equivalent to penalizing other processors) */
-		if (p->processor == this_cpu)
-			weight += PROC_CHANGE_PENALTY;
-#endif
-
 		/* .. and a slight advantage to the current MM */
 		if (p->mm == this_mm || !p->mm)
 			weight += 1;
@@ -197,107 +304,160 @@
  * the 'goodness value' of replacing a process on a given CPU.
  * positive value means 'replace', zero or negative means 'dont'.
  */
-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p)
 {
-	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+	return goodness(p, prev->active_mm) - goodness(prev, prev->active_mm);
+}
+
+#ifdef CONFIG_SMP
+
+static inline void lock_queues(void)
+{
+	int cpu;
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
+		spin_lock(&runqueue_lock(cpu));
+}
+
+static inline void unlock_queues(void)
+{
+	int cpu;
+	for (cpu = smp_num_cpus - 1; cpu >= 0; cpu--)
+		spin_unlock(&runqueue_lock(cpu));
 }
 
 /*
- * This is ugly, but reschedule_idle() is very timing-critical.
- * We are called with the runqueue spinlock held and we must
- * not claim the tasklist_lock.
+ * this is used to try to find a place to run the global rt task.
+ * it's called with the RT_QID lock held and with local irq disabled.
  */
-static FASTCALL(void reschedule_idle(struct task_struct * p));
-
-static void reschedule_idle(struct task_struct * p)
+static inline void rtt_reschedule_idle(struct task_struct * p)
 {
-#ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
-
-	/*
-	 * shortcut if the woken up task's last CPU is
-	 * idle now.
-	 */
-	best_cpu = p->processor;
-	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
-send_now_idle:
-			/*
-			 * If need_resched == -1 then we can skip sending
-			 * the IPI altogether, tsk->need_resched is
-			 * actively watched by the idle thread.
-			 */
+	int cpu, best_cpu = cpu_number_map(p->processor),
+		this_cpu = cpu_number_map(smp_processor_id()), need_resched, maxpg = 0, pg;
+	struct task_struct *tsk, *ttsk = NULL;
+
+	/*
+	 * increment the global real time task checkpoint. this is protected by
+	 * the RT_QID lock so we can use '++' without atomic ops.
+	 */
+	++grt_chkp;
+
+	/*
+	 * if the best cpu for the global rt task is not currently running
+	 * another rt task, that's the choice.
+	 */
+	if (can_schedule(p, cpu_logical_map(best_cpu))) {
+		spin_lock(&runqueue_lock(best_cpu));
+		tsk = cpu_curr(best_cpu);
+		if (!task_realtime(tsk)) {
 			need_resched = tsk->need_resched;
 			tsk->need_resched = 1;
-			if ((best_cpu != this_cpu) && !need_resched)
-				smp_send_reschedule(best_cpu);
+			if (best_cpu != this_cpu &&
+				(!need_resched || tsk != idle_task(cpu_logical_map(best_cpu))))
+				smp_send_reschedule(cpu_logical_map(best_cpu));
+			spin_unlock(&runqueue_lock(best_cpu));
 			return;
 		}
+		spin_unlock(&runqueue_lock(best_cpu));
 	}
-
 	/*
-	 * We know that the preferred CPU has a cache-affine current
-	 * process, lets try to find a new idle CPU for the woken-up
-	 * process. Select the least recently active idle CPU. (that
-	 * one will have the least active cache context.) Also find
-	 * the executing process which has the least priority.
-	 */
-	oldest_idle = (cycles_t) -1;
-	target_tsk = NULL;
-	max_prio = 0;
-
-	for (i = 0; i < smp_num_cpus; i++) {
-		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
-			continue;
-		tsk = cpu_curr(cpu);
-		/*
-		 * We use the first available idle CPU. This creates
-		 * a priority list between idle CPUs, but this is not
-		 * a problem.
-		 */
-		if (tsk == idle_task(cpu)) {
-			if (last_schedule(cpu) < oldest_idle) {
-				oldest_idle = last_schedule(cpu);
-				target_tsk = tsk;
-			}
-		} else {
-			if (oldest_idle == -1ULL) {
-				int prio = preemption_goodness(tsk, p, cpu);
-
-				if (prio > max_prio) {
-					max_prio = prio;
-					target_tsk = tsk;
-				}
+	 * the best cpu for the global rt task is running another rt task.
+	 * instead of using preemption_goodness() to try to schedule on that cpu
+	 * we try to find a cpu that is not running another rt task.
+	 */
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		if (can_schedule(p, cpu_logical_map(cpu))) {
+			spin_lock(&runqueue_lock(cpu));
+			tsk = cpu_curr(cpu);
+			if (!task_realtime(tsk)) {
+				need_resched = tsk->need_resched;
+				tsk->need_resched = 1;
+				if (cpu != this_cpu &&
+					(!need_resched || tsk != idle_task(cpu_logical_map(cpu))))
+					smp_send_reschedule(cpu_logical_map(cpu));
+				spin_unlock(&runqueue_lock(cpu));
+				return;
 			}
+			spin_unlock(&runqueue_lock(cpu));
 		}
 	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
+	/*
+	 * it's not our lucky day ..., all the cpus are running rt tasks and
+	 * a preemption_goodness() loop is needed to ensure that the global
+	 * priority is respected among rt tasks.
+	 */
+	lock_queues();
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		if (can_schedule(p, cpu_logical_map(cpu))) {
+			tsk = cpu_curr(cpu);
+			if ((pg = preemption_goodness(tsk, p)) > maxpg) {
+				ttsk = tsk;
+				maxpg = pg;
+				if (tsk == idle_task(cpu_logical_map(cpu)))
+					break;
+			}
 		}
+	}
+	if (ttsk) {
+		need_resched = ttsk->need_resched;
+		ttsk->need_resched = 1;
+		if (ttsk->processor != smp_processor_id() && !need_resched)
+			smp_send_reschedule(ttsk->processor);
+	}
+	unlock_queues();
+}
+
+static inline void std_reschedule_idle(struct task_struct * p)
+{
+	int best_cpu = p->task_qid, this_cpu = cpu_number_map(smp_processor_id());
+	struct task_struct *tsk;
+
+	tsk = cpu_curr(best_cpu);
+	if (tsk == idle_task(cpu_logical_map(best_cpu))) {
+		/*
+		 * If need_resched == -1 then we can skip sending
+		 * the IPI altogether, tsk->need_resched is
+		 * actively watched by the idle thread.
+		 */
+		int need_resched = tsk->need_resched;
+		tsk->need_resched = 1;
+		if ((best_cpu != this_cpu) && !need_resched)
+			smp_send_reschedule(cpu_logical_map(best_cpu));
+	} else if (tsk != p && preemption_goodness(tsk, p) > 0) {
 		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+		if (tsk->task_qid != this_cpu)
+			smp_send_reschedule(cpu_logical_map(tsk->task_qid));
 	}
-	return;
+}
 		
+#endif	/* #ifdef CONFIG_SMP */
+
+/*
+ * This is ugly, but reschedule_idle() is very timing-critical.
+ * We are called with the runqueue spinlock held and we must
+ * not claim the tasklist_lock.
+ */
+static FASTCALL(void reschedule_idle(struct task_struct * p));
 
-#else /* UP */
-	int this_cpu = smp_processor_id();
+static void reschedule_idle(struct task_struct * p)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * it's better to fork the path here instead of having complex if()s
+	 * inside the function itself. rt tasks really have different wakeup
+	 * methods compared with local cpu ones
+	 */
+	if (!global_rttask(p))
+		std_reschedule_idle(p);
+	else
+		rtt_reschedule_idle(p);
+
+#else	/* #ifdef CONFIG_SMP */
 	struct task_struct *tsk;
 
-	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
+	tsk = cpu_curr(smp_processor_id());
+	if (preemption_goodness(tsk, p) > 0)
 		tsk->need_resched = 1;
-#endif
+#endif	/* #ifdef CONFIG_SMP */
 }
 
 /*
@@ -307,25 +467,288 @@
  * run-queue, not the end. See the comment about "This is
  * subtle" in the scheduler proper..
  */
-static inline void add_to_runqueue(struct task_struct * p)
+static inline void __add_to_runqueue(struct task_struct * p, int task_qid)
+{
+	list_add(&p->run_list, &runqueue_head(task_qid));
+	++qnr_running(task_qid);
+	atomic_inc(&gnr_running);
+}
+
+static inline void __del_from_runqueue(struct task_struct * p, int task_qid)
+{
+	atomic_dec(&gnr_running);
+	--qnr_running(task_qid);
+	p->sleep_time = jiffies;
+	list_del(&p->run_list);
+	p->run_list.next = NULL;
+}
+
+void del_from_runqueue(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__del_from_runqueue(p, p->task_qid);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __add_to_proclist(struct task_struct * p, int task_qid)
+{
+	list_add(&p->proclist_cpu, &proclist_head(task_qid));
+	++qnr_processes(task_qid);
+}
+
+void add_to_proclist(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__add_to_proclist(p, p->task_qid);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __del_from_proclist(struct task_struct * p, int task_qid)
+{
+	list_del(&p->proclist_cpu);
+	--qnr_processes(task_qid);
+	p->proclist_cpu.next = NULL;
+}
+
+void del_from_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__del_from_proclist(p, p->task_qid);
+	rq_unlock_irqrestore(p, flags);
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+void runqueue_spin_lock(struct task_struct * p)
+{
+	rq_lock(p);
+}
+
+void runqueue_spin_unlock(struct task_struct * p)
+{
+	rq_unlock(p);
+}
+
+static inline void __move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, &runqueue_head(p->task_qid));
 }
 
-static inline void move_first_runqueue(struct task_struct * p)
+static inline void __move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &runqueue_head(p->task_qid));
+}
+
+/*
+ * move_to_rqn() must be called with 1) local irqs disabled,
+ * 2) tasklist_lock write-locked and 3) the task's runqueue lock held
+ */
+static int move_to_rqn(struct task_struct * p, int rqn, int stick)
+{
+	int task_cpu;
+	unsigned long cpus_allowed;
+
+	if (p->task_qid == rqn) {
+		if (stick)
+			p->cpus_allowed = (1 << cpu_logical_map(rqn));
+		return rqn;
+	}
+	if (task_on_runqueue(p))
+		__del_from_runqueue(p, p->task_qid);
+	if (task_on_proclist(p))
+		__del_from_proclist(p, p->task_qid);
+	cpus_allowed = stick ? (1 << cpu_logical_map(rqn)): p->cpus_allowed;
+	p->cpus_allowed = 0;
+	task_cpu = p->task_qid;
+	p->task_qid = rqn;
+	spin_unlock(&runqueue_lock(task_cpu));
+
+	rq_lock(p);
+	__add_to_proclist(p, p->task_qid);
+	if (!task_on_runqueue(p) && p->state == TASK_RUNNING)
+		__add_to_runqueue(p, p->task_qid);
+	p->cpus_allowed = cpus_allowed;
+	return task_cpu;
+}
+
+/*
+ * this is only called by softirq.c::ksoftirqd() and is used to pin
+ * the per-cpu ksoftirqd tasks to their own cpus.
+ */
+int move_to_cpu(struct task_struct * p, int cpu, int stick)
+{
+#ifdef CONFIG_SMP
+	unsigned long flags;
+
+	write_lock_irqsave(&tasklist_lock, flags);
+	rq_lock(p);
+	move_to_rqn(p, cpu_number_map(cpu), stick);
+	rq_unlock(p);
+	write_unlock_irqrestore(&tasklist_lock, flags);
+	if (cpu != smp_processor_id())
+		smp_send_reschedule(cpu);
+	return cpu;
+#else	/* #ifdef CONFIG_SMP */
+	return 0;
+#endif	/* #ifdef CONFIG_SMP */
+}
+
+/*
+ * this function gets called from kernel/timer.c when the timer
+ * tick hits the idle task. architectures with a huge HZ might
+ * want to avoid waking up the idle task at every timer tick
+ */
+void sched_wake_idle(void)
+{
+	if (smp_num_cpus > 1)
+		current->need_resched = 1;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * the runtime cpu distance is the base cpu distance plus the
+ * load on the remote cpu
+ */
+static inline long rt_cpu_dist(int src_cpu, int dst_cpu)
+{
+	return (cpu_distance(src_cpu, dst_cpu) << 4) +
+		(qnr_running(src_cpu) * mvtsk_cost  * (HZ << 4)) / 1000;
+}
+
+/*
+ * try to find the best cpu to run a freshly created process; no locks are
+ * held during this function. it gets called by do_fork() in SMP mode
+ */
+int get_best_cpu(struct task_struct *p)
+{
+	int i, best_cpu, this_cpu = cpu_number_map(smp_processor_id());
+	long cdist, min_cdist;
+
+	best_cpu = this_cpu;
+	min_cdist = rt_cpu_dist(this_cpu, this_cpu);
+	for (i = 0; i < smp_num_cpus; i++) {
+		if (i == this_cpu || !run_allowed(p, cpu_logical_map(i))) continue;
+		if ((cdist = rt_cpu_dist(i, this_cpu)) < min_cdist) {
+			min_cdist = cdist;
+			best_cpu = i;
+		}
+	}
+	return cpu_logical_map(best_cpu);
+}
+
+static inline long move_goodness(struct task_struct *p, struct mm_struct *this_mm)
+{
+	long mgds = (long) (jiffies - p->run_jtime);
+	if (p->mm == this_mm || !p->mm)
+		mgds += MOVE_MM_BONUS;
+	return mgds;
+}
+
+static inline struct task_struct *try_steal_task(int src_cpu, int dst_cpu)
+{
+	int ldst_cpu = cpu_logical_map(dst_cpu);
+	long mgdns = -1, mvg;
+	struct mm_struct *this_mm = current->active_mm;
+	struct task_struct *tsk, *mvtsk = NULL;
+	struct list_head *head, *tmp;
+
+	spin_lock_irq(&runqueue_lock(src_cpu));
+	head = &runqueue_head(src_cpu);
+	list_for_each(tmp, head) {
+		tsk = list_entry(tmp, struct task_struct, run_list);
+		if (can_move(tsk, ldst_cpu) && !task_foreign(tsk) &&
+			(mvg = move_goodness(tsk, this_mm)) > mgdns) {
+			mvtsk = tsk;
+			mgdns = mvg;
+		}
+	}
+	if (mvtsk) {
+		unsigned long cpus_allowed = mvtsk->cpus_allowed;
+
+		mvtsk->cpus_allowed = 0;
+		__del_from_runqueue(mvtsk, src_cpu);
+		spin_unlock(&runqueue_lock(src_cpu));
+		write_lock(&tasklist_lock);
+		spin_lock(&runqueue_lock(src_cpu));
+		__del_from_proclist(mvtsk, src_cpu);
+		spin_unlock(&runqueue_lock(src_cpu));
+		spin_lock(&runqueue_lock(dst_cpu));
+		__add_to_runqueue(mvtsk, dst_cpu);
+		__add_to_proclist(mvtsk, dst_cpu);
+		mvtsk->counter -= mvtsk->timer_ticks;
+		mvtsk->timer_ticks = 0;
+		mvtsk->cpus_allowed = cpus_allowed;
+		mvtsk->task_qid = dst_cpu;
+		spin_unlock(&runqueue_lock(dst_cpu));
+		write_unlock_irq(&tasklist_lock);
+	} else
+		spin_unlock_irq(&runqueue_lock(src_cpu));
+	return mvtsk;
 }
 
 /*
+ * the move cost is the difference from the cpu distance and the run queue
+ * load on the remote cpu. both terms are scaled by a factor 16 ( << 4 ) and
+ * the cost for each remote cpu task depend on mvtsk_cost
+ */
+static inline long move_cost(int src_cpu, int dst_cpu)
+{
+	return (cpu_distance(src_cpu, dst_cpu) << 4) -
+		(qnr_running(src_cpu) * mvtsk_cost  * (HZ << 4)) / 1000;
+}
+
+static inline struct task_struct *get_remote_task(int this_cpu)
+{
+	int i, max_cpu;
+	unsigned long hcpus = 0;
+	long ccost, min_cost;
+	struct task_struct *rtask;
+
+	this_cpu = cpu_number_map(this_cpu);
+	for (i = 0; i < smp_num_cpus; i++) {
+		if (i == this_cpu) continue;
+		if (qnr_running(i) >= min_mov_rqlen) {
+			if (hit_cpus(this_cpu) & (1 << i))
+				++ldhits(this_cpu, i);
+			else {
+				hit_cpus(this_cpu) |= (1 << i);
+				ldhits(this_cpu, i) = 1;
+			}
+			if (ldhits(this_cpu, i) >= cpu_distance(this_cpu, i))
+				hcpus |= (1 << i);
+		} else
+			hit_cpus(this_cpu) &= ~(1 << i);
+	}
+	while (hcpus) {
+		max_cpu = -1;
+		min_cost = 1000;
+		for (i = 0; i < smp_num_cpus; i++) {
+			if (!(hcpus & (1 << i))) continue;
+			if ((ccost = move_cost(i, this_cpu)) < min_cost) {
+				min_cost = ccost;
+				max_cpu = i;
+			}
+		}
+		if (max_cpu < 0) break;
+		if ((rtask = try_steal_task(max_cpu, this_cpu))) {
+			hit_cpus(this_cpu) = 0;
+			return rtask;
+		}
+		hcpus &= ~(1 << max_cpu);
+	}
+	return NULL;
+}
+
+#endif	/* #ifdef CONFIG_SMP */
+
+/*
  * Wake up a process. Put it on the run-queue if it's not
  * already there.  The "current" process is always on the
  * run-queue (except when the actual re-schedule is in
@@ -341,16 +764,21 @@
 	/*
 	 * We want the common case fall through straight, thus the goto.
 	 */
-	spin_lock_irqsave(&runqueue_lock, flags);
+	rq_lock_irqsave(p, flags);
 	p->state = TASK_RUNNING;
-	if (task_on_runqueue(p))
+	/*
+	 * cpus_allowed is cleared while a task is being moved from one cpu
+	 * to another, and is used to avoid the task being picked up while
+	 * we're switching locks.
+	 */
+	if (task_on_runqueue(p) || !p->cpus_allowed)
 		goto out;
-	add_to_runqueue(p);
-	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
+	__add_to_runqueue(p, p->task_qid);
+	if (!synchronous || p->task_qid != cpu_number_map(smp_processor_id()))
 		reschedule_idle(p);
 	success = 1;
 out:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
+	rq_unlock_irqrestore(p, flags);
 	return success;
 }
 
@@ -476,41 +904,8 @@
 	task_lock(prev);
 	task_release_cpu(prev);
 	mb();
-	if (prev->state == TASK_RUNNING)
-		goto needs_resched;
-
-out_unlock:
 	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
 	return;
-
-	/*
-	 * Slow path - we 'push' the previous process and
-	 * reschedule_idle() will attempt to find a new
-	 * processor for it. (but it might preempt the
-	 * current process as well.) We must take the runqueue
-	 * lock and re-check prev->state to be correct. It might
-	 * still happen that this process has a preemption
-	 * 'in progress' already - but this is not a problem and
-	 * might happen in other circumstances as well.
-	 */
-needs_resched:
-	{
-		unsigned long flags;
-
-		/*
-		 * Avoid taking the runqueue lock in cases where
-		 * no preemption-check is necessery:
-		 */
-		if ((prev == idle_task(smp_processor_id())) ||
-						(policy & SCHED_YIELD))
-			goto out_unlock;
-
-		spin_lock_irqsave(&runqueue_lock, flags);
-		if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
-			reschedule_idle(prev);
-		spin_unlock_irqrestore(&runqueue_lock, flags);
-		goto out_unlock;
-	}
 #else
 	prev->policy &= ~SCHED_YIELD;
 #endif /* CONFIG_SMP */
@@ -521,6 +916,17 @@
 	__schedule_tail(prev);
 }
 
+static inline void set_task_running(struct task_struct *p, int cpu)
+{
+#ifdef CONFIG_SMP
+	if (p != idle_task(cpu) && hit_cpus(cpu_number_map(cpu)))
+		hit_cpus(cpu_number_map(cpu)) = 0;
+	p->run_jtime = jiffies;
+#endif	/* #ifdef CONFIG_SMP */
+	cpu_curr(cpu_number_map(cpu)) = p;
+	task_set_cpu(p, cpu);
+}
+
 /*
  *  'schedule()' is the scheduler function. It's a very simple and nice
  * scheduler: it's not perfect, but certainly works for most things.
@@ -533,13 +939,11 @@
  */
 asmlinkage void schedule(void)
 {
-	struct schedule_data * sched_data;
 	struct task_struct *prev, *next, *p;
-	struct list_head *tmp;
+	struct list_head *head, *tmp;
 	int this_cpu, c;
 
-
-	spin_lock_prefetch(&runqueue_lock);
+	spin_lock_prefetch(&runqueue_lock(current->processor));
 
 	if (!current->active_mm) BUG();
 need_resched_back:
@@ -554,33 +958,50 @@
 	release_kernel_lock(prev, this_cpu);
 
 	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
+	 * lock the task run queue to perform task related ops like
+	 * move to last and del from runqueue.
 	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
-
-	spin_lock_irq(&runqueue_lock);
+	rq_lock_irq(prev);
 
 	/* move an exhausted RR process to be last.. */
 	if (unlikely(prev->policy == SCHED_RR))
 		if (!prev->counter) {
 			prev->counter = NICE_TO_TICKS(prev->nice);
-			move_last_runqueue(prev);
+			__move_last_runqueue(prev);
 		}
 
 	switch (prev->state) {
-		case TASK_INTERRUPTIBLE:
-			if (signal_pending(prev)) {
-				prev->state = TASK_RUNNING;
-				break;
-			}
-		default:
-			del_from_runqueue(prev);
-		case TASK_RUNNING:;
+	case TASK_INTERRUPTIBLE:
+		if (signal_pending(prev)) {
+			prev->state = TASK_RUNNING;
+			break;
+		}
+	default:
+		if (task_on_runqueue(prev))
+			__del_from_runqueue(prev, prev->task_qid);
+	case TASK_RUNNING:;
 	}
 	prev->need_resched = 0;
 
 	/*
+	 * check the global rt queue first without holding locks and, if it's
+	 * not empty, try to pick up an rt task first. despite the new "unlikely"
+	 * feature, the rt task selection code is kept out of line.
+	 */
+	if (grt_chkp != rtt_chkp(cpu_number_map(this_cpu)) &&
+		!list_empty(&runqueue_head(RT_QID)))
+		goto rt_queue_select;
+
+	/*
+	 * this is true for running tasks moved with move_to_rqn() ( the first time
+	 * they call schedule() ) and for global RT tasks.
+	 */
+	if (unlikely(task_foreign(prev))) {
+		rq_unlock(prev);
+		spin_lock(&runqueue_lock(cpu_number_map(this_cpu)));
+	}
+
+	/*
 	 * this is the scheduler proper:
 	 */
 
@@ -590,10 +1011,11 @@
 	 */
 	next = idle_task(this_cpu);
 	c = -1000;
-	list_for_each(tmp, &runqueue_head) {
+	head = &runqueue_head(cpu_number_map(this_cpu));
+	list_for_each(tmp, head) {
 		p = list_entry(tmp, struct task_struct, run_list);
 		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
+			int weight = goodness(p, prev->active_mm);
 			if (weight > c)
 				c = weight, next = p;
 		}
@@ -601,25 +1023,26 @@
 
 	/* Do we need to re-calculate counters? */
 	if (unlikely(!c)) {
-		struct task_struct *p;
-
-		spin_unlock_irq(&runqueue_lock);
+		spin_unlock_irq(&runqueue_lock(cpu_number_map(this_cpu)));
 		read_lock(&tasklist_lock);
-		for_each_task(p)
+		head = &proclist_head(cpu_number_map(this_cpu));
+		list_for_each(tmp, head) {
+			p = list_entry(tmp, struct task_struct, proclist_cpu);
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+		}
 		read_unlock(&tasklist_lock);
-		spin_lock_irq(&runqueue_lock);
+		spin_lock_irq(&runqueue_lock(cpu_number_map(this_cpu)));
 		goto repeat_schedule;
 	}
 
 	/*
 	 * from this point on nothing can prevent us from
-	 * switching to the next task, save this fact in
-	 * sched_data.
+	 * switching to the next task.
 	 */
-	sched_data->curr = next;
-	task_set_cpu(next, this_cpu);
-	spin_unlock_irq(&runqueue_lock);
+	set_task_running(next, this_cpu);
+	spin_unlock_irq(&runqueue_lock(cpu_number_map(this_cpu)));
+
+rt_task_selected:
 
 	if (unlikely(prev == next)) {
 		/* We won't go through the normal tail, so do this by hand */
@@ -627,24 +1050,6 @@
 		goto same_process;
 	}
 
-#ifdef CONFIG_SMP
- 	/*
- 	 * maintain the per-process 'last schedule' value.
- 	 * (this has to be recalculated even if we reschedule to
- 	 * the same process) Currently this is only used on SMP,
-	 * and it's approximate, so we do not have to maintain
-	 * it while holding the runqueue spinlock.
- 	 */
- 	sched_data->last_schedule = get_cycles();
-
-	/*
-	 * We drop the scheduler lock early (it's a global spinlock),
-	 * thus we have to lock the previous process from getting
-	 * rescheduled during switch_to().
-	 */
-
-#endif /* CONFIG_SMP */
-
 	kstat.context_swtch++;
 	/*
 	 * there are 3 processes which are affected by a context switch:
@@ -684,9 +1089,55 @@
 
 same_process:
 	reacquire_kernel_lock(current);
+
+#ifdef CONFIG_SMP
+	if (unlikely(current == idle_task(this_cpu)))
+		if (get_remote_task(this_cpu))
+			goto need_resched_back;
+#endif	/* #ifdef CONFIG_SMP */
+
 	if (current->need_resched)
 		goto need_resched_back;
 	return;
+
+rt_queue_select:
+	/*
+	 * the fast lockless check reported that a pickup from the global rt
+	 * queue might succeed, so we try it here. this section is entered with
+	 * "prev"'s queue locked; if the "prev" task qid is not RT_QID then that
+	 * queue must be unlocked and the RT_QID lock acquired.
+	 */
+	if (!global_rttask(prev)) {
+		rq_unlock(prev);
+		spin_lock(&runqueue_lock(RT_QID));
+	}
+	c = 0;
+	head = &runqueue_head(RT_QID);
+	list_for_each(tmp, head) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		if (can_schedule(p, this_cpu)) {
+			int weight = goodness(p, prev->active_mm);
+			if (weight > c)
+				c = weight, next = p;
+		}
+	}
+	if (!c) {
+		/*
+		 * the fast test reported a false positive so we go back to
+		 * the local CPU runqueue selection.
+		 */
+		rtt_chkp(cpu_number_map(this_cpu)) = grt_chkp;
+		spin_unlock(&runqueue_lock(RT_QID));
+		spin_lock(&runqueue_lock(cpu_number_map(this_cpu)));
+		goto repeat_schedule;
+	}
+	/*
+	 * the global rt task has been selected and final setup is needed.
+	 */
+	set_task_running(next, this_cpu);
+	spin_unlock_irq(&runqueue_lock(RT_QID));
+	goto rt_task_selected;
+
 }
 
 /*
@@ -886,6 +1337,7 @@
 static int setscheduler(pid_t pid, int policy, 
 			struct sched_param *param)
 {
+	int grt = 0, pgrt = 0, rqn;
 	struct sched_param lp;
 	struct task_struct *p;
 	int retval;
@@ -900,22 +1352,29 @@
 
 	/*
 	 * We play safe to avoid deadlocks.
+	 * It's possible that we need a write lock to move the task in/out of the
+	 * RT_QID run queue, so instead of taking a read lock and having to
+	 * release it and re-lock for writing, we take the write lock directly.
 	 */
-	read_lock_irq(&tasklist_lock);
-	spin_lock(&runqueue_lock);
+	write_lock_irq(&tasklist_lock);
 
 	p = find_process_by_pid(pid);
 
 	retval = -ESRCH;
 	if (!p)
-		goto out_unlock;
+		goto out_unlock_tkll;
 			
+	rq_lock(p);
 	if (policy < 0)
 		policy = p->policy;
 	else {
+		grt = (policy & SCHED_RTLOCAL) == 0;
+		policy &= ~SCHED_RTLOCAL;
+		pgrt = global_rttask(p);
+
 		retval = -EINVAL;
 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
-				policy != SCHED_OTHER)
+			policy != SCHED_OTHER)
 			goto out_unlock;
 	}
 	
@@ -940,14 +1399,27 @@
 	retval = 0;
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
-	if (task_on_runqueue(p))
-		move_first_runqueue(p);
+	if (pgrt == grt) {
+		rqn = p->task_qid;
+		if (task_on_runqueue(p))
+			__move_first_runqueue(p);
+	} else {
+		rqn = cpu_number_map(p->processor);
+		move_to_rqn(p, grt ? RT_QID: rqn, 0);
+	}
 
-	current->need_resched = 1;
+	if (grt || rqn == cpu_number_map(smp_processor_id()))
+		current->need_resched = 1;
+	else {
+#ifdef CONFIG_SMP
+		smp_send_reschedule(cpu_logical_map(rqn));
+#endif	/* #ifdef CONFIG_SMP */
+	}
 
 out_unlock:
-	spin_unlock(&runqueue_lock);
-	read_unlock_irq(&tasklist_lock);
+	rq_unlock(p);
+out_unlock_tkll:
+	write_unlock_irq(&tasklist_lock);
 
 out_nounlock:
 	return retval;
@@ -1017,6 +1489,7 @@
 
 asmlinkage long sys_sched_yield(void)
 {
+	struct task_struct *ctsk = current;
 	/*
 	 * Trick. sched_yield() first counts the number of truly 
 	 * 'pending' runnable processes, then returns if it's
@@ -1024,34 +1497,18 @@
 	 * to be atomic.) In threaded applications this optimization
 	 * gets triggered quite often.
 	 */
-
-	int nr_pending = nr_running;
-
-#if CONFIG_SMP
-	int i;
-
-	// Subtract non-idle processes running on other CPUs.
-	for (i = 0; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
-			nr_pending--;
-	}
-#else
-	// on UP this process is on the runqueue as well
-	nr_pending--;
-#endif
-	if (nr_pending) {
+	if (qnr_running(ctsk->task_qid) > 1) {
 		/*
 		 * This process can only be rescheduled by us,
 		 * so this is safe without any locking.
 		 */
-		if (current->policy == SCHED_OTHER)
-			current->policy |= SCHED_YIELD;
-		current->need_resched = 1;
-
-		spin_lock_irq(&runqueue_lock);
-		move_last_runqueue(current);
-		spin_unlock_irq(&runqueue_lock);
+		if (ctsk->policy == SCHED_OTHER)
+			ctsk->policy |= SCHED_YIELD;
+		local_irq_disable();
+		if (ctsk->counter > 0)
+			--ctsk->counter;
+		local_irq_enable();
+		ctsk->need_resched = 1;
 	}
 	return 0;
 }
@@ -1231,7 +1688,7 @@
 
 	/* We also take the runqueue_lock while altering task fields
 	 * which affect scheduling decisions */
-	spin_lock(&runqueue_lock);
+	rq_lock(this_task);
 
 	this_task->ptrace = 0;
 	this_task->nice = DEF_NICE;
@@ -1246,7 +1703,7 @@
 	memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
 	this_task->user = INIT_USER;
 
-	spin_unlock(&runqueue_lock);
+	rq_unlock(this_task);
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -1286,34 +1743,65 @@
 
 void __init init_idle(void)
 {
-	struct schedule_data * sched_data;
-	sched_data = &aligned_data[smp_processor_id()].schedule_data;
-
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
 			smp_processor_id(), current->pid);
-		del_from_runqueue(current);
+		__del_from_runqueue(current, current->task_qid);
 	}
-	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
+	cpu_curr(cpu_number_map(smp_processor_id())) = current;
 	clear_bit(current->processor, &wait_init_idle);
 }
 
 extern void init_timervecs (void);
 
+void __init sched_cpudmap_init(void)
+{
+	int i, j;
+
+	/*
+	 * this should use a topology api ( if provided ) to set up
+	 * the distance map. right now it assigns the same distance to
+	 * groups of 4 cpus
+	 */
+	for (i = 0; i < NR_CPUS; i++)
+		for (j = 0; j <= i; j++)
+			if ((i & ~0x03) == (j & ~0x03))
+				cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST;
+			else
+				cpus_dmap[i][j] = cpus_dmap[j][i] = FAR_CPU_DIST;
+
+}
+
 void __init sched_init(void)
 {
 	/*
 	 * We have to do a little magic to get the first
 	 * process right in SMP mode.
 	 */
-	int cpu = smp_processor_id();
-	int nr;
+	int i, j, cpu = smp_processor_id();
+
+	for (i = 0; i <= NR_CPUS; i++) {
+		qnr_processes(i) = 0;
+		qnr_running(i) = 0;
+		cpu_curr(i) = &init_task;
+		rtt_chkp(i) = 0;
+		hit_cpus(i) = 0;
+		INIT_LIST_HEAD(&runqueue_head(i));
+		INIT_LIST_HEAD(&proclist_head(i));
+		runqueue_lock(i) = SPIN_LOCK_UNLOCKED;
+	}
+
+	/*
+	 * give a first initialization to the cpu distance map
+	 */
+	for (i = 0; i < NR_CPUS; i++)
+		for (j = 0; j <= i; j++)
+			cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST;
 
 	init_task.processor = cpu;
 
-	for(nr = 0; nr < PIDHASH_SZ; nr++)
-		pidhash[nr] = NULL;
+	for(i = 0; i < PIDHASH_SZ; i++)
+		pidhash[i] = NULL;
 
 	init_timervecs();
 
diff -Nru linux-2.5.1-pre11.vanilla/kernel/signal.c linux-2.5.1-pre11.xs2/kernel/signal.c
--- linux-2.5.1-pre11.vanilla/kernel/signal.c	Wed Nov 21 16:26:27 2001
+++ linux-2.5.1-pre11.xs2/kernel/signal.c	Thu Dec 13 11:02:58 2001
@@ -478,10 +478,10 @@
 	 * process of changing - but no harm is done by that
 	 * other than doing an extra (lightweight) IPI interrupt.
 	 */
-	spin_lock(&runqueue_lock);
+	runqueue_spin_lock(t);
 	if (task_has_cpu(t) && t->processor != smp_processor_id())
 		smp_send_reschedule(t->processor);
-	spin_unlock(&runqueue_lock);
+	runqueue_spin_unlock(t);
 #endif /* CONFIG_SMP */
 
 	if (t->state & TASK_INTERRUPTIBLE) {
diff -Nru linux-2.5.1-pre11.vanilla/kernel/softirq.c linux-2.5.1-pre11.xs2/kernel/softirq.c
--- linux-2.5.1-pre11.vanilla/kernel/softirq.c	Thu Dec 13 11:05:12 2001
+++ linux-2.5.1-pre11.xs2/kernel/softirq.c	Thu Dec 13 11:02:58 2001
@@ -369,7 +369,7 @@
 	sigfillset(&current->blocked);
 
 	/* Migrate to the right CPU */
-	current->cpus_allowed = 1UL << cpu;
+	if (move_to_cpu(current, cpu, 1) < 0) BUG();
 	while (smp_processor_id() != cpu)
 		schedule();
 
diff -Nru linux-2.5.1-pre11.vanilla/kernel/timer.c linux-2.5.1-pre11.xs2/kernel/timer.c
--- linux-2.5.1-pre11.vanilla/kernel/timer.c	Mon Oct  8 10:41:41 2001
+++ linux-2.5.1-pre11.xs2/kernel/timer.c	Thu Dec 13 11:02:58 2001
@@ -583,8 +583,11 @@
 
 	update_one_process(p, user_tick, system, cpu);
 	if (p->pid) {
-		if (--p->counter <= 0) {
+		if (p->counter > decay_ticks)
+			--p->counter;
+		else if (++p->timer_ticks >= p->counter) {
 			p->counter = 0;
+			p->timer_ticks = 0;
 			p->need_resched = 1;
 		}
 		if (p->nice > 0)
@@ -592,8 +595,12 @@
 		else
 			kstat.per_cpu_user[cpu] += user_tick;
 		kstat.per_cpu_system[cpu] += system;
-	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
-		kstat.per_cpu_system[cpu] += system;
+	} else {
+		sched_wake_idle();
+
+		if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+			kstat.per_cpu_system[cpu] += system;
+	}
 }
 
 /*
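
To see what the decay_ticks scheme (documented at the top of kernel/sched.c) does to the tick accounting above: the vanilla code simply decrements counter to zero, while the patched branch stops decaying counter once it reaches decay_ticks and lets timer_ticks count out the rest of the slice, so the counter (and hence the task's goodness) holds at decay_ticks while the total slice length stays the same. A standalone simulation of that branch, not kernel code; the values 10 and 6 are assumptions corresponding to DEF_COUNTER and NICE_TO_TICKS(0) at HZ=100:

#include <stdio.h>

int main(void)
{
	int counter = 10;	/* assumed DEF_COUNTER at HZ=100 (100 ms slice) */
	int timer_ticks = 0;
	int decay_ticks = 6;	/* assumed NICE_TO_TICKS(0) at HZ=100 */
	int tick;

	for (tick = 1; ; tick++) {
		/* the per-tick branch added to update_process_times() above */
		if (counter > decay_ticks)
			--counter;
		else if (++timer_ticks >= counter) {
			counter = 0;
			timer_ticks = 0;
			printf("tick %2d: slice expired, need_resched set\n", tick);
			break;
		}
		printf("tick %2d: counter=%d timer_ticks=%d\n",
		       tick, counter, timer_ticks);
	}
	return 0;
}
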
diff -Nru linux-2.5.1-pre11.vanilla/mm/oom_kill.c linux-2.5.1-pre11.xs2/mm/oom_kill.c
--- linux-2.5.1-pre11.vanilla/mm/oom_kill.c	Sat Nov  3 17:05:25 2001
+++ linux-2.5.1-pre11.xs2/mm/oom_kill.c	Thu Dec 13 11:02:58 2001
@@ -150,6 +150,7 @@
 	 * exit() and clear out its resources quickly...
 	 */
 	p->counter = 5 * HZ;
+	p->timer_ticks = 0;
 	p->flags |= PF_MEMALLOC | PF_MEMDIE;
 
 	/* This process has hardware access, be more careful. */
