diff -Nru linux-2.4.13.vanilla/Makefile linux-2.4.13.xsched/Makefile
--- linux-2.4.13.vanilla/Makefile	Tue Oct 23 22:21:20 2001
+++ linux-2.4.13.xsched/Makefile	Thu Oct 25 10:43:37 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 13
-EXTRAVERSION =
+EXTRAVERSION = xsched
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -Nru linux-2.4.13.vanilla/arch/i386/kernel/process.c linux-2.4.13.xsched/arch/i386/kernel/process.c
--- linux-2.4.13.vanilla/arch/i386/kernel/process.c	Thu Oct  4 18:42:54 2001
+++ linux-2.4.13.xsched/arch/i386/kernel/process.c	Thu Oct 25 10:42:59 2001
@@ -135,6 +135,9 @@
 			idle();
 		schedule();
 		check_pgt_cache();
+#ifdef CONFIG_SMP
+		runqueue_balance(IDLE_RQBALANCE);
+#endif	/* #ifdef CONFIG_SMP */
 	}
 }
 
diff -Nru linux-2.4.13.vanilla/arch/i386/kernel/smpboot.c linux-2.4.13.xsched/arch/i386/kernel/smpboot.c
--- linux-2.4.13.vanilla/arch/i386/kernel/smpboot.c	Thu Oct  4 18:42:54 2001
+++ linux-2.4.13.xsched/arch/i386/kernel/smpboot.c	Thu Oct 25 10:42:59 2001
@@ -771,7 +771,7 @@
 
 extern unsigned long cpu_initialized;
 
-static void __init do_boot_cpu (int apicid) 
+static void __init do_boot_cpu (int apicid)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -799,15 +799,14 @@
 	if (!idle)
 		panic("No idle process for CPU %d", cpu);
 
-	idle->processor = cpu;
-
 	map_cpu_to_boot_apicid(cpu, apicid);
 
-	idle->has_cpu = 1; /* we schedule the first task manually */
-	idle->thread.eip = (unsigned long) start_secondary;
-
 	del_from_runqueue(idle);
 	unhash_process(idle);
+
+	idle->has_cpu = 1; /* we schedule the first task manually */
+	idle->thread.eip = (unsigned long) start_secondary;
+	idle->processor = cpu;
 	init_tasks[cpu] = idle;
 
 	/* start_eip had better be page-aligned! */
@@ -830,7 +829,7 @@
 		/* stash the current NMI vector, so we can put things back */
 		nmi_high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
 		nmi_low = *((volatile unsigned short *) TRAMPOLINE_LOW);
-	} 
+	}
 
 	CMOS_WRITE(0xa, 0xf);
 	local_flush_tlb();
diff -Nru linux-2.4.13.vanilla/include/linux/sched.h linux-2.4.13.xsched/include/linux/sched.h
--- linux-2.4.13.vanilla/include/linux/sched.h	Tue Oct 23 21:59:06 2001
+++ linux-2.4.13.xsched/include/linux/sched.h	Mon Oct 29 12:19:46 2001
@@ -15,6 +15,7 @@
 #include <linux/rbtree.h>
 
 #include <asm/system.h>
+#include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -72,7 +73,10 @@
 #define CT_TO_SECS(x)	((x) / HZ)
 #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
 
-extern int nr_running, nr_threads;
+#define nr_running	atomic_read(&gnr_running)
+
+extern atomic_t gnr_running;
+extern int nr_threads;
 extern int last_pid;
 
 #include <linux/fs.h>
@@ -139,7 +143,6 @@
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
 extern spinlock_t mmlist_lock;
 
 extern void sched_init(void);
@@ -312,6 +315,7 @@
 	 */
 	struct list_head run_list;
 	unsigned long sleep_time;
+	unsigned long cpu_jtime, sched_jtime;
 
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
@@ -332,9 +336,9 @@
 	pid_t tgid;
 	/* boolean value for session group leader */
 	int leader;
-	/* 
+	/*
 	 * pointers to (original) parent process, youngest child, younger sibling,
-	 * older sibling, respectively.  (p->father can be replaced with 
+	 * older sibling, respectively.  (p->father can be replaced with
 	 * p->p_pptr->pid)
 	 */
 	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
@@ -393,12 +397,15 @@
 	int (*notifier)(void *priv);
 	void *notifier_data;
 	sigset_t *notifier_mask;
-	
+
 /* Thread group tracking */
    	u32 parent_exec_id;
    	u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty */
 	spinlock_t alloc_lock;
+/* a better place for these two fields must be found */
+	int move_to_cpu;
+	struct list_head proclist_cpu;
 };
 
 /*
@@ -485,7 +492,10 @@
     sig:		&init_signals,					\
     pending:		{ NULL, &tsk.pending.head, {{0}}},		\
     blocked:		{{0}},						\
-    alloc_lock:		SPIN_LOCK_UNLOCKED				\
+    alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+    cpu_jtime:		0,						\
+    move_to_cpu:	0,						\
+    proclist_cpu:	LIST_HEAD_INIT(tsk.proclist_cpu),			\
 }
 
 
@@ -765,6 +775,20 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+extern void del_from_runqueue(struct task_struct * p);
+extern void add_to_proclist(struct task_struct * p);
+extern void del_from_proclist(struct task_struct * p);
+extern int move_to_cpu(struct task_struct * p, int cpu, int stick);
+extern int get_best_cpu(void);
+extern int runqueue_balance(int mode);
+extern void runqueue_spin_lock(struct task_struct * p);
+extern void runqueue_spin_unlock(struct task_struct * p);
+
+
+#define IDLE_RQBALANCE	0
+
+
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	wait_queue_t __wait;						\
@@ -808,7 +832,7 @@
 	current->state = TASK_RUNNING;					\
 	remove_wait_queue(&wq, &__wait);				\
 } while (0)
-	
+
 #define wait_event_interruptible(wq, condition)				\
 ({									\
 	int __ret = 0;							\
@@ -818,6 +842,7 @@
 })
 
 #define REMOVE_LINKS(p) do { \
+	del_from_proclist(p); \
 	(p)->next_task->prev_task = (p)->prev_task; \
 	(p)->prev_task->next_task = (p)->next_task; \
 	if ((p)->p_osptr) \
@@ -829,6 +854,7 @@
 	} while (0)
 
 #define SET_LINKS(p) do { \
+	add_to_proclist(p); \
 	(p)->next_task = &init_task; \
 	(p)->prev_task = init_task.prev_task; \
 	init_task.prev_task->next_task = (p); \
@@ -845,13 +871,6 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
 
 static inline int task_on_runqueue(struct task_struct *p)
 {
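
The sched.h hunk above only declares the new per-runqueue entry points; their
intended use is scattered across the kernel/signal.c and kernel/softirq.c hunks
further down.  A minimal sketch pulling those call sites together (not part of
the patch; example_usage() and target_cpu are hypothetical names):

	static void example_usage(struct task_struct *t, int target_cpu)
	{
		/* kick t's CPU while holding t's own runqueue lock,
		 * as kernel/signal.c does below */
		runqueue_spin_lock(t);
		if (t->has_cpu && t->processor != smp_processor_id())
			smp_send_reschedule(t->processor);
		runqueue_spin_unlock(t);

		/* ask for a lazy migration of the current task to target_cpu,
		 * pinned there (stick = 1); the move is performed inside
		 * schedule(), so spin until we actually run on target_cpu,
		 * as kernel/softirq.c does below */
		if (move_to_cpu(current, target_cpu, 1))
			while (smp_processor_id() != target_cpu)
				schedule();
	}
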
diff -Nru linux-2.4.13.vanilla/kernel/fork.c linux-2.4.13.xsched/kernel/fork.c
--- linux-2.4.13.vanilla/kernel/fork.c	Tue Oct 23 17:44:15 2001
+++ linux-2.4.13.xsched/kernel/fork.c	Thu Oct 25 10:42:59 2001
@@ -21,6 +21,7 @@
 #include <linux/completion.h>
 #include <linux/personality.h>
 
+#include <asm/atomic.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -28,7 +29,7 @@
 
 /* The idle threads do not count.. */
 int nr_threads;
-int nr_running;
+atomic_t gnr_running = ATOMIC_INIT(0);
 
 int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
@@ -598,7 +599,7 @@
 	 */
 	if (nr_threads >= max_threads)
 		goto bad_fork_cleanup_count;
-	
+
 	get_exec_domain(p->exec_domain);
 
 	if (p->binfmt && p->binfmt->module)
@@ -639,7 +640,7 @@
 	{
 		int i;
 		p->has_cpu = 0;
-		p->processor = current->processor;
+		p->processor = clone_flags & CLONE_PID ? current->processor : get_best_cpu();
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < smp_num_cpus; i++)
 			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
@@ -665,10 +666,10 @@
 	if (retval)
 		goto bad_fork_cleanup_mm;
 	p->semundo = NULL;
-	
+
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-	   
+
 	p->parent_exec_id = p->self_exec_id;
 
 	/* ok, now we should be set up.. */
@@ -687,6 +688,10 @@
 	if (!current->counter)
 		current->need_resched = 1;
 
+	p->cpu_jtime = 0;
+	p->sched_jtime = jiffies;
+	p->move_to_cpu = 0;
+
 	/*
 	 * Ok, add it to the run-queues and make it
 	 * visible to the rest of the system.
@@ -774,7 +779,7 @@
 		panic("Cannot create signal action SLAB cache");
 
 	files_cachep = kmem_cache_create("files_cache", 
-			 sizeof(struct files_struct), 0, 
+			 sizeof(struct files_struct), 0,
 			 SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if (!files_cachep) 
 		panic("Cannot create files SLAB cache");
diff -Nru linux-2.4.13.vanilla/kernel/ksyms.c linux-2.4.13.xsched/kernel/ksyms.c
--- linux-2.4.13.vanilla/kernel/ksyms.c	Wed Oct 17 14:32:50 2001
+++ linux-2.4.13.xsched/kernel/ksyms.c	Thu Oct 25 10:42:59 2001
@@ -443,7 +443,6 @@
 #endif
 
 EXPORT_SYMBOL(kstat);
-EXPORT_SYMBOL(nr_running);
 
 /* misc */
 EXPORT_SYMBOL(panic);
diff -Nru linux-2.4.13.vanilla/kernel/sched.c linux-2.4.13.xsched/kernel/sched.c
--- linux-2.4.13.vanilla/kernel/sched.c	Wed Oct 17 14:14:37 2001
+++ linux-2.4.13.xsched/kernel/sched.c	Sun Oct 28 19:59:59 2001
@@ -74,24 +74,19 @@
  *	Init task must be ok at boot for the ix86 as we will check its signals
  *	via the SMP irq return path.
  */
- 
+
 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
 
 /*
  * The tasklist_lock protects the linked list of processes.
  *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
- *
- * If both locks are to be concurrently held, the runqueue_lock
+ * If both locks are to be concurrently held, the runqueue_lock(cpu)
  * nests inside the tasklist_lock.
  *
  * task->alloc_lock nests inside tasklist_lock.
  */
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
 
 /*
  * We align per-CPU scheduling data on cacheline boundaries,
@@ -99,14 +94,36 @@
  */
 static union {
 	struct schedule_data {
+		atomic_t qnr_processes;
+		atomic_t qnr_running;
+		struct list_head proclist_head;
+		struct list_head runqueue_head;
+		spinlock_t runqueue_lock;
 		struct task_struct * curr;
-		cycles_t last_schedule;
 	} schedule_data;
 	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+} aligned_data [NR_CPUS] __cacheline_aligned;
+
+#ifdef CONFIG_SMP
 
 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define qnr_processes(cpu) aligned_data[(cpu)].schedule_data.qnr_processes
+#define qnr_running(cpu) aligned_data[(cpu)].schedule_data.qnr_running
+#define proclist_head(cpu) aligned_data[(cpu)].schedule_data.proclist_head
+#define runqueue_head(cpu) aligned_data[(cpu)].schedule_data.runqueue_head
+#define runqueue_lock(cpu) aligned_data[(cpu)].schedule_data.runqueue_lock
+
+#else	/* #ifdef CONFIG_SMP */
+
+#define cpu_curr(cpu) aligned_data[0].schedule_data.curr
+#define qnr_processes(cpu) aligned_data[0].schedule_data.qnr_processes
+#define qnr_running(cpu) aligned_data[0].schedule_data.qnr_running
+#define proclist_head(cpu) aligned_data[0].schedule_data.proclist_head
+#define runqueue_head(cpu) aligned_data[0].schedule_data.runqueue_head
+#define runqueue_lock(cpu) aligned_data[0].schedule_data.runqueue_lock
+
+#endif	/* #ifdef CONFIG_SMP */
+
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
@@ -124,8 +141,29 @@
 
 #endif
 
+#define rq_lock(p)	lock_task_rq(p)
+#define rq_unlock(p)	spin_unlock(&runqueue_lock((p)->processor))
+#define rq_lock_irq(p)	do { local_irq_disable(); lock_task_rq(p); } while (0)
+#define rq_unlock_irq(p)	do { spin_unlock(&runqueue_lock((p)->processor)); local_irq_enable(); } while (0)
+#define rq_lock_irqsave(p, f)	do { local_irq_save(f); lock_task_rq(p); } while (0)
+#define rq_unlock_irqrestore(p, f)	do { spin_unlock(&runqueue_lock((p)->processor)); local_irq_restore(f); } while (0)
+
+
+
 void scheduling_functions_start_here(void) { }
 
+static inline void lock_task_rq(struct task_struct *p)
+{
+	int cpu = p->processor;
+
+	spin_lock(&runqueue_lock(cpu));
+	while (p->processor != cpu) {
+		spin_unlock(&runqueue_lock(cpu));
+		cpu = p->processor;
+		spin_lock(&runqueue_lock(cpu));
+	}
+}
+
 /*
  * This is the function that decides how desirable a process is..
  * You can weigh different processes against each other depending
@@ -140,7 +178,7 @@
  *	 +1000: realtime process, select this.
  */
 
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+static inline int goodness(struct task_struct * p, struct mm_struct *this_mm)
 {
 	int weight;
 
@@ -167,13 +205,12 @@
 		weight = p->counter;
 		if (!weight)
 			goto out;
-			
-#ifdef CONFIG_SMP
-		/* Give a largish advantage to the same processor...   */
-		/* (this is equivalent to penalizing other processors) */
-		if (p->processor == this_cpu)
-			weight += PROC_CHANGE_PENALTY;
-#endif
+
+		/* add an advantage related to the history of this task on this cpu;
+		 * this tries to account for the cache footprint of p in this_cpu
+		 */
+		if (p->cpu_jtime > jiffies)
+			weight += p->cpu_jtime - jiffies;
 
 		/* .. and a slight advantage to the current MM */
 		if (p->mm == this_mm || !p->mm)
@@ -196,9 +233,9 @@
  * the 'goodness value' of replacing a process on a given CPU.
  * positive value means 'replace', zero or negative means 'dont'.
  */
-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p)
 {
-	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+	return goodness(p, prev->active_mm) - goodness(prev, prev->active_mm);
 }
 
 /*
@@ -211,92 +248,33 @@
 static void reschedule_idle(struct task_struct * p)
 {
 #ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
-
-	/*
-	 * shortcut if the woken up task's last CPU is
-	 * idle now.
-	 */
-	best_cpu = p->processor;
-	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
-send_now_idle:
-			/*
-			 * If need_resched == -1 then we can skip sending
-			 * the IPI altogether, tsk->need_resched is
-			 * actively watched by the idle thread.
-			 */
-			need_resched = tsk->need_resched;
-			tsk->need_resched = 1;
-			if ((best_cpu != this_cpu) && !need_resched)
-				smp_send_reschedule(best_cpu);
-			return;
-		}
-	}
+	int best_cpu = p->processor, this_cpu = smp_processor_id(), need_resched;
+	struct task_struct *tsk;
 
-	/*
-	 * We know that the preferred CPU has a cache-affine current
-	 * process, lets try to find a new idle CPU for the woken-up
-	 * process. Select the least recently active idle CPU. (that
-	 * one will have the least active cache context.) Also find
-	 * the executing process which has the least priority.
-	 */
-	oldest_idle = (cycles_t) -1;
-	target_tsk = NULL;
-	max_prio = 0;
-
-	for (i = 0; i < smp_num_cpus; i++) {
-		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
-			continue;
-		tsk = cpu_curr(cpu);
+	tsk = cpu_curr(best_cpu);
+	if (tsk == idle_task(best_cpu)) {
 		/*
-		 * We use the first available idle CPU. This creates
-		 * a priority list between idle CPUs, but this is not
-		 * a problem.
+		 * If need_resched == -1 then we can skip sending
+		 * the IPI altogether, tsk->need_resched is
+		 * actively watched by the idle thread.
 		 */
-		if (tsk == idle_task(cpu)) {
-			if (last_schedule(cpu) < oldest_idle) {
-				oldest_idle = last_schedule(cpu);
-				target_tsk = tsk;
-			}
-		} else {
-			if (oldest_idle == -1ULL) {
-				int prio = preemption_goodness(tsk, p, cpu);
-
-				if (prio > max_prio) {
-					max_prio = prio;
-					target_tsk = tsk;
-				}
-			}
-		}
-	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
-		}
+		need_resched = tsk->need_resched;
+		tsk->need_resched = 1;
+		if ((best_cpu != this_cpu) && !need_resched)
+			smp_send_reschedule(best_cpu);
+	} else if (tsk != p && preemption_goodness(tsk, p) > 0) {
 		tsk->need_resched = 1;
 		if (tsk->processor != this_cpu)
 			smp_send_reschedule(tsk->processor);
 	}
-	return;
-		
-
-#else /* UP */
+#else	/* #ifdef CONFIG_SMP */
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk;
 
 	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
+	if (preemption_goodness(tsk, p) > 0)
 		tsk->need_resched = 1;
-#endif
+#endif	/* #ifdef CONFIG_SMP */
 }
 
 /*
@@ -306,22 +284,172 @@
  * run-queue, not the end. See the comment about "This is
  * subtle" in the scheduler proper..
  */
-static inline void add_to_runqueue(struct task_struct * p)
+static inline void __add_to_runqueue(struct task_struct * p)
+{
+	list_add(&p->run_list, &runqueue_head(p->processor));
+	atomic_inc(&qnr_running(p->processor));
+	atomic_inc(&gnr_running);
+}
+
+static inline void __del_from_runqueue(struct task_struct * p)
+{
+	atomic_dec(&gnr_running);
+	atomic_dec(&qnr_running(p->processor));
+	p->sleep_time = jiffies;
+	list_del(&p->run_list);
+	p->run_list.next = NULL;
+}
+
+void del_from_runqueue(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__del_from_runqueue(p);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __add_to_proclist(struct task_struct * p)
+{
+	list_add(&p->proclist_cpu, &proclist_head(p->processor));
+	atomic_inc(&qnr_processes(p->processor));
+}
+
+void add_to_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__add_to_proclist(p);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __del_from_proclist(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	list_del(&p->proclist_cpu);
+	atomic_dec(&qnr_processes(p->processor));
+	p->proclist_cpu.next = NULL;
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+void del_from_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	__del_from_proclist(p);
+	rq_unlock_irqrestore(p, flags);
+}
+
+void runqueue_spin_lock(struct task_struct * p)
+{
+	rq_lock(p);
+}
+
+void runqueue_spin_unlock(struct task_struct * p)
+{
+	rq_unlock(p);
+}
+
+static inline void __move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, &runqueue_head(p->processor));
 }
 
-static inline void move_first_runqueue(struct task_struct * p)
+static inline void __move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &runqueue_head(p->processor));
+}
+
+int move_to_cpu(struct task_struct * p, int cpu, int stick)
+{
+	int res = 0;
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	if (p == idle_task(p->processor)) BUG();
+	if (p->processor != cpu) {
+		if (!p->move_to_cpu) {
+			p->move_to_cpu = stick ? -cpu - 1: cpu + 1;
+			res = 1;
+		}
+	} else {
+		if (stick)
+			p->cpus_allowed = (1 << cpu);
+		res = 1;
+	}
+	rq_unlock_irqrestore(p, flags);
+	return res;
+}
+
+/*
+ * try to find the best cpu on which to run a freshly created process; no locks
+ * are held during this function. it is called by do_fork() in SMP mode
+ */
+int get_best_cpu(void)
+{
+	int nr, best_cpu, this_cpu = smp_processor_id();
+	int min_nr_running, cpu_running, cpu_processes, min_nr_processes;
+
+	best_cpu = this_cpu;
+	min_nr_running = atomic_read(&qnr_running(this_cpu));
+	min_nr_processes = atomic_read(&qnr_processes(this_cpu));
+	for (nr = 0; nr < smp_num_cpus; nr++) {
+		if (nr == this_cpu) continue;
+		cpu_running = atomic_read(&qnr_running(nr));
+		if (cpu_running < min_nr_running) {
+			min_nr_running = cpu_running;
+			min_nr_processes = atomic_read(&qnr_processes(nr));
+			best_cpu = nr;
+		} else if (cpu_running == min_nr_running &&
+				(cpu_processes = atomic_read(&qnr_processes(nr))) < min_nr_processes) {
+			min_nr_processes = cpu_processes;
+			best_cpu = nr;
+		}
+	}
+	return best_cpu;
+}
+
+static inline int try_steal_task(int src_cpu, int dst_cpu)
+{
+	int res = 0;
+	unsigned long flags;
+	struct task_struct *tsk;
+	struct list_head *head, *tmp;
+
+	spin_lock_irqsave(&runqueue_lock(src_cpu), flags);
+	head = &runqueue_head(src_cpu);
+	list_for_each(tmp, head) {
+		tsk = list_entry(tmp, struct task_struct, run_list);
+		if (can_schedule(tsk, dst_cpu) && !tsk->move_to_cpu) {
+			tsk->move_to_cpu = dst_cpu + 1;
+			res = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&runqueue_lock(src_cpu), flags);
+	return res;
+}
+
+/*
+ * very basic balancing function that searches for the most loaded cpu and
+ * tries to steal a process from there; no locks are held during the cpu loop.
+ */
+int runqueue_balance(int mode)
+{
+	int nr, this_cpu = smp_processor_id(), max_nr_running = 0, max_cpu = 0;
+
+	for (nr = 0; nr < smp_num_cpus; nr++) {
+		if (nr == this_cpu) continue;
+		if (atomic_read(&qnr_running(nr)) > max_nr_running) {
+			max_nr_running = atomic_read(&qnr_running(nr));
+			max_cpu = nr;
+		}
+	}
+	if (max_nr_running > (atomic_read(&qnr_running(this_cpu)) + 1))
+		try_steal_task(max_cpu, this_cpu);
+	return 0;
 }
 
 /*
@@ -340,16 +468,16 @@
 	/*
 	 * We want the common case fall through straight, thus the goto.
 	 */
-	spin_lock_irqsave(&runqueue_lock, flags);
+	rq_lock_irqsave(p, flags);
 	p->state = TASK_RUNNING;
-	if (task_on_runqueue(p))
+	if (task_on_runqueue(p) || p->move_to_cpu)
 		goto out;
-	add_to_runqueue(p);
-	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
+	__add_to_runqueue(p);
+	if (!synchronous || p->processor != smp_processor_id())
 		reschedule_idle(p);
 	success = 1;
 out:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
+	rq_unlock_irqrestore(p, flags);
 	return success;
 }
 
@@ -382,7 +510,7 @@
  * delivered to the current task. In this case the remaining time
  * in jiffies will be returned, or 0 if the timer expired in time
  *
- * The current task state is guaranteed to be TASK_RUNNING when this 
+ * The current task state is guaranteed to be TASK_RUNNING when this
  * routine returns.
  *
  * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
@@ -475,41 +603,7 @@
 	task_lock(prev);
 	prev->has_cpu = 0;
 	mb();
-	if (prev->state == TASK_RUNNING)
-		goto needs_resched;
-
-out_unlock:
 	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
-	return;
-
-	/*
-	 * Slow path - we 'push' the previous process and
-	 * reschedule_idle() will attempt to find a new
-	 * processor for it. (but it might preempt the
-	 * current process as well.) We must take the runqueue
-	 * lock and re-check prev->state to be correct. It might
-	 * still happen that this process has a preemption
-	 * 'in progress' already - but this is not a problem and
-	 * might happen in other circumstances as well.
-	 */
-needs_resched:
-	{
-		unsigned long flags;
-
-		/*
-		 * Avoid taking the runqueue lock in cases where
-		 * no preemption-check is necessery:
-		 */
-		if ((prev == idle_task(smp_processor_id())) ||
-						(policy & SCHED_YIELD))
-			goto out_unlock;
-
-		spin_lock_irqsave(&runqueue_lock, flags);
-		if ((prev->state == TASK_RUNNING) && !prev->has_cpu)
-			reschedule_idle(prev);
-		spin_unlock_irqrestore(&runqueue_lock, flags);
-		goto out_unlock;
-	}
 #else
 	prev->policy &= ~SCHED_YIELD;
 #endif /* CONFIG_SMP */
@@ -534,11 +628,10 @@
 {
 	struct schedule_data * sched_data;
 	struct task_struct *prev, *next, *p;
-	struct list_head *tmp;
+	struct list_head *head, *tmp;
 	int this_cpu, c;
 
-
-	spin_lock_prefetch(&runqueue_lock);
+	spin_lock_prefetch(&runqueue_lock(current->processor));
 
 	if (!current->active_mm) BUG();
 need_resched_back:
@@ -556,7 +649,7 @@
 	 */
 	sched_data = & aligned_data[this_cpu].schedule_data;
 
-	spin_lock_irq(&runqueue_lock);
+	spin_lock_irq(&runqueue_lock(this_cpu));
 
 	/* move an exhausted RR process to be last.. */
 	if (prev->policy == SCHED_RR)
@@ -570,10 +663,17 @@
 				break;
 			}
 		default:
-			del_from_runqueue(prev);
+			__del_from_runqueue(prev);
 		case TASK_RUNNING:;
 	}
 	prev->need_resched = 0;
+	/* we certainly do not want to do this to the idle task */
+	if (prev != idle_task(this_cpu)) {
+		/* this saves the cpu time credit that has not been consumed since the previous preemption */
+		prev->cpu_jtime = prev->cpu_jtime > prev->sched_jtime ? (prev->cpu_jtime - prev->sched_jtime) >> 1: 0;
+		/* recalculate the cpu time */
+		prev->cpu_jtime += (jiffies - prev->sched_jtime) + jiffies;
+	}
 
 	/*
 	 * this is the scheduler proper:
@@ -589,10 +689,11 @@
 		goto still_running;
 
 still_running_back:
-	list_for_each(tmp, &runqueue_head) {
+	head = &runqueue_head(this_cpu);
+	list_for_each(tmp, head) {
 		p = list_entry(tmp, struct task_struct, run_list);
 		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
+			int weight = goodness(p, prev->active_mm);
 			if (weight > c)
 				c = weight, next = p;
 		}
@@ -601,6 +702,12 @@
 	/* Do we need to re-calculate counters? */
 	if (!c)
 		goto recalculate;
+
+#ifdef CONFIG_SMP
+	if (next->move_to_cpu)
+		goto cpu_migrate;
+cpu_migrate_back:
+#endif	/* #ifdef CONFIG_SMP */
 	/*
 	 * from this point on nothing can prevent us from
 	 * switching to the next task, save this fact in
@@ -609,9 +716,9 @@
 	sched_data->curr = next;
 #ifdef CONFIG_SMP
  	next->has_cpu = 1;
-	next->processor = this_cpu;
-#endif
-	spin_unlock_irq(&runqueue_lock);
+#endif	/* #ifdef CONFIG_SMP */
+	next->sched_jtime = jiffies;
+	spin_unlock_irq(&runqueue_lock(this_cpu));
 
 	if (prev == next) {
 		/* We won't go through the normal tail, so do this by hand */
@@ -619,24 +726,6 @@
 		goto same_process;
 	}
 
-#ifdef CONFIG_SMP
- 	/*
- 	 * maintain the per-process 'last schedule' value.
- 	 * (this has to be recalculated even if we reschedule to
- 	 * the same process) Currently this is only used on SMP,
-	 * and it's approximate, so we do not have to maintain
-	 * it while holding the runqueue spinlock.
- 	 */
- 	sched_data->last_schedule = get_cycles();
-
-	/*
-	 * We drop the scheduler lock early (it's a global spinlock),
-	 * thus we have to lock the previous process from getting
-	 * rescheduled during switch_to().
-	 */
-
-#endif /* CONFIG_SMP */
-
 	kstat.context_swtch++;
 	/*
 	 * there are 3 processes which are affected by a context switch:
@@ -683,30 +772,71 @@
 
 recalculate:
 	{
-		struct task_struct *p;
-		spin_unlock_irq(&runqueue_lock);
+		spin_unlock_irq(&runqueue_lock(this_cpu));
 		read_lock(&tasklist_lock);
-		for_each_task(p)
+		head = &proclist_head(this_cpu);
+		list_for_each(tmp, head) {
+			p = list_entry(tmp, struct task_struct, proclist_cpu);
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+		}
 		read_unlock(&tasklist_lock);
-		spin_lock_irq(&runqueue_lock);
+		spin_lock_irq(&runqueue_lock(this_cpu));
 	}
 	goto repeat_schedule;
 
 still_running:
 	if (!(prev->cpus_allowed & (1UL << this_cpu)))
 		goto still_running_back;
-	c = goodness(prev, this_cpu, prev->active_mm);
+	c = goodness(prev, prev->active_mm);
 	next = prev;
 	goto still_running_back;
 
 move_rr_last:
 	if (!prev->counter) {
 		prev->counter = NICE_TO_TICKS(prev->nice);
-		move_last_runqueue(prev);
+		__move_last_runqueue(prev);
 	}
 	goto move_rr_back;
 
+#ifdef CONFIG_SMP
+cpu_migrate:
+	{
+		int move_cpu, next_cpu, stick;
+
+		if (next == prev) {
+			next = idle_task(this_cpu);
+			next->need_resched = 1;
+			goto cpu_migrate_back;
+		}
+		if (next->move_to_cpu > 0)
+			move_cpu = next->move_to_cpu - 1, stick = 0;
+		else
+			move_cpu = -next->move_to_cpu - 1, stick = 1;
+		__del_from_runqueue(next);
+		spin_unlock_irq(&runqueue_lock(this_cpu));
+
+		write_lock_irq(&tasklist_lock);
+		lock_task_rq(next);
+		__del_from_proclist(next);
+		next_cpu = next->processor;
+		next->processor = move_cpu;
+		if (stick)
+			next->cpus_allowed = (1 << move_cpu);
+		spin_unlock(&runqueue_lock(next_cpu));
+
+		spin_lock(&runqueue_lock(move_cpu));
+		__add_to_proclist(next);
+		__add_to_runqueue(next);
+		next->move_to_cpu = 0;
+		reschedule_idle(next);
+		spin_unlock(&runqueue_lock(move_cpu));
+		write_unlock_irq(&tasklist_lock);
+
+		spin_lock_irq(&runqueue_lock(this_cpu));
+	}
+	goto repeat_schedule;
+#endif	/* #ifdef CONFIG_SMP */
+
 scheduling_in_interrupt:
 	printk("Scheduling in interrupt\n");
 	BUG();
@@ -730,7 +860,7 @@
 
 	CHECK_MAGIC_WQHEAD(q);
 	WQ_CHECK_LIST_HEAD(&q->task_list);
-	
+
 	list_for_each(tmp,&q->task_list) {
 		unsigned int state;
                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
@@ -849,7 +979,7 @@
 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
-	
+
 	current->state = TASK_UNINTERRUPTIBLE;
 
 	SLEEP_ON_HEAD
@@ -907,7 +1037,7 @@
 	return tsk;
 }
 
-static int setscheduler(pid_t pid, int policy, 
+static int setscheduler(pid_t pid, int policy,
 			struct sched_param *param)
 {
 	struct sched_param lp;
@@ -926,14 +1056,14 @@
 	 * We play safe to avoid deadlocks.
 	 */
 	read_lock_irq(&tasklist_lock);
-	spin_lock(&runqueue_lock);
 
 	p = find_process_by_pid(pid);
 
 	retval = -ESRCH;
 	if (!p)
-		goto out_unlock;
-			
+		goto out_unlock_tkll;
+
+	rq_lock(p);
 	if (policy < 0)
 		policy = p->policy;
 	else {
@@ -942,7 +1072,7 @@
 				policy != SCHED_OTHER)
 			goto out_unlock;
 	}
-	
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 	 * priority for SCHED_OTHER is 0.
@@ -954,7 +1084,7 @@
 		goto out_unlock;
 
 	retval = -EPERM;
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) && 
+	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
@@ -965,19 +1095,20 @@
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
 	if (task_on_runqueue(p))
-		move_first_runqueue(p);
+		__move_first_runqueue(p);
 
 	current->need_resched = 1;
 
 out_unlock:
-	spin_unlock(&runqueue_lock);
+	rq_unlock(p);
+out_unlock_tkll:
 	read_unlock_irq(&tasklist_lock);
 
 out_nounlock:
 	return retval;
 }
 
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 				      struct sched_param *param)
 {
 	return setscheduler(pid, policy, param);
@@ -1042,29 +1173,13 @@
 asmlinkage long sys_sched_yield(void)
 {
 	/*
-	 * Trick. sched_yield() first counts the number of truly 
+	 * Trick. sched_yield() first counts the number of truly
 	 * 'pending' runnable processes, then returns if it's
 	 * only the current processes. (This test does not have
 	 * to be atomic.) In threaded applications this optimization
 	 * gets triggered quite often.
 	 */
-
-	int nr_pending = nr_running;
-
-#if CONFIG_SMP
-	int i;
-
-	// Subtract non-idle processes running on other CPUs.
-	for (i = 0; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
-			nr_pending--;
-	}
-#else
-	// on UP this process is on the runqueue as well
-	nr_pending--;
-#endif
-	if (nr_pending) {
+	if (atomic_read(&qnr_running(current->processor)) > 1) {
 		/*
 		 * This process can only be rescheduled by us,
 		 * so this is safe without any locking.
@@ -1259,7 +1374,7 @@
 
 	/* We also take the runqueue_lock while altering task fields
 	 * which affect scheduling decisions */
-	spin_lock(&runqueue_lock);
+	rq_lock(this_task);
 
 	this_task->ptrace = 0;
 	this_task->nice = DEF_NICE;
@@ -1274,7 +1389,7 @@
 	memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
 	this_task->user = INIT_USER;
 
-	spin_unlock(&runqueue_lock);
+	rq_unlock(this_task);
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -1320,10 +1435,11 @@
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
 			smp_processor_id(), current->pid);
-		del_from_runqueue(current);
+		__del_from_runqueue(current);
 	}
+	current->cpu_jtime = 0;
+	current->sched_jtime = jiffies;
 	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
 	clear_bit(current->processor, &wait_init_idle);
 }
 
@@ -1335,8 +1451,16 @@
 	 * We have to do a little magic to get the first
 	 * process right in SMP mode.
 	 */
-	int cpu = smp_processor_id();
-	int nr;
+	int nr, cpu = smp_processor_id();
+
+	for (nr = 0; nr < NR_CPUS; nr++) {
+		atomic_set(&qnr_processes(nr), 0);
+		atomic_set(&qnr_running(nr), 0);
+		cpu_curr(nr) = &init_task;
+		INIT_LIST_HEAD(&runqueue_head(nr));
+		INIT_LIST_HEAD(&proclist_head(nr));
+		runqueue_lock(nr) = SPIN_LOCK_UNLOCKED;
+	}
 
 	init_task.processor = cpu;
 
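
A worked example of the cpu_jtime bookkeeping that schedule() and goodness()
now share (a reader's note, not part of the patch; the jiffies values are made
up):

	Task p is switched in at jiffies = 1000 (sched_jtime = 1000, cpu_jtime = 0)
	and preempted at jiffies = 1010:

		cpu_jtime = 0;				(no unconsumed credit)
		cpu_jtime += (1010 - 1000) + 1010;	(cpu_jtime = 1020)

	While p waits, goodness() grants it cpu_jtime - jiffies extra weight on
	this CPU: 10 points at jiffies 1010, 6 at 1014, nothing once jiffies
	reaches 1020.  If p runs again from jiffies 1016 to 1020:

		cpu_jtime = (1020 - 1016) >> 1;		(cpu_jtime = 2)
		cpu_jtime += (1020 - 1016) + 1020;	(cpu_jtime = 1026)

	so the cache-affinity bonus tracks the CPU time recently consumed on this
	runqueue and decays both at every reschedule and as wall time passes.
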
diff -Nru linux-2.4.13.vanilla/kernel/signal.c linux-2.4.13.xsched/kernel/signal.c
--- linux-2.4.13.vanilla/kernel/signal.c	Mon Sep 17 16:40:01 2001
+++ linux-2.4.13.xsched/kernel/signal.c	Thu Oct 25 10:42:59 2001
@@ -478,10 +478,10 @@
 	 * process of changing - but no harm is done by that
 	 * other than doing an extra (lightweight) IPI interrupt.
 	 */
-	spin_lock(&runqueue_lock);
+	runqueue_spin_lock(t);
 	if (t->has_cpu && t->processor != smp_processor_id())
 		smp_send_reschedule(t->processor);
-	spin_unlock(&runqueue_lock);
+	runqueue_spin_unlock(t);
 #endif /* CONFIG_SMP */
 
 	if (t->state & TASK_INTERRUPTIBLE) {
diff -Nru linux-2.4.13.vanilla/kernel/softirq.c linux-2.4.13.xsched/kernel/softirq.c
--- linux-2.4.13.vanilla/kernel/softirq.c	Sat Sep  8 12:02:32 2001
+++ linux-2.4.13.xsched/kernel/softirq.c	Thu Oct 25 10:42:59 2001
@@ -369,7 +369,7 @@
 	sigfillset(&current->blocked);
 
 	/* Migrate to the right CPU */
-	current->cpus_allowed = 1UL << cpu;
+	if (!move_to_cpu(current, cpu, 1)) BUG();
 	while (smp_processor_id() != cpu)
 		schedule();
 
