diff -Nru linux-2.5.2-pre9.vanilla/Makefile linux-2.5.2-pre9.xs2/Makefile --- linux-2.5.2-pre9.vanilla/Makefile Sat Jan 5 19:46:25 2002 +++ linux-2.5.2-pre9.xs2/Makefile Sun Jan 6 15:04:22 2002 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 2 -EXTRAVERSION =-pre9 +EXTRAVERSION = -pre9-xs2 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nru linux-2.5.2-pre9.vanilla/arch/i386/kernel/process.c linux-2.5.2-pre9.xs2/arch/i386/kernel/process.c --- linux-2.5.2-pre9.vanilla/arch/i386/kernel/process.c Sat Jan 5 19:46:25 2002 +++ linux-2.5.2-pre9.xs2/arch/i386/kernel/process.c Sun Jan 6 15:04:05 2002 @@ -122,10 +122,6 @@ */ void cpu_idle (void) { - /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - while (1) { void (*idle)(void) = pm_idle; if (!idle) diff -Nru linux-2.5.2-pre9.vanilla/arch/i386/kernel/smpboot.c linux-2.5.2-pre9.xs2/arch/i386/kernel/smpboot.c --- linux-2.5.2-pre9.vanilla/arch/i386/kernel/smpboot.c Sat Jan 5 19:46:25 2002 +++ linux-2.5.2-pre9.xs2/arch/i386/kernel/smpboot.c Sun Jan 6 15:04:05 2002 @@ -471,6 +471,7 @@ */ local_flush_tlb(); + init_idle(); return cpu_idle(); } @@ -803,15 +804,13 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ - map_cpu_to_boot_apicid(cpu, apicid); - idle->thread.eip = (unsigned long) start_secondary; - del_from_runqueue(idle); unhash_process(idle); + idle->processor = cpu; + idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + idle->thread.eip = (unsigned long) start_secondary; init_tasks[cpu] = idle; /* start_eip had better be page-aligned! 
*/ diff -Nru linux-2.5.2-pre9.vanilla/include/linux/sched.h linux-2.5.2-pre9.xs2/include/linux/sched.h --- linux-2.5.2-pre9.vanilla/include/linux/sched.h Sat Jan 5 19:46:31 2002 +++ linux-2.5.2-pre9.xs2/include/linux/sched.h Sun Jan 6 15:13:35 2002 @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,11 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_task_running(void); + +#define nr_running nr_task_running() + +extern int nr_threads; extern int last_pid; #include @@ -121,6 +126,7 @@ * yield the CPU for one re-schedule.. */ #define SCHED_YIELD 0x10 +#define SCHED_RTLOCAL 0x20 struct sched_param { int sched_priority; @@ -139,9 +145,9 @@ * a separate lock). */ extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; +extern void sched_cpudmap_init(void); extern void sched_init(void); extern void init_idle(void); extern void show_state(void); @@ -321,9 +327,11 @@ * that's just fine.) 
*/ struct list_head run_list; + int task_qid; long time_slice; /* recalculation loop checkpoint */ unsigned long rcl_last; + unsigned long run_jtime; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; @@ -407,6 +415,9 @@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; + +/* per cpu proc list */ + struct list_head proclist_cpu; /* Thread group tracking */ u32 parent_exec_id; @@ -482,11 +493,13 @@ active_mm: &init_mm, \ cpus_runnable: -1, \ cpus_allowed: -1, \ + processor: 0, \ run_list: { NULL, NULL }, \ rcl_last: 0, \ time_slice: DEF_TSLICE, \ next_task: &tsk, \ prev_task: &tsk, \ + pid: 0, \ p_opptr: &tsk, \ p_pptr: &tsk, \ thread_group: LIST_HEAD_INIT(tsk.thread_group), \ @@ -510,6 +523,9 @@ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ + task_qid: 0, \ + proclist_cpu: LIST_HEAD_INIT(tsk.proclist_cpu), \ + run_jtime: 0, \ } @@ -802,6 +818,16 @@ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void del_from_runqueue(struct task_struct * p); +extern void add_to_proclist(struct task_struct * p); +extern void del_from_proclist(struct task_struct * p); +extern void sched_wake_idle(void); +extern int move_to_cpu(struct task_struct * p, int cpu, int stick); +extern int task_cpu_place(struct task_struct *p); +extern void runqueue_spin_lock(struct task_struct * p); +extern void runqueue_spin_unlock(struct task_struct * p); + + #define __wait_event(wq, condition) \ do { \ wait_queue_t __wait; \ @@ -855,6 +881,7 @@ }) #define REMOVE_LINKS(p) do { \ + del_from_proclist(p); \ (p)->next_task->prev_task = (p)->prev_task; \ (p)->prev_task->next_task = (p)->next_task; \ if ((p)->p_osptr) \ @@ -866,6 +893,7 @@ } while (0) #define SET_LINKS(p) do { \ + add_to_proclist(p); \ (p)->next_task = &init_task; \ (p)->prev_task = init_task.prev_task; \ init_task.prev_task->next_task = 
(p); \ @@ -882,16 +910,19 @@ #define next_thread(p) \ list_entry((p)->thread_group.next, struct task_struct, thread_group) -static inline void del_from_runqueue(struct task_struct * p) +static inline int task_on_runqueue(struct task_struct *p) +{ + return (p->run_list.next != NULL); +} + +static inline int task_on_proclist(struct task_struct *p) { - nr_running--; - list_del(&p->run_list); - p->run_list.next = NULL; + return (p->proclist_cpu.next != NULL); } -static inline int task_on_runqueue(struct task_struct *p) +static inline int task_realtime(struct task_struct *p) { - return (p->run_list.next != NULL); + return ((p->policy & ~SCHED_YIELD) != SCHED_OTHER); } static inline void unhash_process(struct task_struct *p) diff -Nru linux-2.5.2-pre9.vanilla/init/main.c linux-2.5.2-pre9.xs2/init/main.c --- linux-2.5.2-pre9.vanilla/init/main.c Fri Dec 7 16:24:52 2001 +++ linux-2.5.2-pre9.xs2/init/main.c Sun Jan 6 15:04:05 2002 @@ -316,14 +316,6 @@ smp_threads_ready=1; smp_commence(); - - /* Wait for the other cpus to set up their idle processes */ - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); - while (wait_init_idle) { - cpu_relax(); - barrier(); - } - printk("All processors have done init_idle\n"); } #endif @@ -337,6 +329,7 @@ static void rest_init(void) { + init_idle(); kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); unlock_kernel(); current->need_resched = 1; @@ -427,6 +420,10 @@ * make syscalls (and thus be locked). */ smp_init(); + /* + * after smp initialization we can finally setup the cpu distance map + */ + sched_cpudmap_init(); rest_init(); } diff -Nru linux-2.5.2-pre9.vanilla/kernel/fork.c linux-2.5.2-pre9.xs2/kernel/fork.c --- linux-2.5.2-pre9.vanilla/kernel/fork.c Sat Jan 5 19:46:31 2002 +++ linux-2.5.2-pre9.xs2/kernel/fork.c Sun Jan 6 15:04:05 2002 @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -29,7 +30,6 @@ /* The idle threads do not count.. 
*/ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -620,6 +620,9 @@ p->run_list.next = NULL; p->run_list.prev = NULL; + p->proclist_cpu.next = NULL; + p->proclist_cpu.prev = NULL; + p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); p->vfork_done = NULL; @@ -645,7 +648,11 @@ { int i; p->cpus_runnable = ~0UL; - p->processor = current->processor; + /* + * if it's a real time task we leave it on the same processor/task_qid + */ + if (!task_realtime(p) && !(clone_flags & CLONE_PID)) + task_cpu_place(p); /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; diff -Nru linux-2.5.2-pre9.vanilla/kernel/ksyms.c linux-2.5.2-pre9.xs2/kernel/ksyms.c --- linux-2.5.2-pre9.vanilla/kernel/ksyms.c Sat Jan 5 19:46:31 2002 +++ linux-2.5.2-pre9.xs2/kernel/ksyms.c Sun Jan 6 15:04:05 2002 @@ -451,7 +451,6 @@ #endif EXPORT_SYMBOL(kstat); -EXPORT_SYMBOL(nr_running); /* misc */ EXPORT_SYMBOL(panic); diff -Nru linux-2.5.2-pre9.vanilla/kernel/sched.c linux-2.5.2-pre9.xs2/kernel/sched.c --- linux-2.5.2-pre9.vanilla/kernel/sched.c Sat Jan 5 19:46:31 2002 +++ linux-2.5.2-pre9.xs2/kernel/sched.c Mon Jan 7 11:03:24 2002 @@ -72,54 +72,159 @@ /* * The tasklist_lock protects the linked list of processes. * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. + * This is the lock order : + * 1) tasklist_lock + * 2) RT_QID + * 3) lock(0) + * ... + * M) lock(N) * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. + * This does not mean that if a lock(3) is needed we've to lock the + * whole chain down to lock(3) but it states that if RT_QID and lock(3) + * are needed, RT_QID must be locked first. 
+ * The lock patterns are tasklist_lock+lock(i) when the task is moved to + * a different runqueue, RT_QID+lock(i) inside reschedule_idle() of a global + * RT task ( only if the best CPU of the RT task is running another RT task ), + * RT_QID+lock(0)+..+lock(N) inside reschedule_idle() of a global RT task + * ( when all CPUs are running RT tasks ). + * No other patterns are used, ie: lock(i)+lock(j) * * task->alloc_lock nests inside tasklist_lock. */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ -static LIST_HEAD(runqueue_head); +/* + * this is the distance map ( move cost ) between cpus. + * the move cost from cpu I to cpu J is : cpus_dmap[I][J] + * this value can be seen as the number of milliseconds we can + * tolerate to have an idle cpu before grabbing a remote task + * to run on the idle cpu + */ +#define DEF_CPU_DIST_MS 10 +#define FAR_CPU_DIST_MS 20 +#define MS_TO_DIST(t) (((t) * HZ) / 1000) +#define DEF_CPU_DIST MS_TO_DIST(DEF_CPU_DIST_MS) +#define FAR_CPU_DIST MS_TO_DIST(FAR_CPU_DIST_MS) + +#define cpu_distance(i, j) ((unsigned int) cpus_dmap[i][j]) + +/* + * this is a bonus that we give to cpus that have previously run + * an affine mm struct.
the bonus value is in milliseconds + */ +#define MOVE_MM_BONUS_MS 20 +#define MOVE_MM_BONUS MS_TO_DIST(MOVE_MM_BONUS_MS) -static unsigned long rcl_curr; +/* + * this is the cpu distance map that should be compiled by the architecture + * dependent code or by the common code using a provided abstract topology + * interface + */ +unsigned char cpus_dmap[NR_CPUS][NR_CPUS]; + +/* + * this is the minimum run queue length that trigger balancing decisions + */ +int min_mov_rqlen = 2; + +/* + * this is the weight ( in milliseconds ) that a remote process has and is + * used together with the cpu distance ( metric ) map to build a uniform + * cost of move + */ +int mvtsk_cost = DEF_CPU_DIST_MS / 2 - 1; + +/* + * this is used for global real time tasks checkpointing. to avoid the global + * real time task selection every time that there's a global real time task + * running, this variable is incremented at every global real time task wakeup + * and when the first global real time task queue pickup fails for a cpu, its + * cpu-local variable is aligned to this one avoiding subsequent failing list + * lookup. + */ +static volatile unsigned long grt_chkp = 0; /* * We align per-CPU scheduling data on cacheline boundaries, * to prevent cacheline ping-pong. 
*/ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; +struct cpu_sched_data { + int qnr_processes; + int qnr_running; + struct list_head proclist_head; + struct list_head runqueue_head; + struct task_struct *curr; + unsigned long hit_cpus; + unsigned char ldhits[NR_CPUS]; + unsigned long rtt_chkp; + unsigned long rcl_curr; + spinlock_t runqueue_lock ____cacheline_aligned; +}; + +static struct cpu_sched_data aligned_data[NR_CPUS + 1] __cacheline_aligned; + +#define RT_QID NR_CPUS +#define global_rttask(p) ((p)->task_qid == RT_QID) +#define task_foreign(p) (cpu_number_map((p)->processor) != (p)->task_qid) +#define cpu_next(cpu) (((cpu) + 1) < smp_num_cpus ? (cpu) + 1: 0) + +#define cpu_curr(cpu) aligned_data[(cpu)].curr +#define rcl_curr(cpu) aligned_data[(cpu)].rcl_curr +#define rtt_chkp(cpu) aligned_data[(cpu)].rtt_chkp +#define hit_cpus(cpu) aligned_data[(cpu)].hit_cpus +#define ldhits(cpu, i) aligned_data[(cpu)].ldhits[i] +#define qnr_processes(cpu) aligned_data[(cpu)].qnr_processes +#define qnr_running(cpu) aligned_data[(cpu)].qnr_running +#define proclist_head(cpu) aligned_data[(cpu)].proclist_head +#define runqueue_head(cpu) aligned_data[(cpu)].runqueue_head +#define runqueue_lock(cpu) aligned_data[(cpu)].runqueue_lock + + +#define rq_lock(p) lock_task_rq(p) +#define rq_unlock(p) spin_unlock(&runqueue_lock((p)->task_qid)) +#define rq_lock_irq(p) do { local_irq_disable(); lock_task_rq(p); } while (0) +#define rq_unlock_irq(p) do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_enable(); } while (0) +#define rq_lock_irqsave(p, f) do { local_irq_save(f); lock_task_rq(p); } while (0) +#define rq_unlock_irqrestore(p, f) do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_restore(f); } while (0) -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) 
aligned_data[(cpu)].schedule_data.last_schedule struct kernel_stat kstat; extern struct task_struct *child_reaper; + #ifdef CONFIG_SMP #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) +#define can_schedule(p, cpu) \ + ((p)->cpus_runnable & (p)->cpus_allowed & (1 << (cpu))) +#define can_move(p, cpu) \ + ((p)->cpus_runnable == ~0L && (p)->cpus_allowed & (1 << (cpu))) +#define run_allowed(p, cpu) ((p)->cpus_allowed & (1 << (cpu))) #else #define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) +#define can_schedule(p, cpu) (1) +#define can_move(p, cpu) (1) +#define run_allowed(p, cpu) (1) #endif + void scheduling_functions_start_here(void) { } +static inline void lock_task_rq(struct task_struct *p) +{ + int rqn = p->task_qid; + + spin_lock(&runqueue_lock(rqn)); + while (p->task_qid != rqn) { + spin_unlock(&runqueue_lock(rqn)); + rqn = p->task_qid; + spin_lock(&runqueue_lock(rqn)); + } +} + /* * This is the function that decides how desirable a process is.. * You can weigh different processes against each other depending @@ -134,7 +239,7 @@ * +1000: realtime process, select this. */ -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +static inline int goodness(struct task_struct * p, struct mm_struct *this_mm) { int weight; @@ -152,24 +257,13 @@ */ if (p->policy == SCHED_OTHER) { /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. + * if the task is expired return a zero goodness ... */ if (!p->time_slice) return 0; weight = p->dyn_prio + 1; -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif - /* .. 
and a slight advantage to the current MM */ if (p->mm == this_mm || !p->mm) weight += MM_AFFINITY_BONUS; @@ -187,158 +281,468 @@ return weight; } +static inline int rt_goodness(struct task_struct * p) +{ + return p->policy & SCHED_YIELD ? -1: 1000 + p->rt_priority; +} + /* * the 'goodness value' of replacing a process on a given CPU. * positive value means 'replace', zero or negative means 'dont'. */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p) +{ + return goodness(p, prev->active_mm) - goodness(prev, prev->active_mm); +} + +#ifdef CONFIG_SMP + +static inline void lock_queues(void) +{ + int cpu; + for (cpu = 0; cpu < smp_num_cpus; cpu++) + spin_lock(&runqueue_lock(cpu)); +} + +static inline void unlock_queues(void) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + int cpu; + for (cpu = smp_num_cpus - 1; cpu >= 0; cpu--) + spin_unlock(&runqueue_lock(cpu)); } /* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. + * this is used to try to find a place to run the global rt task. + * it's called with the RT_QID lock held and with local irq disabled. */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); - -static void reschedule_idle(struct task_struct * p) +static inline void rtt_reschedule_idle(struct task_struct * p) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; - - /* - * shortcut if the woken up task's last CPU is - * idle now. 
- */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ + int cpu, best_cpu = cpu_number_map(p->processor), + this_cpu = cpu_number_map(smp_processor_id()), need_resched, maxpg = 0, pg; + struct task_struct *tsk, *ttsk = NULL; + + /* + * if the best cpu for the global rt task is not currently running + * another rt task, that's the choice. + */ + if (can_schedule(p, cpu_logical_map(best_cpu))) { + spin_lock(&runqueue_lock(best_cpu)); + tsk = cpu_curr(best_cpu); + if (!task_realtime(tsk)) { need_resched = tsk->need_resched; tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); + if (best_cpu != this_cpu && + (!need_resched || tsk != idle_task(cpu_logical_map(best_cpu)))) + smp_send_reschedule(cpu_logical_map(best_cpu)); + spin_unlock(&runqueue_lock(best_cpu)); return; } + spin_unlock(&runqueue_lock(best_cpu)); } - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; - - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); - /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. - */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. 
- */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } + * the best cpu for the global rt task is running another rt task. + * instead of using preemption_goodness() to try to schedule on that cpu + * we try to find a cpu that is not running another rt task. + */ + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (can_schedule(p, cpu_logical_map(cpu))) { + spin_lock(&runqueue_lock(cpu)); + tsk = cpu_curr(cpu); + if (!task_realtime(tsk)) { + need_resched = tsk->need_resched; + tsk->need_resched = 1; + if (cpu != this_cpu && + (!need_resched || tsk != idle_task(cpu_logical_map(cpu)))) + smp_send_reschedule(cpu_logical_map(cpu)); + spin_unlock(&runqueue_lock(cpu)); + return; } + spin_unlock(&runqueue_lock(cpu)); } } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; + /* + * it's not our lucky day ..., all the cpus are running rt tasks and + * a preemption_goodness() loop is needed to ensure that the global + * priority is respected among rt tasks. 
+ */ + lock_queues(); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (can_schedule(p, cpu_logical_map(cpu))) { + tsk = cpu_curr(cpu); + if ((pg = preemption_goodness(tsk, p)) > maxpg) { + ttsk = tsk; + maxpg = pg; + if (tsk == idle_task(cpu_logical_map(cpu))) + break; + } } + } + if (ttsk) { + need_resched = ttsk->need_resched; + ttsk->need_resched = 1; + if (ttsk->processor != smp_processor_id() && !need_resched) + smp_send_reschedule(ttsk->processor); + } + unlock_queues(); +} + +static inline void std_reschedule_idle(struct task_struct * p) +{ + int best_cpu = p->task_qid, this_cpu = cpu_number_map(smp_processor_id()); + struct task_struct *tsk; + + tsk = cpu_curr(best_cpu); + if (tsk == idle_task(cpu_logical_map(best_cpu))) { + /* + * If need_resched == -1 then we can skip sending + * the IPI altogether, tsk->need_resched is + * actively watched by the idle thread. + */ + int need_resched = tsk->need_resched; + tsk->need_resched = 1; + if ((best_cpu != this_cpu) && !need_resched) + smp_send_reschedule(cpu_logical_map(best_cpu)); + } else if (tsk != p && preemption_goodness(tsk, p) > 0) { tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); + if (tsk->task_qid != this_cpu) + smp_send_reschedule(cpu_logical_map(tsk->task_qid)); } - return; +} +#endif /* #ifdef CONFIG_SMP */ + +/* + * This is ugly, but reschedule_idle() is very timing-critical. + * We are called with the runqueue spinlock held and we must + * not claim the tasklist_lock. + */ +static FASTCALL(void reschedule_idle(struct task_struct * p)); -#else /* UP */ - int this_cpu = smp_processor_id(); +static void reschedule_idle(struct task_struct * p) +{ +#ifdef CONFIG_SMP + /* + * it's better to fork the path here instead of having complex if()s + * inside the function itself. 
rt tasks really have different wakeup + * methods compared with local cpu ones + */ + if (!global_rttask(p)) + std_reschedule_idle(p); + else + rtt_reschedule_idle(p); + +#else /* #ifdef CONFIG_SMP */ struct task_struct *tsk; - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) + tsk = cpu_curr(smp_processor_id()); + if (preemption_goodness(tsk, p) > 0) tsk->need_resched = 1; -#endif +#endif /* #ifdef CONFIG_SMP */ +} + +int nr_task_running(void) +{ + int i, tsk_running = qnr_running(RT_QID); + + for (i = 0; i < smp_num_cpus; i++) + tsk_running += qnr_running(i); + return tsk_running; } /* - * Careful! - * - * This has to add the process to the _beginning_ of the - * run-queue, not the end. See the comment about "This is - * subtle" in the scheduler proper.. + * if it's a standard task its priority bonus is calculated and merged to + * its dynamic priority. for global real time tasks the checkpoint counter + * is incremented to force cpu's schedulers to try a global rt queue lookup */ -static inline void add_to_runqueue(struct task_struct * p) +static inline void __add_to_runqueue(struct task_struct * p, int task_qid) { - p->dyn_prio += rcl_curr - p->rcl_last; - p->rcl_last = rcl_curr; - if (p->dyn_prio > MAX_DYNPRIO) - p->dyn_prio = MAX_DYNPRIO; - list_add(&p->run_list, &runqueue_head); - nr_running++; + if (task_qid != RT_QID) { + p->dyn_prio += rcl_curr(task_qid) - p->rcl_last; + p->rcl_last = rcl_curr(task_qid); + if (p->dyn_prio > MAX_DYNPRIO) p->dyn_prio = MAX_DYNPRIO; + } else + grt_chkp++; + list_add(&p->run_list, &runqueue_head(task_qid)); + qnr_running(task_qid)++; } -static inline void move_last_runqueue(struct task_struct * p) +static inline void __del_from_runqueue(struct task_struct * p, int task_qid) { + qnr_running(task_qid)--; list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); + p->run_list.next = NULL; + p->rcl_last = rcl_curr(task_qid); +} + +void del_from_runqueue(struct task_struct * p) +{ + unsigned 
long flags; + + rq_lock_irqsave(p, flags); + __del_from_runqueue(p, p->task_qid); + rq_unlock_irqrestore(p, flags); +} + +static inline void __add_to_proclist(struct task_struct * p, int task_qid) +{ + list_add(&p->proclist_cpu, &proclist_head(task_qid)); + qnr_processes(task_qid)++; } -static inline void move_first_runqueue(struct task_struct * p) +void add_to_proclist(struct task_struct * p) +{ + unsigned long flags; + + rq_lock_irqsave(p, flags); + __add_to_proclist(p, p->task_qid); + rq_unlock_irqrestore(p, flags); +} + +static inline void __del_from_proclist(struct task_struct * p, int task_qid) +{ + list_del(&p->proclist_cpu); + qnr_processes(task_qid)--; + p->proclist_cpu.next = NULL; +} + +void del_from_proclist(struct task_struct * p) +{ + unsigned long flags; + + rq_lock_irqsave(p, flags); + __del_from_proclist(p, p->task_qid); + rq_unlock_irqrestore(p, flags); +} + +void runqueue_spin_lock(struct task_struct * p) +{ + rq_lock(p); +} + +void runqueue_spin_unlock(struct task_struct * p) +{ + rq_unlock(p); +} + +static inline void __move_last_runqueue(struct task_struct * p) +{ + list_del(&p->run_list); + list_add_tail(&p->run_list, &runqueue_head(p->task_qid)); +} + +static inline void __move_first_runqueue(struct task_struct * p) { list_del(&p->run_list); - list_add(&p->run_list, &runqueue_head); + list_add(&p->run_list, &runqueue_head(p->task_qid)); } /* + * move_to_rqn() must be called with 1) local irq disabled + * 2) tasklist_lock write-locked 3) task locked + */ +static int move_to_rqn(struct task_struct * p, int rqn, int stick) +{ + int task_cpu, onpslist = 0, onrqlist = 0; + unsigned long cpus_allowed; + + if (p->task_qid == rqn) { + if (stick) + p->cpus_allowed = (1 << cpu_logical_map(rqn)); + return rqn; + } + if (task_on_runqueue(p)) + __del_from_runqueue(p, p->task_qid), onrqlist++; + if (task_on_proclist(p)) + __del_from_proclist(p, p->task_qid), onpslist++; + cpus_allowed = stick ? 
(1 << cpu_logical_map(rqn)): p->cpus_allowed; + p->cpus_allowed = 0; + task_cpu = p->task_qid; + p->task_qid = rqn; + spin_unlock(&runqueue_lock(task_cpu)); + + rq_lock(p); + p->rcl_last = rcl_curr(rqn); + if (onpslist) + __add_to_proclist(p, p->task_qid); + if (onrqlist) + __add_to_runqueue(p, p->task_qid); + p->cpus_allowed = cpus_allowed; + return task_cpu; +} + +/* + * this is only called by softirq.c::ksoftirqd() and is used to place + * ksoftirqd tasks over different cpus. + */ +int move_to_cpu(struct task_struct * p, int cpu, int stick) +{ +#ifdef CONFIG_SMP + unsigned long flags; + + write_lock_irqsave(&tasklist_lock, flags); + rq_lock(p); + move_to_rqn(p, cpu_number_map(cpu), stick); + rq_unlock(p); + write_unlock_irqrestore(&tasklist_lock, flags); + return cpu; +#else /* #ifdef CONFIG_SMP */ + return 0; +#endif /* #ifdef CONFIG_SMP */ +} + +/* + * this function gets called inside kernel/timer.c when the timer + * tick hit the idle task. maybe architectures with huge HZ might + * want to not wake up the idle at every timer tick + */ +void sched_wake_idle(void) +{ + if (smp_num_cpus > 1) + current->need_resched = 1; +} + +#ifdef CONFIG_SMP + +/* + * the runtime cpu distance is the sum of the base cpu distance plus the + * load on the remote cpu + */ +static inline long rt_cpu_dist(int src_cpu, int dst_cpu) +{ + return (cpu_distance(src_cpu, dst_cpu) << 4) + + (qnr_running(src_cpu) * mvtsk_cost * (HZ << 4)) / 1000; +} + +/* + * try to find the best cpu to run a fresh new process, no locks are held + * during this function. 
it gets called by do_fork() in SMP mode + */ +int task_cpu_place(struct task_struct *p) +{ + int i, best_cpu, this_cpu = cpu_number_map(smp_processor_id()); + long cdist, min_cdist; + + best_cpu = this_cpu; + min_cdist = rt_cpu_dist(this_cpu, this_cpu); + for (i = 0; i < smp_num_cpus; i++) { + if (i == this_cpu || !run_allowed(p, cpu_logical_map(i))) continue; + if ((cdist = rt_cpu_dist(i, this_cpu)) < min_cdist) { + min_cdist = cdist; + best_cpu = i; + } + } + p->rcl_last = rcl_curr(best_cpu); + p->processor = cpu_logical_map(best_cpu); + p->task_qid = best_cpu; + return p->processor; +} + +static inline long move_goodness(struct task_struct *p, struct mm_struct *this_mm) +{ + long mgds = (long) (jiffies - p->run_jtime); + + if (p->mm == this_mm || !p->mm) + mgds += MOVE_MM_BONUS; + return mgds; +} + +static inline struct task_struct *try_steal_task(int src_cpu, int dst_cpu) +{ + int ldst_cpu = cpu_logical_map(dst_cpu); + long mgdns = -1, mvg; + struct mm_struct *this_mm = current->active_mm; + struct task_struct *tsk, *mvtsk = NULL; + struct list_head *head, *tmp; + + spin_lock_irq(&runqueue_lock(src_cpu)); + head = &runqueue_head(src_cpu); + list_for_each(tmp, head) { + tsk = list_entry(tmp, struct task_struct, run_list); + if (can_move(tsk, ldst_cpu) && !task_foreign(tsk) && + (mvg = move_goodness(tsk, this_mm)) > mgdns) { + mvtsk = tsk; + mgdns = mvg; + } + } + if (mvtsk) { + unsigned long cpus_allowed = mvtsk->cpus_allowed; + + mvtsk->cpus_allowed = 0; + __del_from_runqueue(mvtsk, src_cpu); + spin_unlock(&runqueue_lock(src_cpu)); + write_lock(&tasklist_lock); + spin_lock(&runqueue_lock(src_cpu)); + __del_from_proclist(mvtsk, src_cpu); + spin_unlock(&runqueue_lock(src_cpu)); + spin_lock(&runqueue_lock(dst_cpu)); + mvtsk->rcl_last = rcl_curr(dst_cpu); + __add_to_runqueue(mvtsk, dst_cpu); + __add_to_proclist(mvtsk, dst_cpu); + mvtsk->cpus_allowed = cpus_allowed; + mvtsk->task_qid = dst_cpu; + spin_unlock(&runqueue_lock(dst_cpu)); + 
write_unlock_irq(&tasklist_lock); + } else + spin_unlock_irq(&runqueue_lock(src_cpu)); + return mvtsk; +} + +/* + * the move cost is the difference from the cpu distance and the run queue + * load on the remote cpu. both terms are scaled by a factor 16 ( << 4 ) and + * the cost for each remote cpu task depend on mvtsk_cost + */ +static inline long move_cost(int src_cpu, int dst_cpu) +{ + return (cpu_distance(src_cpu, dst_cpu) << 4) - + (qnr_running(src_cpu) * mvtsk_cost * (HZ << 4)) / 1000; +} + +static inline struct task_struct *get_remote_task(int this_cpu) +{ + int i, max_cpu; + unsigned long hcpus = 0; + long ccost, min_cost; + struct task_struct *rtask; + + this_cpu = cpu_number_map(this_cpu); + for (i = 0; i < smp_num_cpus; i++) { + if (i == this_cpu) continue; + if (qnr_running(i) >= min_mov_rqlen) { + if (hit_cpus(this_cpu) & (1 << i)) + ldhits(this_cpu, i)++; + else { + hit_cpus(this_cpu) |= (1 << i); + ldhits(this_cpu, i) = 1; + } + if (ldhits(this_cpu, i) >= cpu_distance(this_cpu, i)) + hcpus |= (1 << i); + } else + hit_cpus(this_cpu) &= ~(1 << i); + } + while (hcpus) { + max_cpu = -1; + min_cost = 1000; + for (i = 0; i < smp_num_cpus; i++) { + if (!(hcpus & (1 << i))) continue; + if ((ccost = move_cost(i, this_cpu)) < min_cost) { + min_cost = ccost; + max_cpu = i; + } + } + if (max_cpu < 0) break; + if ((rtask = try_steal_task(max_cpu, this_cpu))) { + hit_cpus(this_cpu) = 0; + return rtask; + } + hcpus &= ~(1 << max_cpu); + } + return NULL; +} + +#endif /* #ifdef CONFIG_SMP */ + +/* * Wake up a process. Put it on the run-queue if it's not * already there. The "current" process is always on the * run-queue (except when the actual re-schedule is in @@ -354,16 +758,21 @@ /* * We want the common case fall through straight, thus the goto. 
*/ - spin_lock_irqsave(&runqueue_lock, flags); + rq_lock_irqsave(p, flags); p->state = TASK_RUNNING; - if (task_on_runqueue(p)) + /* + * cpus_allowed is cleared when a task is moving from one cpu + * to another and it is used to avoid to be hit while we're + * switching locks. + */ + if (task_on_runqueue(p) || !p->cpus_allowed) goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) + __add_to_runqueue(p, p->task_qid); + if (!synchronous || p->task_qid != cpu_number_map(smp_processor_id())) reschedule_idle(p); success = 1; out: - spin_unlock_irqrestore(&runqueue_lock, flags); + rq_unlock_irqrestore(p, flags); return success; } @@ -487,41 +896,7 @@ task_lock(prev); task_release_cpu(prev); mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; - -out_unlock: task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; - - /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. 
- */ -needs_resched: - { - unsigned long flags; - - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; - - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; - } #else prev->policy &= ~SCHED_YIELD; #endif /* CONFIG_SMP */ @@ -534,62 +909,83 @@ void expire_task(struct task_struct *p) { - if (unlikely(!p->time_slice)) - goto need_resched; - - if (!--p->time_slice) { + if (--p->time_slice <= 0) { if (p->dyn_prio) p->dyn_prio--; -need_resched: + p->time_slice = 0; p->need_resched = 1; } } -/* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". - * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. - */ -asmlinkage void schedule(void) +static inline void set_task_running(struct task_struct *p, int cpu) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; +#ifdef CONFIG_SMP + if (p != idle_task(cpu) && hit_cpus(cpu_number_map(cpu))) + hit_cpus(cpu_number_map(cpu)) = 0; + p->run_jtime = jiffies; +#endif /* #ifdef CONFIG_SMP */ + cpu_curr(cpu_number_map(cpu)) = p; + task_set_cpu(p, cpu); +} +static inline void switch_tasks(struct task_struct *prev, struct task_struct *next) +{ + /* + * there are 3 processes which are affected by a context switch: + * + * prev == .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). 
+ * This might sound slightly confusing but makes tons of sense. + */ + prepare_to_switch(); + { + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + if (!mm) { + if (next->active_mm) BUG(); + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, prev->processor); + } else { + if (next->active_mm != mm) BUG(); + switch_mm(oldmm, mm, next, prev->processor); + } + + if (!prev->mm) { + prev->active_mm = NULL; + mmdrop(oldmm); + } + } + + kstat.context_swtch++; + switch_to(prev, next, prev); + __schedule_tail(prev); +} - spin_lock_prefetch(&runqueue_lock); +asmlinkage void schedule(void) +{ + struct task_struct *prev, *next, *p; + struct list_head *head, *tmp; + int this_cpu, c, weight; - if (!current->active_mm) BUG(); need_resched_back: prev = current; this_cpu = prev->processor; - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); + if (unlikely(in_interrupt())) BUG(); - } release_kernel_lock(prev, this_cpu); - /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. - */ - sched_data = & aligned_data[this_cpu].schedule_data; - - spin_lock_irq(&runqueue_lock); + rq_lock_irq(prev); /* move an exhausted RR process to be last.. */ if (unlikely(prev->policy == SCHED_RR)) if (!prev->time_slice) { prev->time_slice = TASK_TIMESLICE(prev); - move_last_runqueue(prev); + __move_last_runqueue(prev); } switch (prev->state) { @@ -599,49 +995,55 @@ break; } default: - del_from_runqueue(prev); + if (task_on_runqueue(prev)) + __del_from_runqueue(prev, prev->task_qid); case TASK_RUNNING:; } prev->need_resched = 0; /* - * this is the scheduler proper: + * check global rt queue first without held locks and if it's not empty + * try to pickup the rt task first. despite to the new "unlikely" feature + * the code for rt task selection is kept out. 
*/ + if (grt_chkp != rtt_chkp(cpu_number_map(this_cpu)) && + !list_empty(&runqueue_head(RT_QID))) + goto rt_queue_select; -repeat_schedule: /* - * Default process to select.. + * this is true for running tasks moved with move_to_rqn() ( the first time + * they call schedule() ) and for global RT tasks. */ + if (unlikely(task_foreign(prev))) { + rq_unlock(prev); + spin_lock(&runqueue_lock(cpu_number_map(this_cpu))); + } + +repeat_schedule: next = idle_task(this_cpu); c = -1000; - list_for_each(tmp, &runqueue_head) { + head = &runqueue_head(cpu_number_map(this_cpu)); + list_for_each(tmp, head) { p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; - } + if ((weight = goodness(p, prev->active_mm)) > c) + c = weight, next = p; } - /* Do we need to re-calculate counters? */ + /* Do we need to re-calculate time slices? */ if (unlikely(!c)) { - rcl_curr++; - list_for_each(tmp, &runqueue_head) { + rcl_curr(cpu_number_map(this_cpu))++; + head = &runqueue_head(cpu_number_map(this_cpu)); + list_for_each(tmp, head) { p = list_entry(tmp, struct task_struct, run_list); p->time_slice = TASK_TIMESLICE(p); - p->rcl_last = rcl_curr; } goto repeat_schedule; } - /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. - */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); + set_task_running(next, this_cpu); + spin_unlock_irq(&runqueue_lock(cpu_number_map(this_cpu))); + +rt_task_selected: if (unlikely(prev == next)) { /* We won't go through the normal tail, so do this by hand */ @@ -649,66 +1051,59 @@ goto same_process; } -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. 
- * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. - */ - sched_data->last_schedule = get_cycles(); + switch_tasks(prev, next); - /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ +same_process: + reacquire_kernel_lock(current); -#endif /* CONFIG_SMP */ +#ifdef CONFIG_SMP + if (unlikely(current == idle_task(this_cpu))) + if (get_remote_task(this_cpu)) + goto need_resched_back; +#endif /* #ifdef CONFIG_SMP */ - kstat.context_swtch++; - /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. - */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - if (next->active_mm) BUG(); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - if (next->active_mm != mm) BUG(); - switch_mm(oldmm, mm, next, this_cpu); - } + if (current->need_resched) + goto need_resched_back; + return; - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); +rt_queue_select: + /* + * the fast lockless check reported that it might be a successful + * pickup inside the global rt queue, so we try here. this section + * is entered with "prev" locked. if the "prev" task qid is not RT_QID + * then it must be unlocked and RT_QID lock must be acquired. 
+ */ + if (!global_rttask(prev)) { + rq_unlock(prev); + spin_lock(&runqueue_lock(RT_QID)); + } + c = 0; + head = &runqueue_head(RT_QID); + list_for_each(tmp, head) { + p = list_entry(tmp, struct task_struct, run_list); + if (can_schedule(p, this_cpu)) { + if ((weight = rt_goodness(p)) > c) + c = weight, next = p; } } - + if (!c) { + /* + * the fast test reported a false positive so we go back to + * the local CPU runqueue selection. we also update the checkpoint + * to avoid future false lookups. + */ + rtt_chkp(cpu_number_map(this_cpu)) = grt_chkp; + spin_unlock(&runqueue_lock(RT_QID)); + spin_lock(&runqueue_lock(cpu_number_map(this_cpu))); + goto repeat_schedule; + } /* - * This just switches the register state and the - * stack. + * the global rt task has been selected and final setup is needed. */ - switch_to(prev, next, prev); - __schedule_tail(prev); + set_task_running(next, this_cpu); + spin_unlock_irq(&runqueue_lock(RT_QID)); + goto rt_task_selected; -same_process: - reacquire_kernel_lock(current); - if (current->need_resched) - goto need_resched_back; - return; } /* @@ -900,9 +1295,9 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param) { + int retval, grt, pgrt, rqn; struct sched_param lp; struct task_struct *p; - int retval; retval = -EINVAL; if (!param || pid < 0) @@ -914,22 +1309,31 @@ /* * We play safe to avoid deadlocks. + * It's possible that we need a write lock to move the task in/out the + * RT_QID run queue so instead of getting a read lock and having to + * release/writelock again, it's better to get directly the write one. 
*/ - read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); + write_lock_irq(&tasklist_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; + goto out_unlock_tkll; - if (policy < 0) + rq_lock(p); + if (policy < 0) { policy = p->policy; - else { + grt = pgrt = global_rttask(p); + } else { + grt = (policy & SCHED_RTLOCAL) == 0; + policy &= ~SCHED_RTLOCAL; + grt = grt && (policy == SCHED_FIFO || policy == SCHED_RR); + pgrt = global_rttask(p); + retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_OTHER) + policy != SCHED_OTHER) goto out_unlock; } @@ -954,14 +1358,28 @@ retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (task_on_runqueue(p)) - move_first_runqueue(p); + if (pgrt == grt) { + rqn = p->task_qid; + if (task_on_runqueue(p)) + __move_first_runqueue(p); + } else { + rqn = cpu_number_map(p->processor); + move_to_rqn(p, grt ? RT_QID: rqn, 0); + if (grt) grt_chkp++; + } - current->need_resched = 1; + if (grt || rqn == cpu_number_map(smp_processor_id())) + current->need_resched = 1; + else { +#ifdef CONFIG_SMP + smp_send_reschedule(cpu_logical_map(rqn)); +#endif /* #ifdef CONFIG_SMP */ + } out_unlock: - spin_unlock(&runqueue_lock); - read_unlock_irq(&tasklist_lock); + rq_unlock(p); +out_unlock_tkll: + write_unlock_irq(&tasklist_lock); out_nounlock: return retval; @@ -1031,41 +1449,18 @@ asmlinkage long sys_sched_yield(void) { - /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. - */ - - int nr_pending = nr_running; - -#if CONFIG_SMP - int i; - - // Subtract non-idle processes running on other CPUs. 
- for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; - } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { + if (qnr_running(current->task_qid) > 1) { /* - * This process can only be rescheduled by us, + * This process can only be rescheduled by us, * so this is safe without any locking. */ if (current->policy == SCHED_OTHER) current->policy |= SCHED_YIELD; - current->need_resched = 1; - + local_irq_disable(); current->time_slice = 0; - if (++current->dyn_prio > MAX_DYNPRIO) - current->dyn_prio = MAX_DYNPRIO; + if (++current->dyn_prio > MAX_DYNPRIO) current->dyn_prio = MAX_DYNPRIO; + local_irq_enable(); + current->need_resched = 1; } return 0; } @@ -1243,7 +1638,7 @@ /* We also take the runqueue_lock while altering task fields * which affect scheduling decisions */ - spin_lock(&runqueue_lock); + rq_lock(current); current->ptrace = 0; current->nice = DEF_NICE; @@ -1258,7 +1653,7 @@ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); current->user = INIT_USER; - spin_unlock(&runqueue_lock); + rq_unlock(current); write_unlock_irq(&tasklist_lock); } @@ -1298,18 +1693,24 @@ void __init init_idle(void) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; - if (current != &init_task && task_on_runqueue(current)) { printk("UGH! 
(%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); + smp_processor_id(), current->pid); + __del_from_runqueue(current, current->task_qid); } - current->dyn_prio = 0; - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); + current->task_qid = cpu_number_map(current->processor); + current->nice = 20; + current->dyn_prio = -100; + current->time_slice = 0; + cpu_curr(cpu_number_map(smp_processor_id())) = current; clear_bit(current->processor, &wait_init_idle); + + /* Wait for the other cpus to set up their idle processes */ + printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); + while (wait_init_idle) { + cpu_relax(); + barrier(); + } } extern void init_timervecs (void); @@ -1326,19 +1727,55 @@ } } +void __init sched_cpudmap_init(void) +{ + int i, j; + + /* + * this should use ( if provided ) a topology api to setup + * the distance map. right now it assigns the same distance to + * groups of 4 cpus + */ + for (i = 0; i < NR_CPUS; i++) + for (j = 0; j <= i; j++) + if ((i & ~0x03) == (j & ~0x03)) + cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST; + else + cpus_dmap[i][j] = cpus_dmap[j][i] = FAR_CPU_DIST; + +} + void __init sched_init(void) { /* * We have to do a little magic to get the first * process right in SMP mode.
*/ - int cpu = smp_processor_id(); - int nr; + int i, j, cpu = smp_processor_id(); + + for (i = 0; i <= NR_CPUS; i++) { + qnr_processes(i) = 0; + qnr_running(i) = 0; + cpu_curr(i) = &init_task; + rtt_chkp(i) = grt_chkp; + hit_cpus(i) = 0; + rcl_curr(i) = 0; + INIT_LIST_HEAD(&runqueue_head(i)); + INIT_LIST_HEAD(&proclist_head(i)); + runqueue_lock(i) = SPIN_LOCK_UNLOCKED; + } + + /* + * give a first initialization to the cpu distance map + */ + for (i = 0; i < NR_CPUS; i++) + for (j = 0; j <= i; j++) + cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST; init_task.processor = cpu; - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; + for(i = 0; i < PIDHASH_SZ; i++) + pidhash[i] = NULL; fill_tslice_map(); @@ -1354,3 +1791,4 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } + diff -Nru linux-2.5.2-pre9.vanilla/kernel/signal.c linux-2.5.2-pre9.xs2/kernel/signal.c --- linux-2.5.2-pre9.vanilla/kernel/signal.c Sat Jan 5 19:46:31 2002 +++ linux-2.5.2-pre9.xs2/kernel/signal.c Sun Jan 6 15:04:05 2002 @@ -478,10 +478,10 @@ * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. 
*/ - spin_lock(&runqueue_lock); + runqueue_spin_lock(t); if (task_has_cpu(t) && t->processor != smp_processor_id()) smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); + runqueue_spin_unlock(t); #endif /* CONFIG_SMP */ if (t->state & TASK_INTERRUPTIBLE) { diff -Nru linux-2.5.2-pre9.vanilla/kernel/softirq.c linux-2.5.2-pre9.xs2/kernel/softirq.c --- linux-2.5.2-pre9.vanilla/kernel/softirq.c Fri Nov 30 15:53:28 2001 +++ linux-2.5.2-pre9.xs2/kernel/softirq.c Sun Jan 6 15:04:05 2002 @@ -369,7 +369,7 @@ sigfillset(&current->blocked); /* Migrate to the right CPU */ - current->cpus_allowed = 1UL << cpu; + if (move_to_cpu(current, cpu, 1) < 0) BUG(); while (smp_processor_id() != cpu) schedule(); diff -Nru linux-2.5.2-pre9.vanilla/kernel/timer.c linux-2.5.2-pre9.xs2/kernel/timer.c --- linux-2.5.2-pre9.vanilla/kernel/timer.c Sat Jan 5 19:46:31 2002 +++ linux-2.5.2-pre9.xs2/kernel/timer.c Sun Jan 6 15:04:05 2002 @@ -589,8 +589,11 @@ else kstat.per_cpu_user[cpu] += user_tick; kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; + } else { + sched_wake_idle(); + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system; + } } /*