diff -Nru linux-2.5.1-pre11.vanilla/Makefile linux-2.5.1-pre11.lxs2/Makefile --- linux-2.5.1-pre11.vanilla/Makefile Thu Dec 13 11:05:02 2001 +++ linux-2.5.1-pre11.lxs2/Makefile Thu Dec 13 12:43:28 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 1 -EXTRAVERSION =-pre11 +EXTRAVERSION = -pre11-lxs2 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nru linux-2.5.1-pre11.vanilla/arch/i386/kernel/smpboot.c linux-2.5.1-pre11.lxs2/arch/i386/kernel/smpboot.c --- linux-2.5.1-pre11.vanilla/arch/i386/kernel/smpboot.c Wed Nov 21 10:35:48 2001 +++ linux-2.5.1-pre11.lxs2/arch/i386/kernel/smpboot.c Thu Dec 13 12:39:11 2001 @@ -799,15 +799,13 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ - map_cpu_to_boot_apicid(cpu, apicid); - idle->thread.eip = (unsigned long) start_secondary; - del_from_runqueue(idle); unhash_process(idle); + idle->processor = cpu; + idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + idle->thread.eip = (unsigned long) start_secondary; init_tasks[cpu] = idle; /* start_eip had better be page-aligned! */ diff -Nru linux-2.5.1-pre11.vanilla/drivers/char/Makefile linux-2.5.1-pre11.lxs2/drivers/char/Makefile --- linux-2.5.1-pre11.vanilla/drivers/char/Makefile Sun Nov 11 10:09:32 2001 +++ linux-2.5.1-pre11.lxs2/drivers/char/Makefile Thu Dec 13 12:40:23 2001 @@ -16,7 +16,7 @@ O_TARGET := char.o -obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o raw.o pty.o misc.o random.o +obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o raw.o pty.o misc.o random.o latsched.o # All of the (potential) objects that export symbols. # This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'. 
diff -Nru linux-2.5.1-pre11.vanilla/drivers/char/latsched.c linux-2.5.1-pre11.lxs2/drivers/char/latsched.c
--- linux-2.5.1-pre11.vanilla/drivers/char/latsched.c	Wed Dec 31 16:00:00 1969
+++ linux-2.5.1-pre11.lxs2/drivers/char/latsched.c	Thu Dec 13 12:52:02 2001
@@ -0,0 +1,212 @@
+/*
+ *  linux/drivers/char/latsched.c
+ *
+ *  Kernel scheduler latency tester
+ *
+ *  Copyright (C) 2001, Davide Libenzi
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+
+
+
+/*
+ * Debug tracing: while DEBUG is defined DPRINTK() always prints and
+ * DNPRINTK() prints only when its level n is <= DEBUG. DNPRINTK() is a
+ * do { } while (0) block so it is safe inside unbraced if/else bodies.
+ */
+#define DEBUG 0
+#ifdef DEBUG
+#define DPRINTK(x) printk x
+#define DNPRINTK(n,x) do { if ((n) <= DEBUG) printk x; } while (0)
+#else
+#define DPRINTK(x)
+#define DNPRINTK(n,x) do { } while (0)
+#endif
+
+
+
+/* per-open state, allocated in open_latsched() and stored in file->private_data */
+struct latsched {
+
+};
+
+
+
+static int open_latsched(struct inode *inode, struct file *file);
+static int close_latsched(struct inode *inode, struct file *file);
+static int ioctl_latsched(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg);
+
+
+static struct file_operations latsched_fops = {
+	ioctl:		ioctl_latsched,
+	open:		open_latsched,
+	release:	close_latsched
+};
+
+static struct miscdevice latsched = {
+	LATSCHED_MINOR, "latsched", &latsched_fops
+};
+
+
+
+
+
+
+
+/*
+ * open(): allocate the zeroed per-open state and attach it to the file.
+ * Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int open_latsched(struct inode *inode, struct file *file)
+{
+	struct latsched *ls;
+
+	if (!(ls = kmalloc(sizeof(*ls), GFP_KERNEL)))
+		return -ENOMEM;
+
+	memset(ls, 0, sizeof(*ls));
+
+	file->private_data = ls;
+
+	MOD_INC_USE_COUNT;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: open() ls=%p\n", current, ls));
+	return 0;
+}
+
+
+/* release(): free the per-open state allocated by open_latsched(). */
+static int close_latsched(struct inode *inode, struct file *file)
+{
+	struct latsched *ls = (struct latsched *) file->private_data;
+
+	kfree(ls);
+
+	MOD_DEC_USE_COUNT;
+
+	/* ls is only printed here, never dereferenced after kfree() */
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: close() ls=%p\n", current, ls));
+	return 0;
+}
+
+
+/*
+ * ioctl(): LS_START and LS_STOP call latsched_start() with 1 and 0,
+ * LS_SAMPLES passes arg to latsched_setsamples() and LS_FETCH hands a user
+ * supplied struct lsctl_getdata to latsched_getdata(), copying it back on
+ * success. Returns the helper's result, -EFAULT if a user copy faults and
+ * -EINVAL for an unknown command or an out of range LS_FETCH size.
+ */
+static int ioctl_latsched(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg)
+{
+	int res;
+	struct latsched *ls = (struct latsched *) file->private_data;
+	struct lsctl_getdata lsgd;
+
+	switch (cmd) {
+	case LS_START:
+		res = latsched_start(1);
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_START) == %d\n",
+			     current, ls, res));
+		return res;
+
+	case LS_STOP:
+		res = latsched_start(0);
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_STOP) == %d\n",
+			     current, ls, res));
+		return res;
+
+	case LS_FETCH:
+		if ((res = verify_area(VERIFY_WRITE, (void *) arg, sizeof(lsgd))))
+			return res;
+		/* the range check above does not rule out a fault during the copy */
+		if (__copy_from_user(&lsgd, (void *) arg, sizeof(lsgd)))
+			return -EFAULT;
+		/*
+		 * size comes from user space: reject negative values and values
+		 * whose byte count would wrap before it reaches verify_area()
+		 */
+		if (lsgd.size < 0 ||
+		    (unsigned long) lsgd.size > ~0UL / sizeof(struct latsched_sample))
+			return -EINVAL;
+		if ((res = verify_area(VERIFY_WRITE, (void *) lsgd.data,
+				       lsgd.size * sizeof(struct latsched_sample))))
+			return res;
+
+		if (!(res = latsched_getdata(&lsgd)) &&
+		    __copy_to_user((void *) arg, &lsgd, sizeof(lsgd)))
+			res = -EFAULT;
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_FETCH, %d) == %d\n",
+			     current, ls, lsgd.cpu, res));
+		return res;
+
+	case LS_SAMPLES:
+		res = latsched_setsamples((int) arg);
+
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_SAMPLES, %lu) == %d\n",
+			     current, ls, arg, res));
+		return res;
+	}
+
+	return -EINVAL;
+}
+
+
+
+
+/*
+ * Register the /dev/latsched misc device. Returns 0 on success or the
+ * error reported by misc_register().
+ */
+int __init init_latsched(void)
+{
+	int res;
+
+	if ((res = misc_register(&latsched))) {
+		printk(KERN_ERR "/dev/latsched: misc_register() failed (%d).\n", res);
+		return res;
+	}
+
+	printk(KERN_INFO "[%p] /dev/latsched: driver installed.\n", current);
+
+	return 0;
+}
+
+
+module_init(init_latsched);
+
diff -Nru linux-2.5.1-pre11.vanilla/drivers/net/slip.c linux-2.5.1-pre11.lxs2/drivers/net/slip.c
--- linux-2.5.1-pre11.vanilla/drivers/net/slip.c	Sun Sep 30 12:26:07 2001
+++ linux-2.5.1-pre11.lxs2/drivers/net/slip.c	Thu Dec 13 12:38:14 2001
@@ -1395,6 +1395,7 @@
 	do {
 		if (busy) {
 			current->counter = 0;
+			current->timer_ticks = 0;
 			schedule();
 		}
 
diff -Nru linux-2.5.1-pre11.vanilla/include/linux/latsched.h linux-2.5.1-pre11.lxs2/include/linux/latsched.h
--- 
linux-2.5.1-pre11.vanilla/include/linux/latsched.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.1-pre11.lxs2/include/linux/latsched.h Thu Dec 13 12:50:13 2001 @@ -0,0 +1,41 @@ +/* + * linux/include/linux/latsched.h + * + * Kernel scheduler latency tester + * + * Copyright (C) 2001, Davide Libenzi + * + */ + +#ifndef _LINUX_LATSCHED_H +#define _LINUX_LATSCHED_H + +#include + +#define LATSCHED_MINOR 117 +#define STD_LATSCHED_SAMPLES 1024 + +struct latsched_sample { + cycles_t lss_in, lss_out; + pid_t lss_pid; +}; +struct latsched_data { + struct latsched_sample *ls_data; + int ls_size; + int ls_curr; +}; +struct lsctl_getdata { + int cpu; + int size; + struct latsched_sample *data; + int rsize; +}; + +#define LS_START _IO('P', 1) +#define LS_STOP _IO('P', 2) +#define LS_FETCH _IOWR('P', 3, struct lsctl_getdata) +#define LS_SAMPLES _IOR('P', 4, int) + + +#endif /* #ifndef _LINUX_LATSCHED_H */ + diff -Nru linux-2.5.1-pre11.vanilla/include/linux/sched.h linux-2.5.1-pre11.lxs2/include/linux/sched.h --- linux-2.5.1-pre11.vanilla/include/linux/sched.h Thu Dec 13 14:28:40 2001 +++ linux-2.5.1-pre11.lxs2/include/linux/sched.h Thu Dec 13 12:50:13 2001 @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include struct exec_domain; @@ -71,7 +73,10 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +#define nr_running atomic_read(&gnr_running) + +extern atomic_t gnr_running; +extern int nr_threads; extern int last_pid; #include @@ -120,6 +125,7 @@ * yield the CPU for one re-schedule.. */ #define SCHED_YIELD 0x10 +#define SCHED_RTGLOBAL 0x20 struct sched_param { int sched_priority; @@ -138,9 +144,12 @@ * a separate lock). 
*/ extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; +extern void latsched_init(void); +extern int latsched_start(int on); +extern int latsched_setsamples(int nsamps); +extern int latsched_getdata(struct lsctl_getdata *lsgd); extern void sched_init(void); extern void init_idle(void); extern void show_state(void); @@ -318,7 +327,10 @@ * that's just fine.) */ struct list_head run_list; + int task_qid; unsigned long sleep_time; + long timer_ticks; + unsigned long run_jtime; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; @@ -400,6 +412,9 @@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; + +/* per cpu proc list */ + struct list_head proclist_cpu; /* Thread group tracking */ u32 parent_exec_id; @@ -448,6 +463,10 @@ #define MAX_COUNTER (20*HZ/100) #define DEF_NICE (0) +/* + * see sched.c comment on this variable + */ +extern int decay_ticks; /* * The default (Linux) execution domain. @@ -467,15 +486,17 @@ exec_domain: &default_exec_domain, \ lock_depth: -1, \ counter: DEF_COUNTER, \ + timer_ticks: 0, \ nice: DEF_NICE, \ policy: SCHED_OTHER, \ mm: NULL, \ active_mm: &init_mm, \ cpus_runnable: -1, \ cpus_allowed: -1, \ - run_list: LIST_HEAD_INIT(tsk.run_list), \ + run_list: { NULL, NULL }, \ next_task: &tsk, \ prev_task: &tsk, \ + pid: 0, \ p_opptr: &tsk, \ p_pptr: &tsk, \ thread_group: LIST_HEAD_INIT(tsk.thread_group), \ @@ -499,6 +520,9 @@ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ + task_qid: 0, \ + proclist_cpu: LIST_HEAD_INIT(tsk.proclist_cpu), \ + run_jtime: 0, \ } @@ -791,6 +815,16 @@ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void del_from_runqueue(struct task_struct * p); +extern void add_to_proclist(struct task_struct * p); +extern void del_from_proclist(struct task_struct * p); +extern void 
sched_wake_idle(void); +extern int move_to_cpu(struct task_struct * p, int cpu, int stick); +extern int get_best_cpu(struct task_struct *p); +extern void runqueue_spin_lock(struct task_struct * p); +extern void runqueue_spin_unlock(struct task_struct * p); + + #define __wait_event(wq, condition) \ do { \ wait_queue_t __wait; \ @@ -844,6 +878,7 @@ }) #define REMOVE_LINKS(p) do { \ + del_from_proclist(p); \ (p)->next_task->prev_task = (p)->prev_task; \ (p)->prev_task->next_task = (p)->next_task; \ if ((p)->p_osptr) \ @@ -855,6 +890,7 @@ } while (0) #define SET_LINKS(p) do { \ + add_to_proclist(p); \ (p)->next_task = &init_task; \ (p)->prev_task = init_task.prev_task; \ init_task.prev_task->next_task = (p); \ @@ -871,17 +907,20 @@ #define next_thread(p) \ list_entry((p)->thread_group.next, struct task_struct, thread_group) -static inline void del_from_runqueue(struct task_struct * p) -{ - nr_running--; - p->sleep_time = jiffies; - list_del(&p->run_list); - p->run_list.next = NULL; -} static inline int task_on_runqueue(struct task_struct *p) { return (p->run_list.next != NULL); +} + +static inline int task_on_proclist(struct task_struct *p) +{ + return (p->proclist_cpu.next != NULL); +} + +static inline int task_realtime(struct task_struct *p) +{ + return ((p->policy & ~SCHED_YIELD) != SCHED_OTHER); } static inline void unhash_process(struct task_struct *p) diff -Nru linux-2.5.1-pre11.vanilla/init/main.c linux-2.5.1-pre11.lxs2/init/main.c --- linux-2.5.1-pre11.vanilla/init/main.c Thu Dec 13 11:05:12 2001 +++ linux-2.5.1-pre11.lxs2/init/main.c Thu Dec 13 12:40:23 2001 @@ -427,6 +427,7 @@ * make syscalls (and thus be locked). */ smp_init(); + latsched_init(); rest_init(); } diff -Nru linux-2.5.1-pre11.vanilla/kernel/exit.c linux-2.5.1-pre11.lxs2/kernel/exit.c --- linux-2.5.1-pre11.vanilla/kernel/exit.c Thu Dec 13 11:05:12 2001 +++ linux-2.5.1-pre11.lxs2/kernel/exit.c Thu Dec 13 12:38:02 2001 @@ -63,8 +63,14 @@ * was given away by the parent in the first place.) 
*/ current->counter += p->counter; - if (current->counter >= MAX_COUNTER) + if (current->counter >= MAX_COUNTER) { current->counter = MAX_COUNTER; + if (current->timer_ticks >= current->counter) { + current->counter = 0; + current->timer_ticks = 0; + current->need_resched = 1; + } + } p->pid = 0; free_task_struct(p); } else { diff -Nru linux-2.5.1-pre11.vanilla/kernel/fork.c linux-2.5.1-pre11.lxs2/kernel/fork.c --- linux-2.5.1-pre11.vanilla/kernel/fork.c Thu Dec 13 11:05:12 2001 +++ linux-2.5.1-pre11.lxs2/kernel/fork.c Thu Dec 13 12:38:02 2001 @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -28,7 +29,7 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; +atomic_t gnr_running = ATOMIC_INIT(0); int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -614,6 +615,9 @@ p->run_list.next = NULL; p->run_list.prev = NULL; + p->proclist_cpu.next = NULL; + p->proclist_cpu.prev = NULL; + p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); p->vfork_done = NULL; @@ -639,7 +643,13 @@ { int i; p->cpus_runnable = ~0UL; - p->processor = current->processor; + /* + * if it's a real time task we leave it on the same processor/task_qid + */ + if (!task_realtime(p) && !(clone_flags & CLONE_PID)) { + p->processor = get_best_cpu(p); + p->task_qid = cpu_number_map(p->processor); + } /* ?? should we just memset this ?? 
*/ for(i = 0; i < smp_num_cpus; i++) p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; @@ -684,8 +694,14 @@ */ p->counter = (current->counter + 1) >> 1; current->counter >>= 1; - if (!current->counter) + p->timer_ticks = (current->timer_ticks + 1) >> 1; + current->timer_ticks >>= 1; + if (!current->counter) { current->need_resched = 1; + current->timer_ticks = 0; + } + + p->run_jtime = 0; /* * Ok, add it to the run-queues and make it diff -Nru linux-2.5.1-pre11.vanilla/kernel/ksyms.c linux-2.5.1-pre11.lxs2/kernel/ksyms.c --- linux-2.5.1-pre11.vanilla/kernel/ksyms.c Thu Dec 13 11:05:12 2001 +++ linux-2.5.1-pre11.lxs2/kernel/ksyms.c Thu Dec 13 12:38:02 2001 @@ -447,7 +447,6 @@ #endif EXPORT_SYMBOL(kstat); -EXPORT_SYMBOL(nr_running); /* misc */ EXPORT_SYMBOL(panic); diff -Nru linux-2.5.1-pre11.vanilla/kernel/sched.c linux-2.5.1-pre11.lxs2/kernel/sched.c --- linux-2.5.1-pre11.vanilla/kernel/sched.c Wed Nov 21 16:25:48 2001 +++ linux-2.5.1-pre11.lxs2/kernel/sched.c Fri Dec 14 09:31:50 2001 @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -81,18 +83,74 @@ /* * The tasklist_lock protects the linked list of processes. * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. + * This is the lock order : + * 1) tasklist_lock + * 2) RT_QID + * 3) lock(0) + * ... + * M) lock(N) * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. + * This does not mean that if a lock(3) is needed we've to lock the + * whole chain down to lock(3) but it states that if RT_QID and lock(3) + * are needed, RT_QID must be locked first. 
+ * The lock patterns are tasklist_lock+lock(i) when the task is moved to + * a different runqueue, RT_QID+lock(i) inside reschedule_idle() of a global + * RT task ( only if the best CPU of the RT task is running another RT task ), + * RT_QID+lock(0)+..+lock(N) inside reschedule_idle() of a global RT task + * ( when all CPUs are running RT tasks ). + * No other patterns are used ( e.g. lock(i)+lock(j) on its own is never taken ) * * task->alloc_lock nests inside tasklist_lock. */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ -static LIST_HEAD(runqueue_head); +/* + * dynamic priority ( counter ) decay limit. + * each timer tick, if counter > decay_ticks it gets decremented by 1 + * while if counter <= decay_ticks, timer ticks accumulate in timer_ticks + * and the task is preempted when timer_ticks == counter + */ +int decay_ticks = NICE_TO_TICKS(0); + +/* + * this is the distance map ( move cost ) between cpus. + * the move cost from cpu I to cpu J is : cpus_dmap[I][J] + * this value can be seen as the number of milliseconds we can + * tolerate to have an idle cpu before grabbing a remote task + * to run on the idle cpu + */ +#define DEF_CPU_DIST_MS 10 +#define MS_TO_DIST(t) (((t) * HZ) / 1000) +#define DEF_CPU_DIST MS_TO_DIST(DEF_CPU_DIST_MS) + +#define cpu_distance(i, j) (cpus_dmap[i][j]) + +/* + * this is a bonus that we give to cpus that have previously run + * an affine mm struct.
the bonus value is in milliseconds + */ +#define MOVE_MM_BONUS_MS 20 +#define MOVE_MM_BONUS ((MOVE_MM_BONUS_MS * HZ) / 1000) + +/* + * this is the cpu distance map that should be compiled by the architecture + * dependent code or by the common code using a provided abstract topology + * interface + */ +unsigned char cpus_dmap[NR_CPUS][NR_CPUS]; + +/* + * this is the minimum run queue length that trigger balancing decisions + */ +int min_mov_rqlen = 2; + +/* + * this is the weight ( in milliseconds ) that a remote process has and is + * used together with the cpu distance ( metric ) map to build a uniform + * cost of move + */ +int mvtsk_cost = DEF_CPU_DIST_MS / 3; + /* * We align per-CPU scheduling data on cacheline boundaries, @@ -100,33 +158,86 @@ */ static union { struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; + int qnr_processes; + int qnr_running; + struct list_head proclist_head; + struct list_head runqueue_head; + struct task_struct *curr; + int idle_ticks; + int cpu_hit; + struct latsched_data ls; + spinlock_t runqueue_lock ____cacheline_aligned; } schedule_data; char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; +} aligned_data [NR_CPUS + 1] __cacheline_aligned; + +#define RT_QID NR_CPUS +#define global_rttask(p) ((p)->task_qid == RT_QID) +#define task_foreign(p) (cpu_number_map((p)->processor) != (p)->task_qid) +#define cpu_next(cpu) (((cpu) + 1) < smp_num_cpus ? 
(cpu) + 1: 0) #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +#define idle_ticks(cpu) aligned_data[(cpu)].schedule_data.idle_ticks +#define cpu_hit(cpu) aligned_data[(cpu)].schedule_data.cpu_hit +#define qnr_processes(cpu) aligned_data[(cpu)].schedule_data.qnr_processes +#define qnr_running(cpu) aligned_data[(cpu)].schedule_data.qnr_running +#define proclist_head(cpu) aligned_data[(cpu)].schedule_data.proclist_head +#define runqueue_head(cpu) aligned_data[(cpu)].schedule_data.runqueue_head +#define runqueue_lock(cpu) aligned_data[(cpu)].schedule_data.runqueue_lock + +#define rq_lock(p) lock_task_rq(p) +#define rq_unlock(p) spin_unlock(&runqueue_lock((p)->task_qid)) +#define rq_lock_irq(p) do { local_irq_disable(); lock_task_rq(p); } while (0) +#define rq_unlock_irq(p) do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_enable(); } while (0) +#define rq_lock_irqsave(p, f) do { local_irq_save(f); lock_task_rq(p); } while (0) +#define rq_unlock_irqrestore(p, f) do { spin_unlock(&runqueue_lock((p)->task_qid)); local_irq_restore(f); } while (0) + + +#define latsched_data(cpu) aligned_data[(cpu)].schedule_data.ls.ls_data +#define latsched_samp(cpu, idx) aligned_data[(cpu)].schedule_data.ls.ls_data[(idx)] +#define latsched_size(cpu) aligned_data[(cpu)].schedule_data.ls.ls_size +#define latsched_curr(cpu) aligned_data[(cpu)].schedule_data.ls.ls_curr + + +static atomic_t lss_enabled = ATOMIC_INIT(0); + + struct kernel_stat kstat; extern struct task_struct *child_reaper; + #ifdef CONFIG_SMP #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) +#define can_schedule(p, cpu) \ + ((p)->cpus_runnable & (p)->cpus_allowed & (1 << (cpu))) +#define can_move(p, cpu) \ + ((p)->cpus_runnable == ~0L && (p)->cpus_allowed & (1 << (cpu))) #else #define idle_task(cpu) (&init_task) -#define 
can_schedule(p,cpu) (1) +#define can_schedule(p, cpu) (1) +#define can_move(p, cpu) (1) #endif + void scheduling_functions_start_here(void) { } +static inline void lock_task_rq(struct task_struct *p) +{ + int rqn = p->task_qid; + + spin_lock(&runqueue_lock(rqn)); + while (p->task_qid != rqn) { + spin_unlock(&runqueue_lock(rqn)); + rqn = p->task_qid; + spin_lock(&runqueue_lock(rqn)); + } +} + /* * This is the function that decides how desirable a process is.. * You can weigh different processes against each other depending @@ -141,7 +252,7 @@ * +1000: realtime process, select this. */ -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +static inline int goodness(struct task_struct * p, struct mm_struct *this_mm) { int weight; @@ -169,13 +280,6 @@ if (!weight) goto out; -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif - /* .. and a slight advantage to the current MM */ if (p->mm == this_mm || !p->mm) weight += 1; @@ -197,107 +301,154 @@ * the 'goodness value' of replacing a process on a given CPU. * positive value means 'replace', zero or negative means 'dont'. */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p) +{ + return goodness(p, prev->active_mm) - goodness(prev, prev->active_mm); +} + +#ifdef CONFIG_SMP + +static inline void lock_queues(void) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + int cpu; + for (cpu = 0; cpu < smp_num_cpus; cpu++) + spin_lock(&runqueue_lock(cpu)); +} + +static inline void unlock_queues(void) +{ + int cpu; + for (cpu = smp_num_cpus - 1; cpu >= 0; cpu--) + spin_unlock(&runqueue_lock(cpu)); } /* - * This is ugly, but reschedule_idle() is very timing-critical. 
- * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. + * this is used to try to find a place to run the global rt task. + * it's called with the RT_QID lock held and with local irq disabled. */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); - -static void reschedule_idle(struct task_struct * p) +static inline void rtt_reschedule_idle(struct task_struct * p) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; - - /* - * shortcut if the woken up task's last CPU is - * idle now. - */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ + int cpu, best_cpu = cpu_number_map(p->processor), + this_cpu = cpu_number_map(smp_processor_id()), need_resched, maxpg = 0, pg; + struct task_struct *tsk, *ttsk = NULL; + + /* + * if the best cpu for the global rt task is not currently running + * another rt task, that's the choice. + */ + if (can_schedule(p, cpu_logical_map(best_cpu))) { + spin_lock(&runqueue_lock(best_cpu)); + tsk = cpu_curr(best_cpu); + if (!task_realtime(tsk)) { need_resched = tsk->need_resched; tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); + if (best_cpu != this_cpu && + (!need_resched || tsk != idle_task(cpu_logical_map(best_cpu)))) + smp_send_reschedule(cpu_logical_map(best_cpu)); + spin_unlock(&runqueue_lock(best_cpu)); return; } + spin_unlock(&runqueue_lock(best_cpu)); } - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. 
(that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; - - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); - /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. - */ - if (tsk == idle_task(cpu)) { - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } + * the best cpu for the global rt task is running another rt task. + * instead of using preemption_goodness() to try to schedule on that cpu + * we try to find a cpu that is not running another rt task. + */ + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (can_schedule(p, cpu_logical_map(cpu))) { + spin_lock(&runqueue_lock(cpu)); + tsk = cpu_curr(cpu); + if (!task_realtime(tsk)) { + need_resched = tsk->need_resched; + tsk->need_resched = 1; + if (cpu != this_cpu && + (!need_resched || tsk != idle_task(cpu_logical_map(cpu)))) + smp_send_reschedule(cpu_logical_map(cpu)); + spin_unlock(&runqueue_lock(cpu)); + return; } + spin_unlock(&runqueue_lock(cpu)); } } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; + /* + * it's not our lucky day ..., all the cpus are running rt tasks and + * a preemption_goodness() loop is needed to ensure that the global + * priority is respected among rt tasks. 
+ */ + lock_queues(); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (can_schedule(p, cpu_logical_map(cpu))) { + tsk = cpu_curr(cpu); + if ((pg = preemption_goodness(tsk, p)) > maxpg) { + ttsk = tsk; + maxpg = pg; + if (tsk == idle_task(cpu_logical_map(cpu))) + break; + } } + } + if (ttsk) { + need_resched = ttsk->need_resched; + ttsk->need_resched = 1; + if (ttsk->processor != smp_processor_id() && !need_resched) + smp_send_reschedule(ttsk->processor); + } + unlock_queues(); +} + +static inline void std_reschedule_idle(struct task_struct * p) +{ + int best_cpu = p->task_qid, this_cpu = cpu_number_map(smp_processor_id()); + struct task_struct *tsk; + + tsk = cpu_curr(best_cpu); + if (tsk == idle_task(cpu_logical_map(best_cpu))) { + /* + * If need_resched == -1 then we can skip sending + * the IPI altogether, tsk->need_resched is + * actively watched by the idle thread. + */ + int need_resched = tsk->need_resched; tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); + if ((best_cpu != this_cpu) && !need_resched) + smp_send_reschedule(cpu_logical_map(best_cpu)); + } else if (tsk != p && preemption_goodness(tsk, p) > 0) { + tsk->need_resched = 1; + if (tsk->task_qid != this_cpu) + smp_send_reschedule(cpu_logical_map(tsk->task_qid)); } - return; +} +#endif /* #ifdef CONFIG_SMP */ + +/* + * This is ugly, but reschedule_idle() is very timing-critical. + * We are called with the runqueue spinlock held and we must + * not claim the tasklist_lock. + */ +static FASTCALL(void reschedule_idle(struct task_struct * p)); + +static void reschedule_idle(struct task_struct * p) +{ +#ifdef CONFIG_SMP + /* + * it's better to fork the path here instead of having complex if()s + * inside the function itself. 
rt tasks really have different wakeup + * methods compared with local cpu ones + */ + if (!global_rttask(p)) + std_reschedule_idle(p); + else + rtt_reschedule_idle(p); -#else /* UP */ - int this_cpu = smp_processor_id(); +#else /* #ifdef CONFIG_SMP */ struct task_struct *tsk; - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) + tsk = cpu_curr(smp_processor_id()); + if (preemption_goodness(tsk, p) > 0) tsk->need_resched = 1; -#endif +#endif /* #ifdef CONFIG_SMP */ } /* @@ -307,24 +458,276 @@ * run-queue, not the end. See the comment about "This is * subtle" in the scheduler proper.. */ -static inline void add_to_runqueue(struct task_struct * p) +static inline void __add_to_runqueue(struct task_struct * p, int task_qid) { - list_add(&p->run_list, &runqueue_head); - nr_running++; + list_add(&p->run_list, &runqueue_head(task_qid)); + ++qnr_running(task_qid); + atomic_inc(&gnr_running); } -static inline void move_last_runqueue(struct task_struct * p) +static inline void __del_from_runqueue(struct task_struct * p, int task_qid) { + atomic_dec(&gnr_running); + --qnr_running(task_qid); + p->sleep_time = jiffies; list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); + p->run_list.next = NULL; +} + +void del_from_runqueue(struct task_struct * p) +{ + unsigned long flags; + + rq_lock_irqsave(p, flags); + __del_from_runqueue(p, p->task_qid); + rq_unlock_irqrestore(p, flags); +} + +static inline void __add_to_proclist(struct task_struct * p, int task_qid) +{ + list_add(&p->proclist_cpu, &proclist_head(task_qid)); + ++qnr_processes(task_qid); +} + +void add_to_proclist(struct task_struct * p) +{ + unsigned long flags; + + rq_lock_irqsave(p, flags); + __add_to_proclist(p, p->task_qid); + rq_unlock_irqrestore(p, flags); +} + +static inline void __del_from_proclist(struct task_struct * p, int task_qid) +{ + list_del(&p->proclist_cpu); + --qnr_processes(task_qid); + p->proclist_cpu.next = NULL; +} + +void del_from_proclist(struct 
task_struct * p) +{ + unsigned long flags; + + rq_lock_irqsave(p, flags); + __del_from_proclist(p, p->task_qid); + rq_unlock_irqrestore(p, flags); +} + +void runqueue_spin_lock(struct task_struct * p) +{ + rq_lock(p); +} + +void runqueue_spin_unlock(struct task_struct * p) +{ + rq_unlock(p); } -static inline void move_first_runqueue(struct task_struct * p) +static inline void __move_last_runqueue(struct task_struct * p) { list_del(&p->run_list); - list_add(&p->run_list, &runqueue_head); + list_add_tail(&p->run_list, &runqueue_head(p->task_qid)); } +static inline void __move_first_runqueue(struct task_struct * p) +{ + list_del(&p->run_list); + list_add(&p->run_list, &runqueue_head(p->task_qid)); +} + +/* + * move_to_rqn() must be called with 1) local irq disabled + * 2) tasklist_lock write-locked 3) task locked + */ +static int move_to_rqn(struct task_struct * p, int rqn, int stick) +{ + int task_cpu; + unsigned long cpus_allowed; + + if (p->task_qid == rqn) { + if (stick) + p->cpus_allowed = (1 << cpu_logical_map(rqn)); + return rqn; + } + if (task_on_runqueue(p)) + __del_from_runqueue(p, p->task_qid); + if (task_on_proclist(p)) + __del_from_proclist(p, p->task_qid); + cpus_allowed = stick ? (1 << cpu_logical_map(rqn)): p->cpus_allowed; + p->cpus_allowed = 0; + task_cpu = p->task_qid; + p->task_qid = rqn; + spin_unlock(&runqueue_lock(task_cpu)); + + rq_lock(p); + __add_to_proclist(p, p->task_qid); + if (!task_on_runqueue(p) && p->state == TASK_RUNNING) + __add_to_runqueue(p, p->task_qid); + p->cpus_allowed = cpus_allowed; + return task_cpu; +} + +/* + * this is only called by softirq.c::ksoftirqd() and is used to place + * ksoftirqd tasks over different cpus. 
+ */ +int move_to_cpu(struct task_struct * p, int cpu, int stick) +{ +#ifdef CONFIG_SMP + unsigned long flags; + + write_lock_irqsave(&tasklist_lock, flags); + rq_lock(p); + move_to_rqn(p, cpu_number_map(cpu), stick); + rq_unlock(p); + write_unlock_irqrestore(&tasklist_lock, flags); + if (cpu != smp_processor_id()) + smp_send_reschedule(cpu); + return cpu; +#else /* #ifdef CONFIG_SMP */ + return 0; +#endif /* #ifdef CONFIG_SMP */ +} + +/* + * this function gets called inside kernel/timer.c when the timer + * tick hit the idle task. maybe architectures with huge HZ might + * want to not wake up the idle at every timer tick + */ +void sched_wake_idle(void) +{ + if (smp_num_cpus > 1) + current->need_resched = 1; +} + +#ifdef CONFIG_SMP + +/* + * the runtime cpu distance is the sum of the base cpu distance plus the + * load on the remote cpu + */ +static long rt_cpu_dist(int src_cpu, int dst_cpu) +{ + return (cpu_distance(src_cpu, dst_cpu) << 4) + + (qnr_running(src_cpu) * mvtsk_cost * (HZ << 4)) / 1000; +} + +/* + * try to find the best cpu to run a fresh new process, no locks are held + * during this function. 
it gets called by do_fork() in SMP mode. only cpus present in p->cpus_allowed are considered besides the current one + */ +int get_best_cpu(struct task_struct *p) +{ + int i, best_cpu, this_cpu = cpu_number_map(smp_processor_id()); + long cdist, min_cdist; + + best_cpu = this_cpu; + min_cdist = rt_cpu_dist(this_cpu, this_cpu); + for (i = 0; i < smp_num_cpus; i++) { + if (i == this_cpu || !(p->cpus_allowed & (1UL << cpu_logical_map(i)))) continue; /* never pick a cpu p is not allowed to run on */ + if ((cdist = rt_cpu_dist(i, this_cpu)) < min_cdist) { + min_cdist = cdist; + best_cpu = i; + } + } + return cpu_logical_map(best_cpu); +} + +static inline long move_goodness(struct task_struct *p, struct mm_struct *this_mm) +{ + long mgds = (long) (jiffies - p->run_jtime); + if (p->mm == this_mm || !p->mm) + mgds += MOVE_MM_BONUS; + return mgds; +} + +static inline struct task_struct *try_steal_task(int src_cpu, int dst_cpu) +{ + int ldst_cpu = cpu_logical_map(dst_cpu); + long mgdns = -1, mvg; + struct mm_struct *this_mm = current->active_mm; + struct task_struct *tsk, *mvtsk = NULL; + struct list_head *head, *tmp; + + spin_lock_irq(&runqueue_lock(src_cpu)); + head = &runqueue_head(src_cpu); + list_for_each(tmp, head) { + tsk = list_entry(tmp, struct task_struct, run_list); + if (can_move(tsk, ldst_cpu) && !task_foreign(tsk) && + (mvg = move_goodness(tsk, this_mm)) > mgdns) { + mvtsk = tsk; + mgdns = mvg; + } + } + if (mvtsk) { + unsigned long cpus_allowed = mvtsk->cpus_allowed; + + mvtsk->cpus_allowed = 0; + __del_from_runqueue(mvtsk, src_cpu); + spin_unlock(&runqueue_lock(src_cpu)); + write_lock(&tasklist_lock); + spin_lock(&runqueue_lock(src_cpu)); + __del_from_proclist(mvtsk, src_cpu); + spin_unlock(&runqueue_lock(src_cpu)); + spin_lock(&runqueue_lock(dst_cpu)); + __add_to_runqueue(mvtsk, dst_cpu); + __add_to_proclist(mvtsk, dst_cpu); + mvtsk->counter -= mvtsk->timer_ticks; + mvtsk->timer_ticks = 0; + mvtsk->cpus_allowed = cpus_allowed; + mvtsk->task_qid = dst_cpu; + spin_unlock(&runqueue_lock(dst_cpu)); + write_unlock_irq(&tasklist_lock); + } else + spin_unlock_irq(&runqueue_lock(src_cpu)); + return mvtsk; +} + +/* + * the move cost is the 
difference between the cpu distance and the run queue + * load on the remote cpu. both terms are scaled by a factor 16 ( << 4 ) and + * the cost for each remote cpu task depends on mvtsk_cost + */ +static long move_cost(int src_cpu, int dst_cpu) +{ + return (cpu_distance(src_cpu, dst_cpu) << 4) - + (qnr_running(src_cpu) * mvtsk_cost * (HZ << 4)) / 1000; +} + +static inline struct task_struct *get_remote_task(int this_cpu) +{ + int i, max_cpu = -1; + long ccost, min_cost = 1000; /* only cpus whose move_cost() is below 1000 are considered */ + struct task_struct *rtask; + + this_cpu = cpu_number_map(this_cpu); + for (i = 0; i < smp_num_cpus; i++) { + if (i == this_cpu) continue; + if (qnr_running(i) >= min_mov_rqlen && (ccost = move_cost(i, this_cpu)) < min_cost) { + min_cost = ccost; + max_cpu = i; + } + } + if (max_cpu >= 0) { + if (cpu_hit(this_cpu) == max_cpu) /* same candidate as recorded in cpu_hit: count one more tick */ + ++idle_ticks(this_cpu); + else { + cpu_hit(this_cpu) = max_cpu; + idle_ticks(this_cpu) = 1; + } + if (idle_ticks(this_cpu) >= cpu_distance(this_cpu, max_cpu)) { /* steal only once the tick count has reached the cpu distance */ + if ((rtask = try_steal_task(max_cpu, this_cpu))) { + cpu_hit(this_cpu) = -1; + return rtask; + } + } + } else if (cpu_hit(this_cpu) >= 0) + cpu_hit(this_cpu) = -1; /* no candidate this time: forget the recorded one */ + return NULL; +} +#endif /* #ifdef CONFIG_SMP */ + /* * Wake up a process. Put it on the run-queue if it's not * already there. The "current" process is always on the @@ -341,16 +744,21 @@ /* * We want the common case fall through straight, thus the goto. */ - spin_lock_irqsave(&runqueue_lock, flags); + rq_lock_irqsave(p, flags); p->state = TASK_RUNNING; - if (task_on_runqueue(p)) + /* + * cpus_allowed is cleared while a task is being moved from one cpu + * to another; it is checked here so the task is not hit while we're + * switching locks.
+ */ + if (task_on_runqueue(p) || !p->cpus_allowed) goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) + __add_to_runqueue(p, p->task_qid); + if (!synchronous || p->task_qid != cpu_number_map(smp_processor_id())) reschedule_idle(p); success = 1; out: - spin_unlock_irqrestore(&runqueue_lock, flags); + rq_unlock_irqrestore(p, flags); return success; } @@ -476,41 +884,8 @@ task_lock(prev); task_release_cpu(prev); mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; - -out_unlock: task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ return; - - /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. - */ -needs_resched: - { - unsigned long flags; - - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; - - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; - } #else prev->policy &= ~SCHED_YIELD; #endif /* CONFIG_SMP */ @@ -521,6 +896,15 @@ __schedule_tail(prev); } +static inline void set_task_running(struct task_struct *p, int cpu) +{ + if (p != idle_task(cpu) && cpu_hit(cpu_number_map(cpu)) >= 0) + cpu_hit(cpu_number_map(cpu)) = -1; /* a non idle task is about to run: drop the recorded cpu_hit candidate */ + cpu_curr(cpu_number_map(cpu)) = p; /* cpu_curr and cpu_hit are indexed by cpu_number_map(cpu) */ + p->run_jtime = jiffies; /* time stamp that move_goodness() compares against jiffies */ + task_set_cpu(p, cpu); +} + /* * 'schedule()' is the scheduler function.
It's a very simple and nice * scheduler: it's not perfect, but certainly works for most things. @@ -531,15 +915,13 @@ * tasks can run. It can not be killed, and it cannot sleep. The 'state' * information in task[0] is never used. */ -asmlinkage void schedule(void) +static inline void __schedule(void) { - struct schedule_data * sched_data; struct task_struct *prev, *next, *p; - struct list_head *tmp; + struct list_head *head, *tmp; int this_cpu, c; - - spin_lock_prefetch(&runqueue_lock); + spin_lock_prefetch(&runqueue_lock(current->processor)); if (!current->active_mm) BUG(); need_resched_back: @@ -554,18 +936,16 @@ release_kernel_lock(prev, this_cpu); /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * lock the task run queue to perform task related ops like + * move to last and del from runqueue. */ - sched_data = & aligned_data[this_cpu].schedule_data; - - spin_lock_irq(&runqueue_lock); + rq_lock_irq(prev); /* move an exhausted RR process to be last.. */ if (unlikely(prev->policy == SCHED_RR)) if (!prev->counter) { prev->counter = NICE_TO_TICKS(prev->nice); - move_last_runqueue(prev); + __move_last_runqueue(prev); } switch (prev->state) { @@ -575,12 +955,31 @@ break; } default: - del_from_runqueue(prev); + if (task_on_runqueue(prev)) + __del_from_runqueue(prev, prev->task_qid); case TASK_RUNNING:; } prev->need_resched = 0; /* + * this is true for running tasks moved with move_to_rqn() ( the first time + * they call schedule() ) and for global RT tasks. + */ + if (unlikely(task_foreign(prev))) { + rq_unlock(prev); + spin_lock(&runqueue_lock(cpu_number_map(this_cpu))); + } + + /* + * check global rt queue first without held locks and if it's not empty + * try to pickup the rt task first. despite to the new "unlikely" feature + * the code for rt task selection is kept out. 
+ */ + if (!list_empty(&runqueue_head(RT_QID))) + goto rt_queue_select; +rt_queue_select_back: + + /* * this is the scheduler proper: */ @@ -590,10 +989,11 @@ */ next = idle_task(this_cpu); c = -1000; - list_for_each(tmp, &runqueue_head) { + head = &runqueue_head(cpu_number_map(this_cpu)); + list_for_each(tmp, head) { p = list_entry(tmp, struct task_struct, run_list); if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); + int weight = goodness(p, prev->active_mm); if (weight > c) c = weight, next = p; } @@ -601,25 +1001,26 @@ /* Do we need to re-calculate counters? */ if (unlikely(!c)) { - struct task_struct *p; - - spin_unlock_irq(&runqueue_lock); + spin_unlock_irq(&runqueue_lock(cpu_number_map(this_cpu))); read_lock(&tasklist_lock); - for_each_task(p) + head = &proclist_head(cpu_number_map(this_cpu)); + list_for_each(tmp, head) { + p = list_entry(tmp, struct task_struct, proclist_cpu); p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); + } read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); + spin_lock_irq(&runqueue_lock(cpu_number_map(this_cpu))); goto repeat_schedule; } /* * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. + * switching to the next task. */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); + set_task_running(next, this_cpu); + spin_unlock_irq(&runqueue_lock(cpu_number_map(this_cpu))); + +rt_task_selected: if (unlikely(prev == next)) { /* We won't go through the normal tail, so do this by hand */ @@ -627,24 +1028,6 @@ goto same_process; } -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. 
- */ - sched_data->last_schedule = get_cycles(); - - /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ - -#endif /* CONFIG_SMP */ - kstat.context_swtch++; /* * there are 3 processes which are affected by a context switch: @@ -684,9 +1067,50 @@ same_process: reacquire_kernel_lock(current); + +#ifdef CONFIG_SMP + if (unlikely(current == idle_task(this_cpu))) + if (get_remote_task(this_cpu)) + goto need_resched_back; +#endif /* #ifdef CONFIG_SMP */ + if (current->need_resched) goto need_resched_back; return; + +rt_queue_select: + /* + * the fast lockless check reported that it might be a successful + * pickup inside the global rt queue, so we try here ... + */ + spin_unlock(&runqueue_lock(cpu_number_map(this_cpu))); + spin_lock(&runqueue_lock(RT_QID)); + c = 0; + head = &runqueue_head(RT_QID); + list_for_each(tmp, head) { + p = list_entry(tmp, struct task_struct, run_list); + if (can_schedule(p, this_cpu)) { + int weight = goodness(p, prev->active_mm); + if (weight > c) + c = weight, next = p; + } + } + if (!c) { + /* + * the fast test reported a false positive so we go back to + * the local CPU runqueue selection. + */ + spin_unlock(&runqueue_lock(RT_QID)); + spin_lock(&runqueue_lock(cpu_number_map(this_cpu))); + goto rt_queue_select_back; + } + /* + * the global rt task has been selected and final setup is needed. + */ + set_task_running(next, this_cpu); + spin_unlock_irq(&runqueue_lock(RT_QID)); + goto rt_task_selected; + } /* @@ -886,6 +1310,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param) { + int grt = 0, pgrt = 0, rqn; struct sched_param lp; struct task_struct *p; int retval; @@ -900,22 +1325,29 @@ /* * We play safe to avoid deadlocks. 
+ * It's possible that we need a write lock to move the task in/out the + * RT_QID run queue so instead of getting a read lock and having to + * release/writelock again, it's better to get directly the write one. */ - read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); + write_lock_irq(&tasklist_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; + goto out_unlock_tkll; + rq_lock(p); if (policy < 0) policy = p->policy; else { + grt = (policy & SCHED_RTGLOBAL) != 0; + policy &= ~SCHED_RTGLOBAL; + pgrt = global_rttask(p); + retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_OTHER) + policy != SCHED_OTHER) goto out_unlock; } @@ -940,14 +1372,27 @@ retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (task_on_runqueue(p)) - move_first_runqueue(p); + if (pgrt == grt) { + rqn = p->task_qid; + if (task_on_runqueue(p)) + __move_first_runqueue(p); + } else { + rqn = cpu_number_map(p->processor); + move_to_rqn(p, grt ? RT_QID: rqn, 0); + } - current->need_resched = 1; + if (grt || rqn == cpu_number_map(smp_processor_id())) + current->need_resched = 1; + else { +#ifdef CONFIG_SMP + smp_send_reschedule(cpu_logical_map(rqn)); +#endif /* #ifdef CONFIG_SMP */ + } out_unlock: - spin_unlock(&runqueue_lock); - read_unlock_irq(&tasklist_lock); + rq_unlock(p); +out_unlock_tkll: + write_unlock_irq(&tasklist_lock); out_nounlock: return retval; @@ -1017,6 +1462,7 @@ asmlinkage long sys_sched_yield(void) { + struct task_struct *ctsk = current; /* * Trick. sched_yield() first counts the number of truly * 'pending' runnable processes, then returns if it's @@ -1024,34 +1470,18 @@ * to be atomic.) In threaded applications this optimization * gets triggered quite often. */ - - int nr_pending = nr_running; - -#if CONFIG_SMP - int i; - - // Subtract non-idle processes running on other CPUs. 
- for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; - } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { + if (qnr_running(ctsk->task_qid) > 1) { /* * This process can only be rescheduled by us, * so this is safe without any locking. */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; - - spin_lock_irq(&runqueue_lock); - move_last_runqueue(current); - spin_unlock_irq(&runqueue_lock); + if (ctsk->policy == SCHED_OTHER) + ctsk->policy |= SCHED_YIELD; + local_irq_disable(); + if (ctsk->counter > 0) + --ctsk->counter; + local_irq_enable(); + ctsk->need_resched = 1; } return 0; } @@ -1231,7 +1661,7 @@ /* We also take the runqueue_lock while altering task fields * which affect scheduling decisions */ - spin_lock(&runqueue_lock); + rq_lock(this_task); this_task->ptrace = 0; this_task->nice = DEF_NICE; @@ -1246,7 +1676,7 @@ memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); this_task->user = INIT_USER; - spin_unlock(&runqueue_lock); + rq_unlock(this_task); write_unlock_irq(&tasklist_lock); } @@ -1286,16 +1716,12 @@ void __init init_idle(void) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; - if (current != &init_task && task_on_runqueue(current)) { printk("UGH! (%d:%d) was on the runqueue, removing.\n", smp_processor_id(), current->pid); - del_from_runqueue(current); + __del_from_runqueue(current, current->task_qid); } - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); + cpu_curr(cpu_number_map(smp_processor_id())) = current; clear_bit(current->processor, &wait_init_idle); } @@ -1307,13 +1733,35 @@ * We have to do a little magic to get the first * process right in SMP mode. 
*/ - int cpu = smp_processor_id(); - int nr; + int i, j, cpu = smp_processor_id(); + + for (i = 0; i <= NR_CPUS; i++) { /* NOTE(review): <= also sets up slot NR_CPUS - presumably the global RT queue (RT_QID); confirm */ + qnr_processes(i) = 0; + qnr_running(i) = 0; + cpu_curr(i) = &init_task; + idle_ticks(i) = 0; + cpu_hit(i) = -1; + INIT_LIST_HEAD(&runqueue_head(i)); + INIT_LIST_HEAD(&proclist_head(i)); + runqueue_lock(i) = SPIN_LOCK_UNLOCKED; + } + + /* + * this should definitely be initialized by the architecture dependent + * smp layer, or by the scheduler code through an abstract topology + * interface. for now: DEF_CPU_DIST inside a group of 4 cpus, DEF_CPU_DIST + MS_TO_DIST(10) across groups + */ + for (i = 0; i < NR_CPUS; i++) + for (j = 0; j <= i; j++) + if ((i & ~0x03) == (j & ~0x03)) /* == binds tighter than &, so both masked values need their own parentheses */ + cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST; + else + cpus_dmap[i][j] = cpus_dmap[j][i] = DEF_CPU_DIST + MS_TO_DIST(10); init_task.processor = cpu; - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; + for(i = 0; i < PIDHASH_SZ; i++) + pidhash[i] = NULL; init_timervecs(); @@ -1327,3 +1775,143 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } + +/* give every cpu a zeroed buffer of STD_LATSCHED_SAMPLES samples; a failed allocation only leaves that cpu's buffer pointer unset */ +void __init latsched_init(void) +{ + int ii, size; + + size = STD_LATSCHED_SAMPLES; + for (ii = 0; ii < smp_num_cpus; ii++) { + if ((latsched_data(ii) = kmalloc(size * sizeof(struct latsched_sample), GFP_KERNEL))) + memset(latsched_data(ii), 0, size * sizeof(struct latsched_sample)); + latsched_size(ii) = size; + latsched_curr(ii) = 0; + } +} + +/* wrapper around __schedule(): while lss_enabled is set it stores the cycle counts taken before and after the call, plus the pid that returns, in the current per-cpu sample */ +asmlinkage void schedule(void) +{ + int this_cpu; + unsigned long flags; + cycles_t cycls; + + if (atomic_read(&lss_enabled)) { + local_irq_save(flags); + this_cpu = current->processor; + latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_pid = -1; /* -1 marks the sample as opened; it is completed only if still -1 afterwards */ + latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_in = get_cycles(); + local_irq_restore(flags); + } + + __schedule(); + + cycls = get_cycles(); /* taken before the bookkeeping below so that it is not part of the measure */ + if (atomic_read(&lss_enabled)) { + local_irq_save(flags); + this_cpu = current->processor; + if (latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_pid == -1) { + latsched_samp(this_cpu,
latsched_curr(this_cpu)).lss_out = cycls; + latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_pid = current->pid; + if (++latsched_curr(this_cpu) >= latsched_size(this_cpu)) + latsched_curr(this_cpu) = 0; /* the sample buffer is used as a ring */ + } + local_irq_restore(flags); + } +} + +/* on != 0: allocate missing buffers, zero them all and enable sampling; on == 0: disable it. returns 0 or -ENOMEM */ +int latsched_start(int on) +{ + int res; + + cli(); + if (on) { + if (!atomic_read(&lss_enabled)) { + int ii; + + for (ii = 0; ii < smp_num_cpus; ii++) { + res = -ENOMEM; + if (!latsched_data(ii) && + !(latsched_data(ii) = kmalloc(latsched_size(ii) * sizeof(struct latsched_sample), GFP_KERNEL))) /* NOTE(review): GFP_KERNEL may sleep while cli() is in effect - confirm this is intended */ + goto out; + memset(latsched_data(ii), 0, latsched_size(ii) * sizeof(struct latsched_sample)); + latsched_curr(ii) = 0; + } + atomic_set(&lss_enabled, 1); + } + } else + atomic_set(&lss_enabled, 0); + res = 0; +out: + sti(); + return res; +} + +/* replace every per-cpu buffer with a zeroed one of nsamps samples; returns -EBUSY while sampling is enabled, -ENOMEM on allocation failure */ +int latsched_setsamples(int nsamps) +{ + int ii, res, size = nsamps; /* NOTE(review): nsamps is used unchecked - confirm callers reject values <= 0 or large enough to overflow the kmalloc() size */ + + cli(); + res = -EBUSY; + if (atomic_read(&lss_enabled)) + goto out; + for (ii = 0; ii < smp_num_cpus; ii++) { + if (latsched_data(ii)) + kfree(latsched_data(ii)); + res = -ENOMEM; + if (!(latsched_data(ii) = kmalloc(size * sizeof(struct latsched_sample), GFP_KERNEL))) + goto out; + memset(latsched_data(ii), 0, size * sizeof(struct latsched_sample)); + latsched_size(ii) = size; + latsched_curr(ii) = 0; + } + res = 0; +out: + sti(); + return res; +} + +/* copy the samples of lsgd->cpu to lsgd->data and set lsgd->rsize; returns -EBUSY while sampling is enabled, -EINVAL for a cpu out of range */ +int latsched_getdata(struct lsctl_getdata *lsgd) +{ + int res; + + cli(); + res = -EBUSY; + if (atomic_read(&lss_enabled)) + goto out; + res = -EINVAL; + if (lsgd->cpu < 0 || lsgd->cpu >= smp_num_cpus) + goto out; + if (latsched_samp(lsgd->cpu, latsched_size(lsgd->cpu) - 1).lss_pid != 0) { /* last slot already written: copy from latsched_curr to the end, then from slot 0 */ + int size, csize; + struct latsched_sample *data = lsgd->data; + + lsgd->rsize = size = latsched_size(lsgd->cpu); + if (lsgd->rsize > lsgd->size) + lsgd->rsize = size = lsgd->size; + csize = latsched_size(lsgd->cpu) - latsched_curr(lsgd->cpu); + if (csize > size) + csize = size; + if (csize) /* NOTE(review): the __copy_to_user() results in this function are ignored, a faulting user buffer still returns 0 */ + __copy_to_user(data, &latsched_samp(lsgd->cpu, latsched_curr(lsgd->cpu)), + csize *
sizeof(struct latsched_sample)); + data += csize; + size -= csize; + if (size) + __copy_to_user(data, &latsched_samp(lsgd->cpu, 0), + size * sizeof(struct latsched_sample)); + } else { + lsgd->rsize = latsched_curr(lsgd->cpu); /* last slot still empty: only slots 0 .. latsched_curr - 1 are copied */ + __copy_to_user(lsgd->data, &latsched_samp(lsgd->cpu, 0), + lsgd->rsize * sizeof(struct latsched_sample)); + } + res = 0; +out: + sti(); + return res; +} + diff -Nru linux-2.5.1-pre11.vanilla/kernel/signal.c linux-2.5.1-pre11.lxs2/kernel/signal.c --- linux-2.5.1-pre11.vanilla/kernel/signal.c Wed Nov 21 16:26:27 2001 +++ linux-2.5.1-pre11.lxs2/kernel/signal.c Thu Dec 13 12:38:02 2001 @@ -478,10 +478,10 @@ * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. */ - spin_lock(&runqueue_lock); + runqueue_spin_lock(t); /* NOTE(review): kernel/sched.c in this patch uses rq_lock()/rq_unlock() - confirm runqueue_spin_lock()/runqueue_spin_unlock() are defined */ if (task_has_cpu(t) && t->processor != smp_processor_id()) smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); + runqueue_spin_unlock(t); #endif /* CONFIG_SMP */ if (t->state & TASK_INTERRUPTIBLE) { diff -Nru linux-2.5.1-pre11.vanilla/kernel/softirq.c linux-2.5.1-pre11.lxs2/kernel/softirq.c --- linux-2.5.1-pre11.vanilla/kernel/softirq.c Thu Dec 13 11:05:12 2001 +++ linux-2.5.1-pre11.lxs2/kernel/softirq.c Thu Dec 13 12:38:02 2001 @@ -369,7 +369,7 @@ sigfillset(&current->blocked); /* Migrate to the right CPU */ - current->cpus_allowed = 1UL << cpu; + if (move_to_cpu(current, cpu, 1) < 0) BUG(); while (smp_processor_id() != cpu) schedule(); diff -Nru linux-2.5.1-pre11.vanilla/kernel/timer.c linux-2.5.1-pre11.lxs2/kernel/timer.c --- linux-2.5.1-pre11.vanilla/kernel/timer.c Mon Oct 8 10:41:41 2001 +++ linux-2.5.1-pre11.lxs2/kernel/timer.c Thu Dec 13 12:38:02 2001 @@ -583,8 +583,11 @@ update_one_process(p, user_tick, system, cpu); if (p->pid) { - if (--p->counter <= 0) { + if (p->counter > decay_ticks) + --p->counter; + else if (++p->timer_ticks >= p->counter) { p->counter = 0; + p->timer_ticks = 0; p->need_resched = 1; } if (p->nice > 0) @@ -592,8 +595,12 @@ else
kstat.per_cpu_user[cpu] += user_tick; kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; + } else { + sched_wake_idle(); + + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system; + } } /* diff -Nru linux-2.5.1-pre11.vanilla/mm/oom_kill.c linux-2.5.1-pre11.lxs2/mm/oom_kill.c --- linux-2.5.1-pre11.vanilla/mm/oom_kill.c Sat Nov 3 17:05:25 2001 +++ linux-2.5.1-pre11.lxs2/mm/oom_kill.c Thu Dec 13 12:38:02 2001 @@ -150,6 +150,7 @@ * exit() and clear out its resources quickly... */ p->counter = 5 * HZ; + p->timer_ticks = 0; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */