diff -U 5 -r linux-2.2.14.vanilla/include/linux/sched.h linux-2.2.14/include/linux/sched.h --- linux-2.2.14.vanilla/include/linux/sched.h Thu Jan 6 00:31:31 2000 +++ linux-2.2.14/include/linux/sched.h Sat Jan 22 00:09:58 2000 @@ -225,10 +225,16 @@ * Right now it is only used to track how many processes a * user has, but it has the potential to track memory usage etc. */ struct user_struct; +/* For task goodness clustering */ +struct gds_slot_struct { + struct task_struct *__next; + struct task_struct **__pprev; +}; + struct task_struct { /* these are hardcoded - don't touch */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ unsigned long flags; /* per process flags, defined below */ int sigpending; @@ -238,18 +244,22 @@ */ struct exec_domain *exec_domain; long need_resched; /* various fields */ + cycles_t avg_slice; + unsigned long policy; long counter; long priority; - cycles_t avg_slice; +/* memory management info */ + struct mm_struct *mm; /* SMP and runqueue state */ int has_cpu; int processor; int last_processor; int lock_depth; /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */ + struct gds_slot_struct gss; struct task_struct *next_task, *prev_task; struct task_struct *next_run, *prev_run; /* task state */ struct linux_binfmt *binfmt; @@ -279,11 +289,11 @@ /* Pointer to task[] array linkage. 
*/ struct task_struct **tarray_ptr; struct wait_queue *wait_chldexit; /* for wait4() */ struct semaphore *vfork_sem; /* for vfork() */ - unsigned long policy, rt_priority; + unsigned long rt_priority; unsigned long it_real_value, it_prof_value, it_virt_value; unsigned long it_real_incr, it_prof_incr, it_virt_incr; struct timer_list real_timer; struct tms times; unsigned long start_time; @@ -312,12 +322,10 @@ struct thread_struct tss; /* filesystem information */ struct fs_struct *fs; /* open file information */ struct files_struct *files; -/* memory management info */ - struct mm_struct *mm; /* signal handlers */ spinlock_t sigmask_lock; /* Protects signal and blocked */ struct signal_struct *sig; sigset_t signal, blocked; @@ -361,21 +369,24 @@ * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK \ /* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \ -/* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \ +/* avg_slice */ 0, \ +/* policy */ SCHED_OTHER, DEF_PRIORITY,DEF_PRIORITY, \ +/* mm */ &init_mm, \ /* SMP */ 0,0,0,-1, \ +/* gss */ { NULL, NULL }, \ /* schedlink */ &init_task,&init_task, &init_task, &init_task, \ /* binfmt */ NULL, \ /* ec,brk... */ 0,0,0,0,0,0, \ /* pid etc.. 
*/ 0,0,0,0,0, \ /* proc links*/ &init_task,&init_task,NULL,NULL,NULL, \ /* pidhash */ NULL, NULL, \ /* tarray */ &task[0], \ /* chld wait */ NULL, NULL, \ -/* timeout */ SCHED_OTHER,0,0,0,0,0,0,0, \ +/* timeout */ 0,0,0,0,0,0,0, \ /* timer */ { NULL, NULL, 0, 0, it_real_fn }, \ /* utime */ {0,0,0,0},0, \ /* per CPU times */ {0, }, {0, }, \ /* flt */ 0,0,0,0,0,0, \ /* swp */ 0, \ @@ -390,11 +401,10 @@ /* fs info */ 0,NULL, \ /* ipc */ NULL, NULL, \ /* tss */ INIT_TSS, \ /* fs */ &init_fs, \ /* files */ &init_files, \ -/* mm */ &init_mm, \ /* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \ /* exec cts */ 0,0, \ } union task_union { diff -U 5 -r linux-2.2.14.vanilla/kernel/fork.c linux-2.2.14/kernel/fork.c --- linux-2.2.14.vanilla/kernel/fork.c Sat Oct 23 20:03:32 1999 +++ linux-2.2.14/kernel/fork.c Fri Jan 21 22:55:53 2000 @@ -688,10 +688,14 @@ /* ok, now we should be set up.. */ p->swappable = 1; p->exit_signal = clone_flags & CSIGNAL; p->pdeath_signal = 0; +/* Task goodness slotizer. */ + p->gss.__next = NULL; + p->gss.__pprev = NULL; + /* * "share" dynamic priority between parent and child, thus the * total amount of dynamic priorities in the system doesnt change, * more scheduling fairness. This is only important in the first * timeslice, on the long run the scheduling behaviour is unchanged. diff -U 5 -r linux-2.2.14.vanilla/kernel/sched.c linux-2.2.14/kernel/sched.c --- linux-2.2.14.vanilla/kernel/sched.c Thu Jan 6 00:27:52 2000 +++ linux-2.2.14/kernel/sched.c Sat Feb 5 13:59:50 2000 @@ -122,10 +122,61 @@ #define idle_task(cpu) (&init_task) #define can_schedule(p) (1) #endif + + +/* + * Task goodness slotizer ( begin ). + */ +#define MAX_GDS 48 /* + * Estimated maximum goodness value. + * All task with a greater one stay in (MAX_SLOTS - 1). + */ +#define SLOT_SHIFT 0 /* Shift factor for slot arithmetic. 
*/ +#define GDS_STEP (1 << SLOT_SHIFT) /* Goodness step */ +#define MAX_SLOTS (MAX_GDS / GDS_STEP) /* Number of goodness slots */ +#define GDS_LOWER DEF_PRIORITY /* Goodness lower level, tasks lower than this go in slot 1 */ +#define GDS_SLOT(c) (((c) >= GDS_LOWER) ? (((c) - GDS_LOWER) >> SLOT_SHIFT): 0) + +/* The -1 is because slot indexes are incremented by 1 to keep slot 0 for really exhausted processes. */ +#define SLOT_GDS_BASE(s) ((((s) - 1) << SLOT_SHIFT) + GDS_LOWER) + +/* This large goodness value for slot (MAX_SLOTS - 1) ensures that all tasks + * in that slot ( commonly RT processes ) are fully tested before exiting. + */ +#define MAX_LINUX_GOODNESS 10000 +/* Maximum goodness possible in slot s */ +#define SLOT_GDS_MAX(s) (((s) < (MAX_SLOTS - 1)) ? \ + (SLOT_GDS_BASE(s) + ((1 << SLOT_SHIFT) - 1)) : MAX_LINUX_GOODNESS) + +/* Return the task_struct pointer whose gss member is the gds_slot_struct pointed to by "s"; the member offset is cast through unsigned long so it is never narrower than a pointer on 64-bit targets */ +#define TGDS_HEAD(s) ((struct task_struct *) ((char *) &(s)->__next - (unsigned long) &(((struct task_struct *) 0)->gss.__next))) + +/* Initial value of turbo_sched_loadtrigger: the runqueue load that will trigger turbo mode */ +#define GDS_LOADTRIGGER 5 + +/* Initial value of turbo_sched_waitspins: the delay the turbo scheduler waits before a mode switch */ +#define GDS_WAIT_SPINS 2 + +/* + * Task goodness slots. + * gds_slots[0] contains all exhausted processes and is skipped by the scan in schedule(). + */ +static struct gds_slot_struct gds_slots_init[MAX_SLOTS]; /* Init image to speed up turbo boot */ +static struct gds_slot_struct gds_slots[MAX_SLOTS]; +static int start_gdslot; /* Slot index at which the schedule() scan starts */ +static int turbo_sched; /* Turbo scheduler switch: non-zero while turbo mode is active */ +static int turbo_sched_waitspins; /* Number of gds_try_switch() spins to wait before switching mode */ +static int turbo_sched_loadtrigger; /* Runqueue load ( nr_running ) above which turbo mode is requested */ + +/* + * Task goodness slotizer ( end ). 
+ */ + + void scheduling_functions_start_here(void) { } /* * This is the function that decides how desirable a process is.. * You can weigh different processes against each other depending @@ -355,10 +406,149 @@ * Pass #2 */ reschedule_idle_slow(p); } +/* + * Task goodness slotizer ( begin ). + */ +/* Initialize every goodness slot as an empty circular list, save that empty image in gds_slots_init and reset the turbo scheduler state. */ +static void gds_init(void) +{ + int ii; + + for (ii = 0; ii < MAX_SLOTS; ii++) { + gds_slots[ii].__pprev = &gds_slots[ii].__next; + gds_slots[ii].__next = TGDS_HEAD(&gds_slots[ii]); /* Empty list: the head links back to itself */ + } + + memcpy(gds_slots_init, gds_slots, sizeof(gds_slots_init)); /* Pristine image restored by gds_try_switch() when leaving turbo mode */ + + start_gdslot = MAX_SLOTS - 1; + turbo_sched = 0; /* Start in normal mode */ + turbo_sched_waitspins = GDS_WAIT_SPINS; + turbo_sched_loadtrigger = GDS_LOADTRIGGER; +} + +/* Add task to the tail of its goodness slot ( want "runqueue_lock" ). */ +static inline void gds_tadd_task(struct task_struct * ts) +{ + /* It's important to compute the maximum goodness possible for ts, + * so that we can stop iterating in schedule() when we find a process + * that keeps its goodness promise. + */ + int weight = goodness(ts, ts, ts->processor), + slot = 0; + struct task_struct * qh; + /* Exhausted processes fall in slot 0 to avoid scanning them in schedule(). */ + if (weight > 0) + { + if ((slot = GDS_SLOT(weight)) < (MAX_SLOTS - 1)) + ++slot; /* We keep slot 0 for really exhausted processes */ + else + slot = MAX_SLOTS - 1; + if (slot > start_gdslot) + start_gdslot = slot; /* Update iteration start index. */ + } + qh = TGDS_HEAD(&gds_slots[slot]); /* Insert before the list head, i.e. at the tail */ + ts->gss.__pprev = qh->gss.__pprev; + *qh->gss.__pprev = ts; + qh->gss.__pprev = &ts->gss.__next; + ts->gss.__next = qh; +} + +/* Add task to the head of its goodness slot ( want "runqueue_lock" ). */ +static inline void gds_hadd_task(struct task_struct * ts) +{ + /* It's important to compute the maximum goodness possible for ts, + * so that we can stop iterating in schedule() when we find a process + * that keeps its goodness promise. 
+ */ + int weight = goodness(ts, ts, ts->processor), + slot = 0; + struct task_struct * qh; + /* Exhausted processes fall in slot 0 to avoid scanning them in schedule(). */ + if (weight > 0) + { + if ((slot = GDS_SLOT(weight)) < (MAX_SLOTS - 1)) + ++slot; /* We keep slot 0 for really exhausted processes */ + else + slot = MAX_SLOTS - 1; + if (slot > start_gdslot) + start_gdslot = slot; /* Update iteration start index. */ + } + qh = TGDS_HEAD(&gds_slots[slot]); + qh = qh->gss.__next; /* Insert before the current first entry, i.e. at the head */ + ts->gss.__pprev = qh->gss.__pprev; + *qh->gss.__pprev = ts; + qh->gss.__pprev = &ts->gss.__next; + ts->gss.__next = qh; +} + +/* Remove task from its goodness slot ( want "runqueue_lock" ); returns 1 if the task was linked, 0 otherwise. */ +static inline int gds_remove_task(struct task_struct * ts) +{ + if (ts->gss.__pprev) { /* A NULL __pprev marks a task that is in no slot */ + ts->gss.__next->gss.__pprev = ts->gss.__pprev; + *ts->gss.__pprev = ts->gss.__next; + ts->gss.__pprev = NULL; + return (1); + } + return (0); +} + +/* Re-slot a task that is currently linked: unlink it and append it to the slot matching its present goodness ( want "runqueue_lock" ). */ +static inline void gds_switch(struct task_struct * ts) +{ + if (gds_remove_task(ts)) + gds_tadd_task(ts); +} + +/* Used after a turbo boot to reinsert tasks in clusters ( want "runqueue_lock" ): clears the link and re-adds only tasks with a non-NULL next_run. */ +static inline void gds_switch_init(struct task_struct * ts) +{ + ts->gss.__pprev = NULL; + ts->gss.__next = NULL; + if (ts->next_run) + gds_tadd_task(ts); +} + + +/* Measure the load factor and decide if switch in normal_mode or in turbo_mode. + * This must be called only before the recalculate loop. + * It returns 1 if it does a switch, 0 otherwise. 
+ */ +static inline int gds_try_switch(void) +{ + static int spins = 0; /* Calls seen while the load disagrees with the current mode; NOTE(review): only reset on a switch, so the wait counts cumulative rather than consecutive calls - confirm intended */ + + if (nr_running > turbo_sched_loadtrigger) { + if (!turbo_sched) { + if (++spins > turbo_sched_waitspins) { + turbo_sched = 1; + spins = 0; + return 1; + } + } + } else { + if (turbo_sched) { + if (++spins > turbo_sched_waitspins) { + turbo_sched = 0; + memcpy(gds_slots, gds_slots_init, sizeof(gds_slots)); /* Reset every slot to the empty image saved by gds_init() */ + spins = 0; + return 1; + } + } + } + return 0; +} +/* + * Task goodness slotizer ( end ). + */ + + + /* * Careful! * * This has to add the process to the _beginning_ of the * run-queue, not the end. See the comment about "This is @@ -371,10 +561,13 @@ p->prev_run = &init_task; init_task.next_run = p; p->next_run = next; next->prev_run = p; nr_running++; + /* Add task to the head of its goodness slot ( want "runqueue_lock" ). */ + if (turbo_sched) + gds_hadd_task(p); } static inline void del_from_runqueue(struct task_struct * p) { struct task_struct *next = p->next_run; @@ -383,10 +576,13 @@ nr_running--; next->prev_run = prev; prev->next_run = next; p->next_run = NULL; p->prev_run = NULL; + /* Remove task from its goodness slot ( want "runqueue_lock" ). */ + if (turbo_sched) + gds_remove_task(p); } static inline void move_last_runqueue(struct task_struct * p) { struct task_struct *next = p->next_run; @@ -399,10 +595,15 @@ p->next_run = &init_task; prev = init_task.prev_run; init_task.prev_run = p; p->prev_run = prev; prev->next_run = p; + /* Move task to the tail of its goodness slot ( want "runqueue_lock" ). */ + if (turbo_sched) { + gds_remove_task(p); + gds_tadd_task(p); + } } static inline void move_first_runqueue(struct task_struct * p) { struct task_struct *next = p->next_run; @@ -415,10 +616,15 @@ p->prev_run = &init_task; next = init_task.next_run; init_task.next_run = p; p->next_run = next; next->prev_run = p; + /* Move task to the head of its goodness slot ( want "runqueue_lock" ). */ + if (turbo_sched) { + gds_remove_task(p); + gds_hadd_task(p); + } } /* * The tasklist_lock protects the linked list of processes. 
* @@ -712,10 +918,16 @@ */ sched_data = & aligned_data[this_cpu].schedule_data; spin_lock_irq(&runqueue_lock); + /* Change task goodness slot ( want "runqueue_lock" ). + * Perhaps this can be moved out of fast path. + */ + if (turbo_sched) + gds_switch(prev); + /* move an exhausted RR process to be last.. */ if (prev->policy == SCHED_RR) goto move_rr_last; move_rr_back: @@ -735,45 +947,84 @@ /* * this is the scheduler proper: */ - p = init_task.next_run; /* Default process to select.. */ next = idle_task(this_cpu); c = -1000; if (prev->state == TASK_RUNNING) goto still_running; still_running_back: - /* - * This is subtle. - * Note how we can enable interrupts here, even - * though interrupts can add processes to the run- - * queue. This is because any new processes will - * be added to the front of the queue, so "p" above - * is a safe starting point. - * run-queue deletion and re-ordering is protected by - * the scheduler lock - */ + if (turbo_sched) { + int ii, cslots, gdsmax, weight; + struct task_struct *qh; + /* Scan task goodness slots ( want "runqueue_lock" ). + * Note that "ii > 0" skip all exhausted processes in slot 0 . + */ + for (ii = start_gdslot, cslots = 0; ii > 0; ii--) { + qh = TGDS_HEAD(&gds_slots[ii]); + if ((p = qh->gss.__next) != qh) { + gdsmax = SLOT_GDS_MAX(ii); /* Max goodness in slot. */ + if (!cslots) /* Remember iteration start index. */ + start_gdslot = ii, ++cslots; + if (c >= gdsmax) + goto task_found; + do { + if (can_schedule(p)) { + if ((weight = goodness(prev, p, this_cpu)) > c) { + c = weight, next = p; + if (c >= gdsmax) + goto task_found; + } + } + } while ((p = p->gss.__next) != qh); + } + /* Goodness promise has been maintained, we've found the President ! */ + if (c >= SLOT_GDS_BASE(ii)) + goto task_found; + } + + /* Do we need to re-calculate counters? */ + if ((c <= 0) && ((qh = TGDS_HEAD(&gds_slots[0]))->gss.__next != qh)) + goto recalculate; + + } else { + /* + * This is subtle. 
+ * Note how we can enable interrupts here, even + * though interrupts can add processes to the run- + * queue. This is because any new processes will + * be added to the front of the queue, so "p" above + * is a safe starting point. + * run-queue deletion and re-ordering is protected by + * the scheduler lock + */ /* * Note! there may appear new tasks on the run-queue during this, as * interrupts are enabled. However, they will be put on front of the * list, so our list starting at "p" is essentially fixed. */ - while (p != &init_task) { - if (can_schedule(p)) { - int weight = goodness(prev, p, this_cpu); - if (weight > c) - c = weight, next = p; + p = init_task.next_run; + + while (p != &init_task) { + if (can_schedule(p)) { + int weight = goodness(prev, p, this_cpu); + if (weight > c) + c = weight, next = p; + } + p = p->next_run; } - p = p->next_run; + + if (!c) + goto recalculate; } - /* Do we need to re-calculate counters? */ - if (!c) - goto recalculate; + +task_found: + /* * from this point on nothing can prevent us from * switching to the next task, save this fact in * sched_data. 
*/ @@ -828,17 +1079,37 @@ reacquire_kernel_lock(current); return; recalculate: { - struct task_struct *p; - spin_unlock_irq(&runqueue_lock); - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + p->priority; - read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); + int switched = gds_try_switch(); + + if (turbo_sched) { + struct task_struct *p; + read_lock(&tasklist_lock); /* NOTE(review): runqueue_lock is not released on this path, unlike the non-turbo branch below; confirm that nesting tasklist_lock inside runqueue_lock is safe */ + if (!switched) { + for_each_task(p) { + p->counter = (p->counter >> 1) + p->priority; + gds_switch(p); + } + } else { + for_each_task(p) { + p->counter = (p->counter >> 1) + p->priority; + gds_switch_init(p); + } + } + read_unlock(&tasklist_lock); + } else { + struct task_struct *p; + spin_unlock_irq(&runqueue_lock); + read_lock(&tasklist_lock); + for_each_task(p) + p->counter = (p->counter >> 1) + p->priority; + read_unlock(&tasklist_lock); + spin_lock_irq(&runqueue_lock); + } + goto repeat_schedule; } still_running: c = prev_goodness(prev, prev, this_cpu); @@ -918,10 +1189,85 @@ read_unlock(&waitqueue_lock); out: return; } + +/* + * This is the new code for semaphore wakeup. + * As you can see, it releases only the best waiting task, except when + * no schedulable waiter has a positive goodness ( e.g. all process counters are exhausted ). + * In that case I prefer to fall back to the previous implementation and + * release all tasks instead of performing a recharge loop here. + * Anyway such a situation rarely occurs ( the higher the number of waiting + * tasks the rarer it is, and with few tasks the cost of a total release + * is not so high ). + * It should be noted that on SMP systems the goodness calculation is not 100% precise + * due to the fact that we don't know which CPU will reschedule the task. 
+ * Anyway, IMVHO, I prefer this solution to : + * 1) a FIFO one, because we try to release the best task ( under UP we're 100% precise ) + * 2) a "release all" method due to : + * A) peak of processes flushed on the scheduler + * B) starvation issues + */ +void __sem_wake_up(struct wait_queue **q, unsigned int mode) +{ + struct task_struct *p, *best = NULL; + struct wait_queue *head, *next; + int c = 0, this_cpu = current->processor; /* c starts at 0, so only waiters with goodness > 0 can become "best" */ + + if (!q) + goto out; + /* + * this is safe to be done before the check because it + * means no dereference, just pointer operations. + */ + head = WAIT_QUEUE_HEAD(q); + + read_lock(&waitqueue_lock); + next = *q; + if (!next) + goto out_unlock; + + while (next != head) { + p = next->task; + next = next->next; + if (p->state & mode) { + /* Search for the best one to run */ + if (can_schedule(p)) { + int weight = goodness(current, p, this_cpu); + if (weight > c) { + c = weight, best = p; + } + } + } + } + /* Found it ? */ + if (best) { + wake_up_process(best); + } + else { + /* Old way. Release all tasks ( sigh ! ) */ + head = WAIT_QUEUE_HEAD(q); + next = *q; + + while (next != head) { + p = next->task; + next = next->next; + if (p->state & mode) { + wake_up_process(p); + } + } + } +out_unlock: + read_unlock(&waitqueue_lock); +out: + return; +} + + + /* * Semaphores are implemented using a two-way counter: * The "count" variable is decremented for each process * that tries to sleep, while the "waking" variable is * incremented when the "up()" code goes to wake up waiting @@ -951,11 +1297,15 @@ * where we want to avoid any extra jumps and calls. */ void __up(struct semaphore *sem) { wake_one_more(sem); +#ifdef OLD_SEMAPHORE_METHOD wake_up(&sem->wait); +#else + __sem_wake_up(&sem->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE); +#endif } /* * Perform the "down" function. Return zero for semaphore acquired, * return negative for signalled out of the function. 
@@ -2061,6 +2411,7 @@ pidhash[nr] = NULL; init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); + gds_init(); }