This is a simple patch for the x86 CPU
family with a 32-byte cacheline size (this can easily be changed to
other sizes) that colours "struct task_struct" allocations inside the
kernel by adding four different colours, offering a collision reduction
of about 1/4:
This patch cache-colours both the task_struct and the kernel stack, using a slab allocator for the task_struct and initial stack pointer jittering for the kernel stack. An extra structure is defined for task_struct allocations, as well as a new init task structure:
[include/linux/sched.h]
/*
 * Accessor macros: in this scheme every task_struct pointer really
 * points at a full_task_struct (task is its first member), so the
 * extra bookkeeping fields are reached by casting back.
 *
 * Note: the posted patch had each #define wrapped by the mailer onto
 * two lines, which is invalid C; the definitions are rejoined here.
 */
#define TSK_TO_KSTACK(p)	(((struct full_task_struct *) (p))->stack)
#define TSK_KSTACK_TOP(p)	(((struct full_task_struct *) (p))->stack_top)
#define TSK_COUNT(p)		(((struct full_task_struct *) (p))->count)
/*
 * Co-allocates the task_struct with bookkeeping for its kernel stack.
 * task MUST remain the first member: the code casts a
 * full_task_struct * to task_struct * and back (see alloc_task_struct
 * and the TSK_* macros).
 */
struct full_task_struct {
struct task_struct task;	/* must be first: pointer-cast compatible */
atomic_t count;			/* use count of this task/stack pair */
unsigned long stack;		/* base address of the kernel stack pages */
unsigned long stack_top;	/* initial (jittered) stack pointer, set in copy_thread */
};
/*
 * Static layout for the init task: the stack array comes first and
 * the full_task_struct follows, mirroring the dynamic scheme where a
 * task's stack is allocated separately from its task_struct.
 * NOTE(review): assumes INIT_TASK_SIZE is the stack size in bytes --
 * confirm against the architecture's definition.
 */
struct init_task_struct {
unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
struct full_task_struct ftsk;
};
So, each pointer to a task_struct is really a full_task_struct pointer that can be used to access other fields like the task stack base, the task stack top and the task_struct use count. The stack to task_struct inverse lookup is done by storing a pointer to the stack's owner task_struct at the base of the stack:
[arch/??/kernel/process.c]
/*
 * Allocate a full_task_struct from the slab cache plus two pages of
 * kernel stack.  A back-pointer to the owning task_struct is stored at
 * the very base of the stack, making the stack -> task_struct inverse
 * lookup a single load.  Returns NULL if either allocation fails.
 */
struct task_struct *alloc_task_struct(void)
{
	struct full_task_struct *ftsk;

	ftsk = kmem_cache_alloc(tsk_cache, GFP_KERNEL);
	if (!ftsk)
		return NULL;

	ftsk->stack = __get_free_pages(GFP_KERNEL, 1);
	if (!ftsk->stack) {
		kmem_cache_free(tsk_cache, ftsk);
		return NULL;
	}

	atomic_set(&ftsk->count, 1);
	/* back-pointer from the stack base to the owning task_struct */
	*((struct task_struct **) ftsk->stack) = (struct task_struct *) ftsk;

	return (struct task_struct *) ftsk;
}
The initial stack frame pointer jittering is done by :
[arch/??/kernel/process.c]
/*
 * Stack-colouring parameters.  Three colour bits give eight possible
 * jitter offsets, each scaled by a whole L1 cache line.  The L2
 * figures below describe a 256 KB, 4-way set-associative cache and
 * may need tuning for other CPUs.
 *
 * Note: the posted patch had two of these #defines wrapped onto a
 * second line by the mailer (an empty macro plus a stray expression,
 * which does not compile); the definitions are rejoined here.
 */
#define STACK_COLOUR_BITS	3
#define STACK_COLOUR_MASK	((1 << STACK_COLOUR_BITS) - 1)
#define L2_CACHE_SIZE		(256 * 1024)
#define L2_ASSOCIATIVITY	4
#define L2_SET_SIZE		(L2_CACHE_SIZE / L2_ASSOCIATIVITY)
/*
 * Derive a per-task offset for the initial kernel stack pointer: the
 * stack base address selects one of (1 << STACK_COLOUR_BITS) colours,
 * and the colour is scaled to whole L1 cache lines.
 */
static inline unsigned long get_stack_jitter(struct task_struct *p)
{
	unsigned long colour;

	colour = (TSK_TO_KSTACK(p) / L2_SET_SIZE) & STACK_COLOUR_MASK;
	return colour << L1_CACHE_SHIFT;
}
/*
 * Set up the thread state of a newly copied task.  The child's initial
 * pt_regs frame is placed at the jittered top of its kernel stack, so
 * different tasks start with differently-coloured stack pointers.
 * Always returns 0 (this path cannot fail).
 *
 * Fix: the posted patch contained the mojibake "&curren;t" (rendered
 * as a currency sign) where "&current" was intended in the final
 * struct_cpy; restored here.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;

	/* register frame at the jittered stack top */
	childregs = ((struct pt_regs *) (THREAD_SIZE + TSK_TO_KSTACK(p) -
			get_stack_jitter(p))) - 1;
	struct_cpy(childregs, regs);
	childregs->eax = 0;	/* child sees fork() return 0 */
	childregs->esp = esp;

	p->thread.esp = TSK_KSTACK_TOP(p) = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);
	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	/* flush the parent's lazy FPU state, then copy it to the child */
	unlazy_fpu(current);
	struct_cpy(&p->thread.i387, &current->thread.i387);
	return 0;
}
With three stack colour bits, eight colours will be used for initial
stack frame pointer jittering, which should be enough for most cache
architectures. Care has to be taken not to increase STACK_COLOUR_BITS
too much, because this can lead to kernel stack overflows (for example,
STACK_COLOUR_BITS set to 4 with a cacheline size of 128 bytes, as on
the P4, results in a maximum jitter of 2048 bytes, leaving only 6 KB
for the kernel stack). The patch is
available here:
My brand new CPQ Presario 3045US had a new
SiS chipset (650) whose IRQ router was not compatible with the code
inside the kernel. New registers were present inside the PCI
configuration space (0x60...0x63) that were used by the USB subsystem.
These patches correctly recognize the new 96x South Bridge and fix the
problem.