Improving (network) IO performance ...
07-01-2001 - Davide Libenzi

The purpose of this work is to analyze different methods for efficient delivery of network events from kernel mode to user mode. Three methods are examined: poll(), which has been chosen as the best of the old-style methods, the standard /dev/poll interface, and a new /dev/poll that uses a quite different notification method. RT signals have been discarded because they are not as efficient as the old /dev/poll interface, due to the one-signal-at-a-time way events are popped from the queue. Provos and Lever developed an interface to retrieve more than one signal per system call but, even this way, the interface came out a loser compared to the old /dev/poll. This work is composed of:

1) the new /dev/poll kernel patch
2) the /dev/poll patch from Provos-Lever modified to work with 2.4.6
3) the HTTP server
4) the deadconn(tm) tool to create "dead" connections

As a measurement tool httperf has been chosen because, even if not perfect, it offers a quite sufficient number of loading options.

The new /dev/poll kernel patch

The patch is quite simple and adds notification callbacks to the 'struct file' data structure :

****** include/linux/fs.h

/* file callback notification events */
#define ION_IN          1
#define ION_OUT         2
#define ION_HUP         3
#define ION_ERR         4

#define FCB_LOCAL_SIZE  4

#define fcblist_read_lock(fp, fl)       read_lock_irqsave(&(fp)->f_cblock, fl)
#define fcblist_read_unlock(fp, fl)     read_unlock_irqrestore(&(fp)->f_cblock, fl)
#define fcblist_write_lock(fp, fl)      write_lock_irqsave(&(fp)->f_cblock, fl)
#define fcblist_write_unlock(fp, fl)    write_unlock_irqrestore(&(fp)->f_cblock, fl)

struct fcb_struct {
    struct list_head lnk;
    void (*cbproc)(struct file *, void *, unsigned long *, long *);
    void *data;
    unsigned long local[FCB_LOCAL_SIZE];
};

struct file {
    ...
    /* file callback list */
    rwlock_t f_cblock;
    struct list_head f_cblist;
};

The purpose of this callback list is to give lower IO layers the ability to notify upper layers, which register their "interests" with the file structure. Initialization and cleanup code has been added in fs/file_table.c, while the callback list handling code lives in fs/file.c :

****** fs/file_table.c

struct file * get_empty_filp(void)
{
    ...
    rwlock_init(&f->f_cblock);
    INIT_LIST_HEAD(&f->f_cblist);
    ...
}

int init_private_file(struct file *filp, struct dentry *dentry, int mode)
{
    ...
    rwlock_init(&f->f_cblock);
    INIT_LIST_HEAD(&f->f_cblist);
    ...
}

void fput(struct file * file)
{
    ...
    file_notify_cleanup(file);
    ...
}

****** fs/file.c

void file_notify_event(struct file *filep, long *event)
{
    unsigned long flags;
    struct list_head *lnk;

    fcblist_read_lock(filep, flags);
    list_for_each(lnk, &filep->f_cblist) {
        struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, lnk);

        fcbp->cbproc(filep, fcbp->data, fcbp->local, event);
    }
    fcblist_read_unlock(filep, flags);
}

int file_notify_addcb(struct file *filep,
        void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data)
{
    unsigned long flags;
    struct fcb_struct *fcbp;

    if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL)))
        return -ENOMEM;
    memset(fcbp, 0, sizeof(struct fcb_struct));
    fcbp->cbproc = cbproc;
    fcbp->data = data;
    fcblist_write_lock(filep, flags);
    list_add_tail(&fcbp->lnk, &filep->f_cblist);
    fcblist_write_unlock(filep, flags);
    return 0;
}

int file_notify_delcb(struct file *filep,
        void (*cbproc)(struct file *, void *, unsigned long *, long *))
{
    int error;
    unsigned long flags;
    struct list_head *lnk;

    fcblist_write_lock(filep, flags);
    error = -ENOENT;
    list_for_each(lnk, &filep->f_cblist) {
        struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, lnk);

        if (fcbp->cbproc == cbproc) {
            list_del(lnk);
            kfree(fcbp);
            error = 0;
            break;
        }
    }
    fcblist_write_unlock(filep, flags);
    return error;
}

void
file_notify_cleanup(struct file *filep)
{
    unsigned long flags;
    struct list_head *lnk;

    fcblist_write_lock(filep, flags);
    while ((lnk = list_first(&filep->f_cblist))) {
        struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, lnk);

        list_del(lnk);
        kfree(fcbp);
    }
    fcblist_write_unlock(filep, flags);
}

The callbacks receive a 'long *' whose first element is one of the ION_* events, while the following ones can carry additional parameters whose meaning depends on the first. This interface is a draft, and I used it only to verify whether the transport method is efficient enough to build on. At the current stage notifications have been plugged only into socket files by adding :

****** include/net/sock.h

static inline void sk_wake_async(struct sock *sk, int how, int band)
{
    if (sk->socket) {
        if (sk->socket->file) {
            extern long ion_band_table[];
            extern long band_table[];
            long event[] = { ion_band_table[band - POLL_IN],
                    band_table[band - POLL_IN], -1 };

            file_notify_event(sk->socket->file, event);
        }
        if (sk->socket->fasync_list)
            sock_wake_async(sk->socket, how, band);
    }
}

Even though it has been hooked only into network sockets, it should not be a problem to extend it to other file types. The /dev/poll implementation resides in two new files, driver/char/devpoll.c and the include file include/linux/devpoll.h. The interface of the new /dev/poll is quite different from the previous one because it works only by mmap()ing the device file descriptor, while the copy-data-to-user-space approach has been discarded for efficiency reasons. The initialization sequence is :

if ((kdpfd = open("/dev/poll", O_RDWR)) == -1) {
}
if (ioctl(kdpfd, DP_ALLOC, maxfds)) {
}
if ((map = (char *) mmap(NULL, DP_MAP_SIZE(maxfds), PROT_READ | PROT_WRITE,
        MAP_PRIVATE, kdpfd, 0)) == (char *) -1) {
}

where maxfds is the maximum number of file descriptors that are supposed to be stored inside the polling device.
Files are added to the interest set by :

struct pollfd pfd;

pfd.fd = fd;
pfd.events = POLLIN | POLLOUT | POLLERR | POLLHUP;
pfd.revents = 0;
if (write(kdpfd, &pfd, sizeof(pfd)) != sizeof(pfd)) {
    ...
}

and removed with :

struct pollfd pfd;

pfd.fd = fd;
pfd.events = POLLREMOVE;
pfd.revents = 0;
if (write(kdpfd, &pfd, sizeof(pfd)) != sizeof(pfd)) {
    ...
}

The core dispatching code looks like :

struct pollfd *pfds;
struct dvpoll dvp;

for (;;) {
    dvp.dp_timeout = STD_SCHED_TIMEOUT;
    dvp.dp_resoff = 0;
    nfds = ioctl(kdpfd, DP_POLL, &dvp);
    pfds = (struct pollfd *) (map + dvp.dp_resoff);
    for (ii = 0; ii < nfds; ii++, pfds++) {
        ...
    }
}

Basically, the driver allocates two sets of pages that it uses as a double buffer to store file events. The field dp_resoff tells where, inside the map, the result set resides, so that while the application works on one set, the kernel can use the other one to store incoming events. There are no copy-to-user-space issues, events coming from the same file are collapsed into a single slot, and the DP_POLL ioctl never does a linear scan of the interest set to perform a file->f_ops->poll(). You can download the patch here :

http://www.xmailserver.org/linux-patches/newdp_last.diff.gz

The /dev/poll patch from Provos-Lever

There are very few things to say about this, only that a virt_to_page() bug has been fixed to make the patch work. I modified a patch for 2.4.3 that I found at the CITI web site, and this should be the port to 2.4.x of the original ( 2.2.x ) one used by Provos-Lever. Niels, Charles, is that right? You can download the patch here :

http://www.xmailserver.org/linux-patches/olddp_last.diff.gz

The HTTP server

The HTTP server is very simple(tm) and is based on event polling + coroutines, which makes the server quite efficient. The coroutine library implementation used inside the server has been taken from :

http://lecker.essen.de/~froese/coro/

It's very small, simple and fast.
Again, it ( the server ) is very simple and always emits the same HTTP response, whose size can be set with a command line parameter. Two other command line options let you set the listening port and the fd set size. You can download the server here :

http://www.xmailserver.org/linux-patches/dphttpd_last.tar.gz

The deadconn(tm) tool

If the server is simple, this is even simpler. Its purpose is to create "dead" connections to the server, to simulate a realistic load where a bunch of slow links are connected. You can download deadconn here :

http://www.xmailserver.org/linux-patches/deadconn_last.c

The test

The test machine is a PIII 600MHz, 128 Mb RAM, with an eepro100 network card connected to a 100Mbps fast ethernet switch. The kernel is 2.4.6 over a RH 6.2, and the coroutine library version is 1.1.0-pre2. I used a dual PIII 1GHz, 256 Mb RAM, dual eepro100 machine to run httperf, while a dual PIII 900 MHz, 256 Mb RAM, dual eepro100 machine ran deadconn(tm). Since httperf, when used with a high number of num-conns, very quickly fills the fd space ( modified to 8000 ), I used this command line :

--think-timeout 5 --timeout 5 --num-calls 2500 --num-conns 100 --hog --rate 100

This basically allocates 100 connections that load the server under different numbers of dead connections. The other parameter I varied is the response size: 128, 512 and 1024 bytes. Each of the numbers below is the average of three runs.
You can download httperf here :

http://www.hpl.hp.com/personal/David_Mosberger/httperf.html

[respsize=128]

poll()
dead conns    resp rate    std dev
         0        22510        600
      1000        14800        603
      2000        10800        400
      4000         7200        180
      6000         5500        140
     16000         2350         50

old /dev/poll
dead conns    resp rate    std dev
         0        23500        500
      1000        16000        800
      2000        12600        500
      4000         8900        350
      6000         6760        240
     16000         3150         90

new /dev/poll
dead conns    resp rate    std dev
         0        27000         10
      1000        26500          0
      2000        26700         10
      4000        26200          0
      6000        26250          0
     16000        26400          0

[respsize=512]

poll()
dead conns    resp rate    std dev
         0        18000        200
      1000        14800        650
      2000        10900        390
      4000         7200        200
      6000         5512        130
     16000         2330         78

old /dev/poll
dead conns    resp rate    std dev
         0        18000        150
      1000        15500        530
      2000        12500        500
      4000         8800        390
      6000         6830        260
     16000         3159         80

new /dev/poll
dead conns    resp rate    std dev
         0        18200         40
      1000        18200         30
      2000        18150         60
      4000        18140         60
      6000        18150         45
     16000        18146         50

[respsize=1024]

poll()
dead conns    resp rate    std dev
         0        10300         70
      1000        10000        300
      2000         8400       1500
      4000         7000        240
      6000         5410        135
     16000         2150        110

old /dev/poll
dead conns    resp rate    std dev
         0        10400         40
      1000        10150        350
      2000         9600        720
      4000         8500        300
      6000         6655        250
     16000         3145        100

new /dev/poll
dead conns    resp rate    std dev
         0        10900         15
      1000        10800         10
      2000        10680         10
      4000        10600         15
      6000        10600         14
     16000        10650         25

These numbers show that the new /dev/poll improves the efficiency of the server both from a response rate point of view and from a CPU utilization point of view ( not shown here ). I don't have all the data for 7800 dead connections, but a comparison between two runs showed even more dramatic differences. The standard deviation is also very low compared to poll() and the old /dev/poll, which makes me think that 1) there's more power to be extracted and 2) the method has a predictable response under high loads. The patch is not in its final version because I'm still working on it. To use the /dev/poll interface you have to mknod the device file with major number 10 and minor number 125.

- Davide