epoll、select和poll源码简析

网络

发布日期: 2021-10-09

更新日期: 2025-08-10

文章字数: 4.9k

阅读时长: 24 分

本文使用的 Linux 内核版本为 4.19.194 。

一、epoll_create 系统调用

系统调用 epoll_create1 和 epoll_create 的定义分别位于 fs/eventpoll.c 文件中的 1979 行和 1984 行。

/*
 * epoll_create1(2) entry point: forwards the caller-supplied flags to
 * do_epoll_create() unchanged and returns its result.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    return do_epoll_create(flags);
}

/*
 * epoll_create(2) entry point. The size argument is only validated: a
 * non-positive value is rejected with -EINVAL, and otherwise the value is
 * not used any further. do_epoll_create() is always called with flags 0.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;

    return do_epoll_create(0);
}

主要工作就是调用 do_epoll_create 函数。

1.1 do_epoll_create 函数

函数 do_epoll_create 的定义位于 fs/eventpoll.c 文件中的 1936 行。

/*
* Open an eventpoll file descriptor.
*/
static int do_epoll_create(int flags)

首先，对传入的 flags 参数做简单的验证。

/* Check the EPOLL_* constant for consistency.  */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

if (flags & ~EPOLL_CLOEXEC)
    return -EINVAL;

然后，申请分配 eventpoll 所需的内存并初始化。

struct eventpoll *ep = NULL;

/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep);
if (error < 0)
    return error;

接下来，分配一个空闲的文件描述符 fd 和匿名文件 file 。注意，eventpoll 实例会保存一份匿名文件的引用，并通过调用 fd_install 将文件描述符和匿名文件关联起来。

另外还需注意 anon_inode_getfile 调用时将 eventpoll 作为匿名文件的 private_data 保存了起来。后面就可以通过 epoll 实例的文件描述符快速的找到 eventpoll 对象。

最后，将文件描述符 fd 作为 epoll 的句柄返回给调用者。epoll 实例其实就是一个匿名文件。

/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
    error = fd;
    goto out_free_ep;
}
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
            O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
    error = PTR_ERR(file);
    goto out_free_fd;
}
ep->file = file;
fd_install(fd, file);
return fd;

二、epoll_ctl 系统调用

系统调用 epoll_ctl 的定义位于 fs/eventpoll.c 文件中的 1997 行。

/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)

首先，获取 epoll 实例对应的匿名文件。

f = fdget(epfd);
if (!f.file)
    goto error_return;

然后，获得添加的套接字对应的文件。

/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
    goto error_fput;

接着，进行一系列的数据验证，保证用户传入的参数是合法的。

判断待处理的文件有没有实现 poll 接口，使用 epoll 监听的资源需要实现 poll 钩子。

/* The target file descriptor must support poll */
if (!file_can_poll(tf.file))
    goto error_tgt_fput;

保证被操作的 fd 不是 epoll 实例自身，并且 epfd 对应的文件确实是一个 epoll 文件。

/*
 * We have to check that the file structure underneath the file descriptor
 * the user passed to us _is_ an eventpoll file. And also we do not permit
 * adding an epoll file descriptor inside itself.
 */
if (f.file == tf.file || !is_file_epoll(f.file))
    goto error_tgt_fput;

如果获得的是一个真正的 epoll 实例句柄，就通过匿名文件的 private_data 获取之前创建的 eventpoll 实例。

struct eventpoll *ep;

/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = f.file->private_data;

在 eventpoll 实例的红黑树中查找待添加的套接字。

struct epitem *epi;

/*
 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
 * above, we can be sure to be able to use the item looked up by
 * ep_find() till we release the mutex.
 */
epi = ep_find(ep, tf.file, fd);

eventpoll 实例中通过红黑树跟踪了当前监听的所有文件的描述符，而红黑树的根就保存在 eventpoll 中。

struct eventpoll {
    /* RB tree root used to store monitored fd structs */
    struct rb_root_cached rbr;
    // ...
};

对于每个被监听的文件描述符，都有一个 epitem 与之对应，epitem 就作为红黑树的节点保存在红黑树中。作为二叉树的节点，epitem 必须提供比较能力，以便可以按照大小顺序构建出一棵有序的二叉树。其排序能力是靠 epitem 中的 epoll_filefd 结构体来完成的。

struct epitem {
    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;
    //...
};

/*
 * Identifies one monitored target by the pair (file pointer, fd number).
 * Both members are consulted by ep_cmp_ffd() when ordering RB tree keys.
 */
struct epoll_filefd {
    struct file *file;
    int fd;
} __packed;

/*
 * Compare RB tree keys.
 *
 * Orders two epoll_filefd keys by their file pointer first; only when both
 * keys refer to the same file is the difference of the fd numbers used as
 * the tie-breaker. Returns a positive value if p1 sorts after p2, a
 * negative value if it sorts before p2, and 0 when file and fd both match.
 */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1:
            (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

epoll_filefd 之间的比较首先按照文件地址大小排序。如果文件地址大小相同，就按照文件描述符的大小来排序。

在进行完红黑树查找后，如果是一个 ADD 操作，并且在树中没有找到对应的二叉树节点的话，就会调用 ep_insert 添加。

case EPOLL_CTL_ADD:
    if (!epi) {
        epds.events |= EPOLLERR | EPOLLHUP;
        error = ep_insert(ep, &epds, tf.file, fd, full_check);
    } else
        error = -EEXIST;
    break;

2.1 ep_insert 函数

函数 ep_insert 的定义位于 fs/eventpoll.c 文件中的 1418 行。

/*
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
            struct file *tfile, int fd, int full_check)

首先，判断当前监控的文件数量是否超过了 /proc/sys/fs/epoll/max_user_watches 中预设的最大值，如果超过了就返回错误。
```
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
    return -ENOSPC;
```

然后分配 epitem 对象并初始化。

struct epitem *epi;

if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
    return -ENOMEM;

/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;

将 epitem 对象和目标文件 tfile 关联起来，并把 epitem 对象添加到 eventpoll 对象里的红黑树中。

/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);

/*
 * Add the current item to the RB tree. All RB tree operations are
 * protected by "mtx", and ep_insert() is called with "mtx" held.
 */
ep_rbtree_insert(ep, epi);

为加入的文件描述符设置回调函数 ep_poll_callback。如果对应的文件描述符上有事件发生，就会调用这个回调函数，比如套接字缓冲区有数据时，就会回调这个函数。而 ep_poll_callback 这个回调函数的设置是通过回调函数 ep_ptable_queue_proc 来完成的。

struct ep_pqueue epq;

/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

init_poll_funcptr 函数的定义如下：

/*
 * Initialize a poll_table: record qproc as its queueing callback and set
 * the event key to an all-ones mask, i.e. every event bit is enabled.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_qproc = qproc;
    pt->_key   = ~(__poll_t)0; /* all events enabled */
}

需要注意的是，init_poll_funcptr 注册了所有的事件。

ep_ptable_queue_proc 函数的定义如下：

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * For the epitem obtained from @pt it allocates an eppoll_entry, installs
 * ep_poll_callback() as that entry's wakeup function and queues it on
 * @whead. If the allocation fails, or an earlier call already left
 * epi->nwait negative, epi->nwait is set to -1 to flag the error.
 * @file is not referenced in this function.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    /* A negative nwait short-circuits the test, so no allocation is attempted. */
    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        /* Remember which wait queue head and which epitem this entry serves. */
        pwq->whead = whead;
        pwq->base = epi;
        /* Items registered with EPOLLEXCLUSIVE are queued as exclusive waiters. */
        if (epi->event.events & EPOLLEXCLUSIVE)
            add_wait_queue_exclusive(whead, &pwq->wait);
        else
            add_wait_queue(whead, &pwq->wait);
        /* Track the entry on the epitem's own list and count it. */
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}

2.2 ep_poll_callback 函数

函数 ep_poll_callback 的定义位于 fs/eventpoll.c 文件中 1118 行。

/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)

这个函数的作用非常重要，它将内核事件真正地和 epoll 对象联系了起来。

首先，通过这个文件的 wait_queue_entry_t 实例找到对应的 epitem 实例。一旦获得了 epitem 实例就可以获得对应的 eventpoll 实例了。

struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;

ep_item_from_wait 函数的定义为：

/*
 * Get the "struct epitem" from a wait queue pointer.
 *
 * @p is treated as the "wait" member embedded in a struct eppoll_entry:
 * container_of() recovers the enclosing entry, and that entry's "base"
 * pointer is returned.
 */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
    return container_of(p, struct eppoll_entry, wait)->base;
}

然后，对发生的事件进行过滤。为什么需要过滤呢？为了性能考虑，ep_insert 在注册回调时，使用 init_poll_funcptr 函数注册被监控文件的所有事件，而实际用户关心的事件不一定和内核发送的事件匹配。比如，用户向内核订阅了一个套接字可读的事件，但是在某个时刻套接字可写的事件发生时，就并不需要向用户空间传递这个事件。
```
/*
 * Check the events coming with the callback. At this stage, not
 * every device reports the events in the "key" parameter of the
 * callback. We need to be able to handle both cases here, hence the
 * test for "key" != NULL before the event match test.
 */
if (pollflags && !(pollflags & epi->event.events))
    goto out_unlock;
```

接下来，判断是否需要把该事件传递给用户空间。

/*
 * If we are transferring events to userspace, we can hold no locks
 * (because we're accessing user memory, and because of linux f_op->poll()
 * semantics). All the events that happen during that period of time are
 * chained in ep->ovflist and requeued later on.
 */
if (ep->ovflist != EP_UNACTIVE_PTR) {
    if (epi->next == EP_UNACTIVE_PTR) {
        epi->next = ep->ovflist;
        ep->ovflist = epi;
        if (epi->ws) {
            /*
             * Activate ep->ws since epi->ws may get
             * deactivated at any time.
             */
            __pm_stay_awake(ep->ws);
        }

    }
    goto out_unlock;
}

如果需要（即当前没有在向用户空间传递事件），而且该事件对应的 epitem 不在 eventpoll 的就绪队列 rdllist 中，就把它放入该队列，以便将该事件传递给用户空间。

/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(epi)) {
    list_add_tail(&epi->rdllink, &ep->rdllist);
    ep_pm_stay_awake_rcu(epi);
}

我们知道，当我们调用 epoll_wait 时，调用的进程会被挂起，在内核看来调用进程陷入了休眠。如果该 epoll 实例上对应描述符有事件发生，这个休眠的进程应该被唤醒来处理事件。下面的代码就是这个作用。 wake_up_locked 函数唤醒当前 eventpoll 上等待的进程。

/*
 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 * wait list.
 */
if (waitqueue_active(&ep->wq)) {
    if ((epi->event.events & EPOLLEXCLUSIVE) &&
                !(pollflags & POLLFREE)) {
        switch (pollflags & EPOLLINOUT_BITS) {
        case EPOLLIN:
            if (epi->event.events & EPOLLIN)
                ewake = 1;
            break;
        case EPOLLOUT:
            if (epi->event.events & EPOLLOUT)
                ewake = 1;
            break;
        case 0:
            ewake = 1;
            break;
        }
    }
    wake_up_locked(&ep->wq);
}

三、epoll_wait 系统调用

系统调用 epoll_wait 的定义位于 fs/eventpoll.c 文件中的 2197 行。

/*
 * epoll_wait(2) entry point: passes all four arguments through to
 * do_epoll_wait() unchanged and returns its result.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
        int, maxevents, int, timeout)
{
    return do_epoll_wait(epfd, events, maxevents, timeout);
}

主要工作就是调用 do_epoll_wait 函数。

3.1 do_epoll_wait 函数

函数 do_epoll_wait 的定义位于 fs/eventpoll.c 文件中的 2155 行。

/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
            int maxevents, int timeout)

首先进行一系列的检查，比如传入的 maxevents 应该大于 0。

/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
    return -EINVAL;

/* Verify that the area passed by the user is writeable */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
    return -EFAULT;

与 epoll_ctl 类似，通过 epoll 实例的文件描述符 epfd 找到对应的匿名文件，并进行检查。

/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
if (!f.file)
    return -EBADF;

/*
 * We have to check that the file structure underneath the fd
 * the user passed to us _is_ an eventpoll file.
 */
if (!is_file_epoll(f.file))
    goto error_fput;

获取 eventpoll 实例。

/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = f.file->private_data;

调用 ep_poll 完成事件的收集并传递到用户空间。

/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);

3.2 ep_poll 函数

函数 ep_poll 的定义位于 fs/eventpoll.c 文件中的 1743 行。

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
*           event buffer.
*
* @ep: Pointer to the eventpoll context.
* @events: Pointer to the userspace buffer where the ready events should be
*          stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
*           milliseconds. If the @timeout is zero, the function will not block,
*           while if the @timeout is less than zero, the function will block
*           until at least one event has been retrieved (or an error
*           occurred).
*
* Returns: Returns the number of ready events which have been fetched, or an
*          error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
        int maxevents, long timeout)

首先，根据不同的超时时间做不同的处理。如果大于 0 就计算超时时间，如果等于 0 就立即检查是否有事件发生。

if (timeout > 0) {
    struct timespec64 end_time = ep_set_mstimeout(timeout);

    slack = select_estimate_accuracy(&end_time);
    to = &expires;
    *to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
    /*
     * Avoid the unnecessary trip to the wait queue loop, if the
     * caller specified a non blocking operation.
     */
    timed_out = 1;
    spin_lock_irq(&ep->wq.lock);
    goto check_events;
}

检查当前是否有事件发生，如果没有，就把当前进程加入到 eventpoll 的等待队列 wq 中，这样做的目的是当有事件发生时， ep_poll_callback 函数可以把该等待进程唤醒。

if (!ep_events_available(ep)) {
    /*
     * Busy poll timed out.  Drop NAPI ID for now, we can add
     * it back in when we have moved a socket with a valid NAPI
     * ID onto the ready list.
     */
    ep_reset_busy_poll_napi_id(ep);

    /*
     * We don't have any available event to return to the caller.
     * We need to sleep here, and we will be wake up by
     * ep_poll_callback() when events will become available.
     */
    init_waitqueue_entry(&wait, current);
    __add_wait_queue_exclusive(&ep->wq, &wait);

接下来是一个无限循环，循环中通过调用 schedule_hrtimeout_range ，将当前进程陷入休眠，CPU 时间被调度器调度给其他进程使用，当然，当前进程可能会被唤醒，唤醒的条件包括下面四种：

1. 当前进程超时；
2. 当前进程收到一个 signal 信号；
3. 某个描述符上有事件发生；
4. 当前进程被 CPU 重新调度，进入 for 循环重新判断，如果没有满足前三个条件，就又重新进入休眠。

对应的 1、2、3 都会通过 break 跳出循环，直接返回。

for (;;) {
    /*
     * We don't want to sleep if the ep_poll_callback() sends us
     * a wakeup in between. That's why we set the task state
     * to TASK_INTERRUPTIBLE before doing the checks.
     */
    set_current_state(TASK_INTERRUPTIBLE);
    /*
     * Always short-circuit for fatal signals to allow
     * threads to make a timely exit without the chance of
     * finding more events available and fetching
     * repeatedly.
     */
    if (fatal_signal_pending(current)) {
        res = -EINTR;
        break;
    }
    if (ep_events_available(ep) || timed_out)
        break;
    if (signal_pending(current)) {
        res = -EINTR;
        break;
    }

    spin_unlock_irq(&ep->wq.lock);
    if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
        timed_out = 1;

    spin_lock_irq(&ep->wq.lock);
}

如果进程从休眠中返回，则将当前进程从 eventpoll 的等待队列中删除，并且设置当前进程为 TASK_RUNNING 状态。
```
    __remove_wait_queue(&ep->wq, &wait);
    __set_current_state(TASK_RUNNING);
}
```

最后，调用 ep_send_events 函数将事件拷贝到用户空间。

/*
 * Try to transfer events to user space. In case we get 0 events and
 * there's still timeout left over, we go trying again in search of
 * more luck.
 */
if (!res && eavail &&
    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
    goto fetch_events;

3.3 ep_send_events 函数

ep_send_events 函数会将 ep_send_events_proc 函数作为回调函数来调用 ep_scan_ready_list 函数。 ep_scan_ready_list 函数循环调用 ep_send_events_proc 函数对每个已经就绪的事件进行处理。ep_send_events_proc 函数处理就绪事件时，会再次调用每个文件描述符的 poll 函数，这个是为了确认注册的事件在这个时刻还是有效的。

可以看到，虽然 ep_send_events_proc 已经尽可能的保证用户空间获得的事件通知都是真实有效的，但是依然有一定的概率，当 ep_send_events_proc 再次调用文件上的 poll 函数后，用户空间处理该事件之前，对应的事件失效了。这也就是为什么 推荐使用非阻塞套接字配合 epoll 使用 的原因。

在进行简单的事件掩码校验之后，ep_send_events_proc 通过使用函数 __put_user 将事件结构体拷贝到用户空间需要的数据结构中。

/*
 * Walk the ready items on @head and copy their events to the user buffer
 * described by @priv (interpreted as a struct ep_send_events_data).
 *
 * Each item is unlinked from the list, polled again through ep_item_poll()
 * with a poll_table whose queue callback is NULL, and reported only if the
 * returned mask is non-zero. The number of events delivered is left in
 * esed->res; if the very first copy to userspace faults, esed->res is set
 * to -EFAULT instead. The function's own return value is always 0.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
                   void *priv)
{
    struct ep_send_events_data *esed = priv;
    __poll_t revents;
    struct epitem *epi;
    struct epoll_event __user *uevent;
    struct wakeup_source *ws;
    poll_table pt;

    init_poll_funcptr(&pt, NULL);

    /*
     * We can loop without lock because we are passed a task private list.
     * Items cannot vanish during the loop because ep_scan_ready_list() is
     * holding "mtx" during this call.
     */
    for (esed->res = 0, uevent = esed->events;
         !list_empty(head) && esed->res < esed->maxevents;) {
        epi = list_first_entry(head, struct epitem, rdllink);

        /*
         * Activate ep->ws before deactivating epi->ws to prevent
         * triggering auto-suspend here (in case we reactive epi->ws
         * below).
         *
         * This could be rearranged to delay the deactivation of epi->ws
         * instead, but then epi->ws would temporarily be out of sync
         * with ep_is_linked().
         */
        ws = ep_wakeup_source(epi);
        if (ws) {
            if (ws->active)
                __pm_stay_awake(ep->ws);
            __pm_relax(ws);
        }

        list_del_init(&epi->rdllink);

        revents = ep_item_poll(epi, &pt, 1);

        /*
         * If the event mask intersect the caller-requested one,
         * deliver the event to userspace. Again, ep_scan_ready_list()
         * is holding "mtx", so no operations coming from userspace
         * can change the item.
         */
        if (revents) {
            if (__put_user(revents, &uevent->events) ||
                __put_user(epi->event.data, &uevent->data)) {
                /*
                 * The copy faulted: put the item back at the front of
                 * the list and stop. Events already delivered keep
                 * their count in esed->res; only a fault on the first
                 * event is reported as -EFAULT.
                 */
                list_add(&epi->rdllink, head);
                ep_pm_stay_awake(epi);
                if (!esed->res)
                    esed->res = -EFAULT;
                return 0;
            }
            esed->res++;
            uevent++;
            /* One-shot items keep only the bits covered by EP_PRIVATE_BITS. */
            if (epi->event.events & EPOLLONESHOT)
                epi->event.events &= EP_PRIVATE_BITS;
            else if (!(epi->event.events & EPOLLET)) {
                /*
                 * If this file has been added with Level
                 * Trigger mode, we need to insert back inside
                 * the ready list, so that the next call to
                 * epoll_wait() will check again the events
                 * availability. At this point, no one can insert
                 * into ep->rdllist besides us. The epoll_ctl()
                 * callers are locked out by
                 * ep_scan_ready_list() holding "mtx" and the
                 * poll callback will queue them in ep->ovflist.
                 */
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);
            }
        }
    }

    return 0;
}

在 ep_send_events_proc 函数的最后，针对 Level Trigger 模式，当前的 epitem 对象被重新加到 eventpoll 的就绪列表中，这样在下一次调用 epoll_wait 时，这些 epitem 对象就会被重新处理。

四、select 系统调用

系统调用 select 的定义位于 fs/select.c 文件中的 697 行。

/*
 * select(2) entry point: passes all five arguments through to
 * kern_select() unchanged and returns its result.
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
    fd_set __user *, exp, struct timeval __user *, tvp)
{
    return kern_select(n, inp, outp, exp, tvp);
}

主要工作就是调用 kern_select 函数。

4.1 kern_select 函数

函数 kern_select 的定义位于 fs/select.c 文件中的 673 行。

/*
 * Kernel-side body of select(): converts the optional user-supplied timeout,
 * runs core_sys_select(), then hands the result to
 * poll_select_copy_remaining() together with the user's timeval pointer.
 *
 * Returns -EFAULT if the timeval cannot be copied from userspace, -EINVAL if
 * poll_select_set_timeout() rejects it, and otherwise whatever
 * poll_select_copy_remaining() returns.
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timeval __user *tvp)
{
    /* "to" stays NULL when no timeout was supplied. */
    /* NOTE(review): presumably NULL means wait with no time limit; confirm in core_sys_select(). */
    struct timespec64 end_time, *to = NULL;
    struct timeval tv;
    int ret;

    if (tvp) {
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        /*
         * Normalize the timeval: whole seconds contained in tv_usec are
         * carried into the seconds argument, and the remaining
         * microseconds are scaled to nanoseconds.
         */
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);
    ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

    return ret;
}

kern_select 函数的主要作用是转换超时时间到内核空间，执行 core_sys_select 函数，并更新剩余的时间。

4.2 core_sys_select 函数

core_sys_select 函数定义位于 fs/select.c 文件中的 594 行。

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec64 *end_time)
{
    fd_set_bits fds;

    //...

    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time);

    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;
    //...
}

core_sys_select 函数的主要作用是准备调用 do_select 所需的参数，并更新结果。

4.3 do_select 函数

do_select 函数定义位于 fs/select.c 文件中的 449 行。

static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{

    //...
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                i += BITS_PER_LONG;
                continue;
            }

            for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                struct fd f;
                f = fdget(i);
                if (f.file) {
                    //...
                }
            }
        }

        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;
    }
    //...
}

do_select 函数的主要逻辑是监听多个 fd ，只要这些 fd 中有一个 fd 有事件发生，进程就会从休眠中被唤醒。并依次遍历所有的 fd 来判断到底是哪个 fd 有事件发生。这也是 select 相比于 epoll 效率低的主要原因。

五、poll 系统调用

系统调用 poll 的定义位于 fs/select.c 文件中的 1012 行。

SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    struct timespec64 end_time, *to = NULL;
    int ret;

    if (timeout_msecs >= 0) {
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to);

    //...
}

主要作用与 select 实现中的 kern_select 函数作用类似，转换时间并调用 do_sys_poll 函数。

5.1 do_sys_poll 函数

函数 do_sys_poll 的定义位于 fs/select.c 文件中的 926 行。

static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec64 *end_time)
{
    //...
    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        //...
        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;
        //...
    }

    poll_initwait(&table);
    fdcount = do_poll(head, &table, end_time);
    poll_freewait(&table);

    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
      }
    //...
}

do_sys_poll 函数的主要作用与 select 实现中的 core_sys_select 函数作用类似，准备调用 do_poll 所需的参数，并更新结果。

5.2 do_poll 函数

函数 do_poll 的定义位于 fs/select.c 文件中的 837 行。

static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
           struct timespec64 *end_time)
{
    //...
    for (;;) {
        struct poll_list *walk;

        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill poll_table->_qproc, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt, &can_busy_loop,
                          busy_flag)) {

                    //...
                }
            }
        }
        //...
        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    return count;
}

同样与 select 实现中的 do_select 类似，do_poll 函数的主要逻辑是监听多个 fd ，只要这些 fd 中有一个 fd 有事件发生，进程就会从休眠中被唤醒。并依次遍历所有的 fd 来判断到底是哪个 fd 有事件发生。所以 poll 与 select 一样，效率较低。