The Linux kernel version used throughout this article is 4.19.194.
1. The epoll_create system call

The system calls `epoll_create1` and `epoll_create` are defined in fs/eventpoll.c at lines 1979 and 1984, respectively.

```c
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;

    return do_epoll_create(0);
}
```

- The main work is simply calling the `do_epoll_create` function.
1.1 The do_epoll_create function

The function `do_epoll_create` is defined in fs/eventpoll.c at line 1936.

```c
/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
```
First, the `flags` argument passed in is validated.

```c
/* Check the EPOLL_* constant for consistency.  */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

if (flags & ~EPOLL_CLOEXEC)
    return -EINVAL;
```
Then, the memory needed for the `eventpoll` structure is allocated and initialized.

```c
struct eventpoll *ep = NULL;

/*
 * Create the internal data structure ("struct eventpoll").
 */
error = ep_alloc(&ep);
if (error < 0)
    return error;
```
Next, a free file descriptor `fd` and an anonymous file `file` are allocated. Note that the `eventpoll` instance keeps a reference to the anonymous file, and `fd_install` associates the file descriptor with that file. Also note that the `anon_inode_getfile` call stores the `eventpoll` object as the anonymous file's `private_data`, so that later the `eventpoll` object can be found quickly from the epoll instance's file descriptor. Finally, the file descriptor `fd` is returned to the caller as the epoll handle. An epoll instance is really nothing more than an anonymous file.

```c
/*
 * Creates all the items needed to setup an eventpoll file. That is,
 * a file structure and a free file descriptor.
 */
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
    error = fd;
    goto out_free_ep;
}
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                          O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
    error = PTR_ERR(file);
    goto out_free_fd;
}
ep->file = file;
fd_install(fd, file);
return fd;
```
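From user space the whole path above is triggered by a single library call. A minimal sketch of standard epoll API usage (error handling trimmed):

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
    /* Backed by do_epoll_create() in the kernel; the returned fd refers
     * to the anonymous "[eventpoll]" file created above. */
    int epfd = epoll_create1(EPOLL_CLOEXEC);
    if (epfd == -1) {
        perror("epoll_create1");
        exit(EXIT_FAILURE);
    }

    /* ... register descriptors with epoll_ctl() and wait with epoll_wait() ... */

    close(epfd);
    return 0;
}
```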
2. The epoll_ctl system call

The system call `epoll_ctl` is defined in fs/eventpoll.c at line 1997.

```c
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
```
First, the anonymous file behind the epoll instance is looked up.

```c
f = fdget(epfd);
if (!f.file)
    goto error_return;
```
Then, the file corresponding to the target descriptor (for example, a socket) is looked up.

```c
/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
    goto error_fput;
```
Next comes a series of checks to ensure that the parameters passed in by the user are valid.
Check whether the target file implements the `poll` interface; any resource monitored with epoll must provide a `poll` hook.

```c
/* The target file descriptor must support poll */
if (!file_can_poll(tf.file))
    goto error_tgt_fput;
```
Make sure that the descriptor being operated on is not the epoll instance itself, and that `epfd` really does refer to an epoll file.

```c
/*
 * We have to check that the file structure underneath the file descriptor
 * the user passed to us _is_ an eventpoll file. And also we do not permit
 * adding an epoll file descriptor inside itself.
 */
if (f.file == tf.file || !is_file_epoll(f.file))
    goto error_tgt_fput;
```
If the handle really is an epoll instance, the `eventpoll` object created earlier is retrieved from the anonymous file's `private_data`.

```c
struct eventpoll *ep;

/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = f.file->private_data;
```
Look up the descriptor to be added inside the `eventpoll` instance.

```c
struct epitem *epi;

/*
 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
 * above, we can be sure to be able to use the item looked up by
 * ep_find() till we release the mutex.
 */
epi = ep_find(ep, tf.file, fd);
```
The `eventpoll` instance tracks all currently monitored file descriptors in a red-black tree, whose root is stored in the `eventpoll` structure itself.

```c
struct eventpoll {
    /* RB tree root used to store monitored fd structs */
    struct rb_root_cached rbr;
    // ...
};
```
Every monitored file descriptor has a corresponding `epitem`, which is stored as a node of that red-black tree. As a binary-tree node, an `epitem` must be comparable so that an ordered tree can be built; the ordering is provided by the `epoll_filefd` structure embedded in `epitem`.

```c
struct epitem {
    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;
    //...
};

struct epoll_filefd {
    struct file *file;
    int fd;
} __packed;

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1:
            (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
```
Two `epoll_filefd` keys are compared first by the address of the `struct file`; if the file pointers are equal, the comparison falls back to the numeric file descriptor. After the red-black tree lookup, if the operation is an ADD and no matching node was found in the tree, `ep_insert` is called to add one.

```c
case EPOLL_CTL_ADD:
    if (!epi) {
        epds.events |= EPOLLERR | EPOLLHUP;
        error = ep_insert(ep, &epds, tf.file, fd, full_check);
    } else
        error = -EEXIST;
    break;
```
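For reference, the user-space side of this code path is a plain `epoll_ctl` call. A minimal sketch (the descriptors `epfd` and `sockfd` and the helper name are assumptions for illustration; error handling trimmed):

```c
#include <stdio.h>
#include <sys/epoll.h>

/* Register sockfd for readable events on the epoll instance epfd.
 * In the kernel this lands in the EPOLL_CTL_ADD branch shown above,
 * which calls ep_insert() when the fd is not yet in the RB tree. */
static int watch_readable(int epfd, int sockfd)
{
    struct epoll_event ev = {
        .events = EPOLLIN,    /* interested in "readable" */
        .data.fd = sockfd,    /* echoed back untouched by epoll_wait() */
    };

    if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl(EPOLL_CTL_ADD)");
        return -1;
    }
    return 0;
}
```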
2.1 The ep_insert function

The function `ep_insert` is defined in fs/eventpoll.c at line 1418.

```c
/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                     struct file *tfile, int fd, int full_check)
```
First, check whether the number of files currently being watched exceeds the limit configured in /proc/sys/fs/epoll/max_user_watches; if it does, return an error.

```c
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
    return -ENOSPC;
```
Then allocate and initialize an `epitem` object.

```c
struct epitem *epi;

if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
    return -ENOMEM;

/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
```
Link the `epitem` object to the target file `tfile`, and insert the `epitem` into the red-black tree inside the `eventpoll` object.

```c
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);

/*
 * Add the current item to the RB tree. All RB tree operations are
 * protected by "mtx", and ep_insert() is called with "mtx" held.
 */
ep_rbtree_insert(ep, epi);
```
Set up the callback `ep_poll_callback` for the newly added file descriptor. Whenever an event occurs on that descriptor, for example when data arrives in a socket's receive buffer, this callback is invoked. The installation of `ep_poll_callback` is in turn performed by another callback, `ep_ptable_queue_proc`.

```c
struct ep_pqueue epq;

/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
```
The `init_poll_funcptr` function is defined as follows:

```c
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_qproc = qproc;
    pt->_key   = ~(__poll_t)0; /* all events enabled */
}
```
Note that `init_poll_funcptr` enables all events in the poll table key. The `ep_ptable_queue_proc` function is defined as follows:

```c
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        if (epi->event.events & EPOLLEXCLUSIVE)
            add_wait_queue_exclusive(whead, &pwq->wait);
        else
            add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}
```
2.2 The ep_poll_callback function

The function `ep_poll_callback` is defined in fs/eventpoll.c at line 1118.

```c
/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
```
This function plays a crucial role: it is what actually connects kernel events to the epoll object. First, the file's `wait_queue_entry_t` instance is used to find the corresponding `epitem`; once the `epitem` is known, the owning `eventpoll` instance follows directly.

```c
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
```
The `ep_item_from_wait` function is defined as:

```c
/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
    return container_of(p, struct eppoll_entry, wait)->base;
}
```
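The `container_of` pattern used here is general: given a pointer to a member embedded in a larger structure, it recovers a pointer to the enclosing structure. A tiny self-contained illustration in user space (the types and the simplified macro are assumptions for demonstration, not the kernel definitions):

```c
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for the kernel macro: subtract the member's offset
 * from the member pointer to get back to the enclosing object. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct wait_entry { int flags; };

struct my_entry {
    int id;
    struct wait_entry wait;    /* embedded member, like eppoll_entry.wait */
};

int main(void)
{
    struct my_entry e = { .id = 42 };
    struct wait_entry *w = &e.wait;

    /* Recover the enclosing my_entry from the embedded wait pointer,
     * just as ep_item_from_wait() recovers the eppoll_entry. */
    struct my_entry *back = container_of(w, struct my_entry, wait);
    printf("%d\n", back->id);  /* prints 42 */
    return 0;
}
```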
Next, the incoming event is filtered. Why is filtering needed? For performance reasons, when `ep_insert` registers the callback it uses `init_poll_funcptr` to subscribe to all events on the monitored file, but the events the user actually cares about do not necessarily match what the kernel reports. For example, if the user only subscribed to "socket readable" and at some moment a "socket writable" event fires, there is no need to pass that event up to user space.

```c
/*
 * Check the events coming with the callback. At this stage, not
 * every device reports the events in the "key" parameter of the
 * callback. We need to be able to handle both cases here, hence the
 * test for "key" != NULL before the event match test.
 */
if (pollflags && !(pollflags & epi->event.events))
    goto out_unlock;
```
Next, check whether events are currently being transferred to user space. If they are, the item cannot be put on the ready list directly; it is chained on `ep->ovflist` instead and requeued later.

```c
/*
 * If we are transferring events to userspace, we can hold no locks
 * (because we're accessing user memory, and because of linux f_op->poll()
 * semantics). All the events that happen during that period of time are
 * chained in ep->ovflist and requeued later on.
 */
if (ep->ovflist != EP_UNACTIVE_PTR) {
    if (epi->next == EP_UNACTIVE_PTR) {
        epi->next = ep->ovflist;
        ep->ovflist = epi;
        if (epi->ws) {
            /*
             * Activate ep->ws since epi->ws may get
             * deactivated at any time.
             */
            __pm_stay_awake(ep->ws);
        }
    }
    goto out_unlock;
}
```
Otherwise, if the `epitem` for this event is not already on the `eventpoll`'s ready list, it is added there so that the event can be delivered to user space.

```c
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(epi)) {
    list_add_tail(&epi->rdllink, &ep->rdllist);
    ep_pm_stay_awake_rcu(epi);
}
```
As we know, when a process calls `epoll_wait` it may be suspended; from the kernel's point of view the caller goes to sleep. When an event occurs on one of the descriptors monitored by this epoll instance, that sleeping process should be woken up to handle it. That is exactly what the code below does: `wake_up_locked` wakes up the processes waiting on this `eventpoll`.

```c
/*
 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 * wait list.
 */
if (waitqueue_active(&ep->wq)) {
    if ((epi->event.events & EPOLLEXCLUSIVE) &&
        !(pollflags & POLLFREE)) {
        switch (pollflags & EPOLLINOUT_BITS) {
        case EPOLLIN:
            if (epi->event.events & EPOLLIN)
                ewake = 1;
            break;
        case EPOLLOUT:
            if (epi->event.events & EPOLLOUT)
                ewake = 1;
            break;
        case 0:
            ewake = 1;
            break;
        }
    }
    wake_up_locked(&ep->wq);
}
```
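The whole chain is triggered from the producer side of the monitored file: when new data arrives, the driver (or the network stack) wakes up its wait queue, and the wake-up walks the queue entries and calls `ep_poll_callback` for every entry that `ep_ptable_queue_proc` hooked there. A minimal sketch, continuing the hypothetical `my_dev` device from the earlier example (again an assumption for illustration):

```c
/* Producer side of the hypothetical device: called when new data arrives,
 * e.g. from an interrupt bottom half or the device's write path. */
static void my_dev_data_arrived(struct my_dev *dev)
{
    dev->data_ready = true;

    /*
     * Walk dev->readq and invoke each entry's wakeup function.
     * For entries added by ep_ptable_queue_proc(), that function is
     * ep_poll_callback(), which queues the epitem on the eventpoll
     * ready list and wakes up any epoll_wait() callers.
     */
    wake_up_interruptible(&dev->readq);
}
```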
3. The epoll_wait system call

The system call `epoll_wait` is defined in fs/eventpoll.c at line 2197.

```c
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
{
    return do_epoll_wait(epfd, events, maxevents, timeout);
}
```

- The main work is simply calling the `do_epoll_wait` function.
3.1 The do_epoll_wait function

The function `do_epoll_wait` is defined in fs/eventpoll.c at line 2155.

```c
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                         int maxevents, int timeout)
```
First come a series of checks, for example that the `maxevents` argument is greater than 0 and that the user-supplied buffer is writable.

```c
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
    return -EINVAL;

/* Verify that the area passed by the user is writeable */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
    return -EFAULT;
```
As in `epoll_ctl`, the anonymous file behind the epoll descriptor is looked up and validated.

```c
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
if (!f.file)
    return -EBADF;

/*
 * We have to check that the file structure underneath the fd
 * the user passed to us _is_ an eventpoll file.
 */
if (!is_file_epoll(f.file))
    goto error_fput;
```
Retrieve the `eventpoll` instance.

```c
/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = f.file->private_data;
```
Finally, call `ep_poll` to collect the events and deliver them to user space.

```c
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);
```
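Putting the three system calls together, the user-space side is a simple event loop around `epoll_wait`. A minimal sketch (the epoll instance `epfd` is assumed to have been created and populated as in the earlier examples; error handling trimmed):

```c
#include <stdio.h>
#include <sys/epoll.h>

#define MAX_EVENTS 64

/* Block in the kernel's ep_poll() until events are ready, then handle them. */
static void event_loop(int epfd)
{
    struct epoll_event events[MAX_EVENTS];

    for (;;) {
        /* -1 = block until at least one event (or a signal) arrives. */
        int n = epoll_wait(epfd, events, MAX_EVENTS, -1);
        if (n == -1) {
            perror("epoll_wait");
            break;
        }

        for (int i = 0; i < n; i++) {
            if (events[i].events & EPOLLIN) {
                int fd = events[i].data.fd;
                /* fd is readable: read()/accept() here ... */
                (void)fd;
            }
        }
    }
}
```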
3.2 The ep_poll function

The function `ep_poll` is defined in fs/eventpoll.c at line 1743.

```c
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
```
First, the timeout is handled according to its value: if it is greater than 0, the expiry time is computed; if it is 0, the function only checks once whether any event is pending, without blocking.

```c
if (timeout > 0) {
    struct timespec64 end_time = ep_set_mstimeout(timeout);

    slack = select_estimate_accuracy(&end_time);
    to = &expires;
    *to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
    /*
     * Avoid the unnecessary trip to the wait queue loop, if the
     * caller specified a non blocking operation.
     */
    timed_out = 1;
    spin_lock_irq(&ep->wq.lock);
    goto check_events;
}
```
Check whether any event is already pending. If not, the current process is added to the `eventpoll`'s wait queue `wq`, so that `ep_poll_callback` can wake it up once an event occurs.

```c
if (!ep_events_available(ep)) {
    /*
     * Busy poll timed out. Drop NAPI ID for now, we can add
     * it back in when we have moved a socket with a valid NAPI
     * ID onto the ready list.
     */
    ep_reset_busy_poll_napi_id(ep);

    /*
     * We don't have any available event to return to the caller.
     * We need to sleep here, and we will be wake up by
     * ep_poll_callback() when events will become available.
     */
    init_waitqueue_entry(&wait, current);
    __add_wait_queue_exclusive(&ep->wq, &wait);
```
Next comes an infinite loop. Inside it, `schedule_hrtimeout_range` puts the current process to sleep and the scheduler hands the CPU to other processes. The process can of course be woken up again, under one of the following four conditions:

- the timeout expires;
- the process receives a signal;
- an event occurs on one of the monitored descriptors;
- the process is simply rescheduled by the CPU, re-enters the `for` loop, and, if none of the first three conditions holds, goes back to sleep.

In cases 1, 2 and 3 the loop is left via `break` and the function returns.

```c
for (;;) {
    /*
     * We don't want to sleep if the ep_poll_callback() sends us
     * a wakeup in between. That's why we set the task state
     * to TASK_INTERRUPTIBLE before doing the checks.
     */
    set_current_state(TASK_INTERRUPTIBLE);
    /*
     * Always short-circuit for fatal signals to allow
     * threads to make a timely exit without the chance of
     * finding more events available and fetching
     * repeatedly.
     */
    if (fatal_signal_pending(current)) {
        res = -EINTR;
        break;
    }
    if (ep_events_available(ep) || timed_out)
        break;
    if (signal_pending(current)) {
        res = -EINTR;
        break;
    }

    spin_unlock_irq(&ep->wq.lock);
    if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
        timed_out = 1;

    spin_lock_irq(&ep->wq.lock);
}
```
When the process comes back from sleep, it is removed from the `eventpoll`'s wait queue and its state is set back to `TASK_RUNNING`.

```c
    __remove_wait_queue(&ep->wq, &wait);
    __set_current_state(TASK_RUNNING);
}
```
Finally, `ep_send_events` is called to copy the events out to user space.

```c
/*
 * Try to transfer events to user space. In case we get 0 events and
 * there's still timeout left over, we go trying again in search of
 * more luck.
 */
if (!res && eavail &&
    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
    goto fetch_events;
```
3.3 The ep_send_events function

The `ep_send_events` function calls `ep_scan_ready_list` with `ep_send_events_proc` as the callback; `ep_scan_ready_list` then invokes `ep_send_events_proc` to walk the ready list and handle each ready event in turn. While processing a ready event, `ep_send_events_proc` calls the file descriptor's `poll` function again, in order to confirm that the registered event is still valid at that moment.
As you can see, although `ep_send_events_proc` does its best to ensure that the event notifications handed to user space are genuine and valid, there is still some chance that an event becomes stale after `ep_send_events_proc` has re-polled the file but before user space gets around to handling it. This is exactly why it is recommended to use non-blocking sockets together with epoll.
After a simple check of the event mask, `ep_send_events_proc` uses `__put_user` to copy the event into the user-space buffer.
```c
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
                                    void *priv)
{
    struct ep_send_events_data *esed = priv;
    __poll_t revents;
    struct epitem *epi;
    struct epoll_event __user *uevent;
    struct wakeup_source *ws;
    poll_table pt;

    init_poll_funcptr(&pt, NULL);

    /*
     * We can loop without lock because we are passed a task private list.
     * Items cannot vanish during the loop because ep_scan_ready_list() is
     * holding "mtx" during this call.
     */
    for (esed->res = 0, uevent = esed->events;
         !list_empty(head) && esed->res < esed->maxevents;) {
        epi = list_first_entry(head, struct epitem, rdllink);

        /*
         * Activate ep->ws before deactivating epi->ws to prevent
         * triggering auto-suspend here (in case we reactive epi->ws
         * below).
         *
         * This could be rearranged to delay the deactivation of epi->ws
         * instead, but then epi->ws would temporarily be out of sync
         * with ep_is_linked().
         */
        ws = ep_wakeup_source(epi);
        if (ws) {
            if (ws->active)
                __pm_stay_awake(ep->ws);
            __pm_relax(ws);
        }

        list_del_init(&epi->rdllink);

        revents = ep_item_poll(epi, &pt, 1);

        /*
         * If the event mask intersect the caller-requested one,
         * deliver the event to userspace. Again, ep_scan_ready_list()
         * is holding "mtx", so no operations coming from userspace
         * can change the item.
         */
        if (revents) {
            if (__put_user(revents, &uevent->events) ||
                __put_user(epi->event.data, &uevent->data)) {
                list_add(&epi->rdllink, head);
                ep_pm_stay_awake(epi);
                if (!esed->res)
                    esed->res = -EFAULT;
                return 0;
            }
            esed->res++;
            uevent++;
            if (epi->event.events & EPOLLONESHOT)
                epi->event.events &= EP_PRIVATE_BITS;
            else if (!(epi->event.events & EPOLLET)) {
                /*
                 * If this file has been added with Level
                 * Trigger mode, we need to insert back inside
                 * the ready list, so that the next call to
                 * epoll_wait() will check again the events
                 * availability. At this point, no one can insert
                 * into ep->rdllist besides us. The epoll_ctl()
                 * callers are locked out by
                 * ep_scan_ready_list() holding "mtx" and the
                 * poll callback will queue them in ep->ovflist.
                 */
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);
            }
        }
    }

    return 0;
}
```
At the end of `ep_send_events_proc`, items that were added in level-triggered mode are put back onto the `eventpoll`'s ready list, so that they will be examined again on the next call to `epoll_wait`.
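The recommendation above is also why edge-triggered epoll is normally paired with non-blocking descriptors: since a reported event may already be stale, the reader must be able to hit EAGAIN instead of blocking the whole event loop. A minimal user-space sketch of that pattern (the descriptors `epfd` and `connfd` and the helper names are assumptions; error handling trimmed):

```c
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/epoll.h>

/* Register connfd in edge-triggered mode; the socket must be non-blocking. */
static int add_edge_triggered(int epfd, int connfd)
{
    struct epoll_event ev = { .events = EPOLLIN | EPOLLET, .data.fd = connfd };

    fcntl(connfd, F_SETFL, fcntl(connfd, F_GETFL, 0) | O_NONBLOCK);
    return epoll_ctl(epfd, EPOLL_CTL_ADD, connfd, &ev);
}

/* On an EPOLLIN notification, drain the socket until read() reports EAGAIN.
 * If the event turned out to be stale, read() returns EAGAIN immediately
 * instead of blocking. */
static void drain(int connfd)
{
    char buf[4096];

    for (;;) {
        ssize_t n = read(connfd, buf, sizeof(buf));
        if (n > 0)
            continue;              /* process buf[0..n) here */
        if (n == 0)
            break;                 /* peer closed the connection */
        if (errno == EAGAIN || errno == EWOULDBLOCK)
            break;                 /* no more data for now */
        break;                     /* real error */
    }
}
```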
4. The select system call

The system call `select` is defined in fs/select.c at line 697.
```c
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct timeval __user *, tvp)
{
    return kern_select(n, inp, outp, exp, tvp);
}
```
The main work is simply calling the `kern_select` function.
4.1 The kern_select function

The function `kern_select` is defined in fs/select.c at line 673.
```c
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, struct timeval __user *tvp)
{
    struct timespec64 end_time, *to = NULL;
    struct timeval tv;
    int ret;

    if (tvp) {
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);
    ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

    return ret;
}
```
The main job of `kern_select` is to copy the timeout into the kernel, run `core_sys_select`, and then write the remaining time back to user space.
4.2 The core_sys_select function

The `core_sys_select` function is defined in fs/select.c at line 594.
```c
/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                    fd_set __user *exp, struct timespec64 *end_time)
{
    fd_set_bits fds;
    //...
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time);

    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;
    //...
}
```
The main job of `core_sys_select` is to prepare the arguments needed by `do_select` and to copy the result sets back afterwards.
4.3 The do_select function

The `do_select` function is defined in fs/select.c at line 449.
```c
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
    //...
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                i += BITS_PER_LONG;
                continue;
            }

            for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                struct fd f;
                f = fdget(i);
                if (f.file) {
                    //...
                }
            }
        }

        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                   to, slack))
            timed_out = 1;
    }
    //...
}
```
The main logic of `do_select` is to monitor multiple fds: as soon as any one of them has an event, the process is woken from sleep and then iterates over all of the fds to figure out which ones actually have events. This linear scan is the main reason `select` is less efficient than `epoll`.
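For comparison with the epoll examples earlier, here is a minimal user-space `select` sketch (the descriptor `sockfd` and the helper name are assumptions; error handling trimmed). Note that the fd_set and the timeout have to be rebuilt before every call, and the whole set is scanned with FD_ISSET, mirroring the linear scan inside `do_select`:

```c
#include <stdio.h>
#include <sys/select.h>

static void wait_readable(int sockfd)
{
    for (;;) {
        fd_set rfds;
        struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

        /* select() modifies both the set and the timeout,
         * so they are re-initialized on every iteration. */
        FD_ZERO(&rfds);
        FD_SET(sockfd, &rfds);

        int ready = select(sockfd + 1, &rfds, NULL, NULL, &tv);
        if (ready == -1) {
            perror("select");
            break;
        }
        if (ready == 0)
            continue;              /* timed out, nothing ready */

        if (FD_ISSET(sockfd, &rfds)) {
            /* sockfd is readable: handle it here ... */
        }
    }
}
```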
5. The poll system call

The system call `poll` is defined in fs/select.c at line 1012.
```c
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
    struct timespec64 end_time, *to = NULL;
    int ret;

    if (timeout_msecs >= 0) {
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to);
    //...
}
```
Its role is similar to that of `kern_select` in the `select` implementation: convert the timeout and call the `do_sys_poll` function.
5.1 The do_sys_poll function

The function `do_sys_poll` is defined in fs/select.c at line 926.
```c
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                       struct timespec64 *end_time)
{
    //...
    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        //...
        if (copy_from_user(walk->entries, ufds + nfds-todo,
                           sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;
        //...
    }

    poll_initwait(&table);
    fdcount = do_poll(head, &table, end_time);
    poll_freewait(&table);

    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
    }
    //...
}
```
The main job of `do_sys_poll` is similar to that of `core_sys_select` in the `select` implementation: prepare the arguments needed by `do_poll` and copy the results back afterwards.
5.2 The do_poll function

The function `do_poll` is defined in fs/select.c at line 837.
```c
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
    //...
    for (;;) {
        struct poll_list *walk;

        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd *pfd, *pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill poll_table->_qproc, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt, &can_busy_loop,
                              busy_flag)) {
                    //...
                }
            }
        }
        //...
        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    return count;
}
```
Just like `do_select` in the `select` implementation, the main logic of `do_poll` is to monitor multiple fds: as soon as any one of them has an event, the process is woken from sleep and then iterates over all of the fds to figure out which ones have events. So `poll` suffers from the same inefficiency as `select`.
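To round off the comparison, a minimal user-space `poll` sketch (the descriptor `sockfd` and the helper name are assumptions; error handling trimmed). The `pollfd` array plays the role of the `poll_list` that `do_sys_poll` copies in, and `revents` is what `__put_user` writes back above:

```c
#include <poll.h>
#include <stdio.h>

static void wait_readable_poll(int sockfd)
{
    struct pollfd pfd = {
        .fd = sockfd,
        .events = POLLIN,      /* interested in readability */
    };

    for (;;) {
        /* 5000 ms timeout; -1 would block indefinitely. */
        int ready = poll(&pfd, 1, 5000);
        if (ready == -1) {
            perror("poll");
            break;
        }
        if (ready == 0)
            continue;          /* timed out, nothing ready */

        if (pfd.revents & POLLIN) {
            /* sockfd is readable: handle it here ... */
        }
    }
}
```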