非阻塞io之 epoll内核实现

it2024-12-03 34

epoll

目录：创建epollFd；epollFD文件描述符的回调实现；epoll_ctl 系统调用（epoll_ctl的系统调用实现、EPOLL_CTL_ADD、EPOLL_CTL_DEL、EPOLL_CTL_MOD）；epoll_wait系统调用（do_epoll_wait、ep_poll、ep_send_events、ep_scan_ready_list、ep_send_events_proc、ep_item_poll、vfs_poll）。可以结合 eventfd 这篇文章一起看。由于需要实现一个驱动，想封装出支持epoll的设备，而在网上能找到的关于epoll的信息很少，所以自己看了内核的代码来了解一下epoll机制，做个总结以便自己后面查看。

/*
 * This structure is stored in the "private_data" member of `struct file`
 * and is the main data structure of the eventpoll interface.
 * Access to it is protected by the lock inside wq.
 */
struct eventpoll { /* fs/eventpoll.c */
	/*
	 * This mutex is used to ensure that files are not removed while
	 * epoll is using them. It is held during the event collection loop,
	 * the file cleanup path, the epoll file exit code and the ctl
	 * operations.
	 */
	struct mutex mtx;

	/*
	 * Wait queue used by sys_epoll_wait(). A thread calling epoll_wait
	 * queues itself here before blocking, so that it can be woken up
	 * when a monitored event occurs.
	 */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() waiters */
	wait_queue_head_t poll_wait;

	/*
	 * List of ready file descriptors. A monitored file is placed on this
	 * list once a matching event has been generated for it.
	 */
	struct list_head rdllist;

	/*
	 * RB tree root used to store the monitored fd structures; every
	 * monitored file is kept in this red-black tree.
	 */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->wq.lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* Used to optimize loop detection checks */
	int visited;
	struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Used to track the busy poll napi_id */
	unsigned int napi_id;
#endif
};

创建epollFd

它有两个系统调用：epoll_create1 和 epoll_create。

1. 调用ep_alloc方法创建一个eventpoll实例。 2. 调用get_unused_fd_flags方法找到一个未使用的fd，这个就是最终返回给我们的文件描述符。 3. 调用anon_inode_getfile方法创建一个file实例，其类型为：
// include/linux/fs.h
struct file {
	//...
	// 这个struct里存放了各种函数指针，用来指向操作文件的各种函数
	// 比如read/write等。这样不同类型的文件，就可以有不同的函数实现
	const struct file_operations *f_op;
	//...
	// struct file 里的数据字段存放的是所有file类型通用的数据
	// 而下面这个字段存放的是和具体文件类型相关的数据
	void *private_data;
	//...
}

调用anon_inode_getfile方法传入的参数中，eventpoll_fops最终被赋值到上面的f_op字段，ep被赋值到上面的private_data字段。 4. 调用fd_install方法在内核中建立 fd 与 file 的对应关系，这样以后就可以通过fd来找到对应的file。 5. 返回fd给用户。至此，epoll_create1方法结束。

/* * 打开一个eventpoll文件描述符. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* 检查EPOLL_* 常量的一致性. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * 创建内部数据结构 ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. * 创建设置事件轮询文件所需的所有项目。即，文件结构和空闲的文件描述符。 */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_free(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); }

epollFD文件描述符的回调实现

/*
 * File callbacks that implement the eventpoll file behaviour.
 * Only release, poll and llseek are provided (plus show_fdinfo when
 * CONFIG_PROC_FS is enabled).
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

epoll_ctl 系统调用

/*
 * User-space prototype of epoll_ctl(2).
 *
 * epfd:  descriptor of the epoll instance.
 * op:    one of EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * fd:    the target descriptor to operate on.
 * event: the events of interest for fd.
 */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

epoll的事件注册函数。它不同于select()在监听事件时才告诉内核要监听什么类型的事件，而是在这里先注册要监听的事件类型。

/* Valid opcodes to issue to sys_epoll_ctl() */
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3

第一个参数是epoll_create()的返回值，第二个参数表示动作，用三个宏来表示：

EPOLL_CTL_ADD：注册新的fd到epfd中； EPOLL_CTL_MOD：修改已经注册的fd的监听事件； EPOLL_CTL_DEL：从epfd中删除一个fd；

第三个参数是需要监听的fd ，第四个参数是告诉内核需要监听什么事件，struct epoll_event结构如下：

/*
 * Describes the events of interest for one descriptor and the caller's
 * 64-bit data word associated with it.
 */
struct epoll_event {
	__poll_t events;	/* bit mask built from the EPOLL* event masks */
	__u64 data;		/* 64-bit value supplied by the caller */
} EPOLL_PACKED;

events 有以下掩码

/* Epoll event masks */ #define EPOLLIN (__force __poll_t)0x00000001 //触发该事件，表示对应的文件描述符上有可读数据。(包括对端SOCKET正常关闭)； #define EPOLLPRI (__force __poll_t)0x00000002 //表示对应的文件描述符有紧急的数据可读（这里应该表示有带外数据到来）； #define EPOLLOUT (__force __poll_t)0x00000004 //触发该事件，表示对应的文件描述符上可以写数据； #define EPOLLERR (__force __poll_t)0x00000008 //表示对应的文件描述符发生错误； #define EPOLLHUP (__force __poll_t)0x00000010 //表示对应的文件描述符被挂断； #define EPOLLNVAL (__force __poll_t)0x00000020 #define EPOLLRDNORM (__force __poll_t)0x00000040 #define EPOLLRDBAND (__force __poll_t)0x00000080 #define EPOLLWRNORM (__force __poll_t)0x00000100 #define EPOLLWRBAND (__force __poll_t)0x00000200 #define EPOLLMSG (__force __poll_t)0x00000400 #define EPOLLRDHUP (__force __poll_t)0x00002000 /* 设置目标文件描述符的独占唤醒模式 */ #define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28)) /* * 请求处理系统唤醒事件，以防止在处理这些事件时发生系统挂起。 * * 假设既未设置EPOLLET也未设置EPOLLONESHOT，则在使用唤醒事件后再次调用epoll_wait之前，不会重新允许系统挂起。 * 需要CAP_BLOCK_SUSPEND */ #define EPOLLWAKEUP ((__force __poll_t)(1U << 29)) /* 设置目标文件描述符的“单发”行为 */ #define EPOLLONESHOT ((__force __poll_t)(1U << 30)) //只监听一次事件，当监听完这次事件之后，如果还需要继续监听这个socket的话，需要再次把这个socket加入到EPOLL队列里。 /* 为目标文件描述符设置 Edge Triggered (边缘触发)行为 */ #define EPOLLET ((__force __poll_t)(1U << 31)) //将EPOLL设为边缘触发(Edge Triggered)模式，这是相对于水平触发(Level Triggered)来说的

epoll_ctl的系统调用实现

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * Error codes visible in this function: -EFAULT (copying the event from
 * user space failed), -EBADF (epfd or fd has no file), -EPERM (target file
 * cannot be polled), -EINVAL (epfd is not an epoll file, epfd and fd are the
 * same file, invalid EPOLLEXCLUSIVE usage, unknown op, or EPOLL_CTL_MOD on an
 * item registered with EPOLLEXCLUSIVE), -ELOOP (ep_loop_check() failed),
 * -EEXIST (ADD of an already registered fd), -ENOENT (DEL/MOD of an
 * unregistered fd); otherwise the result of ep_insert(), ep_remove() or
 * ep_modify().
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file
	 * descriptor the user passed to us _is_ an eventpoll file. And also we
	 * do not permit adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking
	 * the 'epmutex' on add is to prevent complex topologies such as loops
	 * and deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	mutex_lock_nested(&ep->mtx, 0);
	if (op == EPOLL_CTL_ADD) {
		if (!list_empty(&f.file->f_ep_links) ||
						is_file_epoll(tf.file)) {
			full_check = 1;
			/* ep->mtx is dropped and re-taken after epmutex */
			mutex_unlock(&ep->mtx);
			mutex_lock(&epmutex);
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0) {
					clear_tfile_check_list();
					goto error_tgt_fput;
				}
			} else
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			mutex_lock_nested(&ep->mtx, 0);
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				mutex_lock_nested(&tep->mtx, 1);
			}
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* EPOLLERR and EPOLLHUP are always added to the mask */
			epds.events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			/* error stays -EINVAL for an EPOLLEXCLUSIVE item */
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds.events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, &epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check)
		mutex_unlock(&epmutex);

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * Global list of files with newly added links, where we may need to limit
 * the number of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

/*
 * Summary of the key steps in epoll_ctl():
 *   f = fdget(epfd);                  gets the file of epfd
 *   tf = fdget(fd);                   gets the file to operate on
 *   ep = f.file->private_data;        gets the eventpoll
 *   epi = ep_find(ep, tf.file, fd);   looks up whether the eventpoll already
 *                                     holds an epitem for the target fd
 */

EPOLL_CTL_ADD

list_add(&tf.file->f_tfile_llink, &tfile_check_list); 将目标file结构体添加到全局tfile_check_list表中。然后在eventpoll的红黑树（rbr）中查找这个文件和文件描述符是否已经存在：如果存在则返回错误（-EEXIST，表示文件已存在）；否则将这个文件描述符添加到eventpoll： error = ep_insert(ep, &epds, tf.file, fd, full_check);

EPOLL_CTL_DEL

ep_remove(ep, epi);将epi从ep中移除即可

EPOLL_CTL_MOD

error = ep_modify(ep, epi, &epds); 修改相应的epi 即可

epoll_wait系统调用

do_epoll_wait

/*
 * Implement the event wait interface for the eventpoll file. It is the
 * kernel part of the user space epoll_wait(2).
 *
 * Returns -EINVAL when maxevents is outside 1..EP_MAX_EVENTS or epfd is not
 * an epoll file, -EFAULT when the user buffer fails access_ok(), -EBADF when
 * epfd has no file; otherwise whatever ep_poll() returns.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	struct fd epf;
	int ret;

	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Look up the "struct file *" behind epfd */
	epf = fdget(epfd);
	if (!epf.file)
		return -EBADF;

	if (is_file_epoll(epf.file)) {
		/*
		 * The file is an eventpoll file, so "private_data" holds the
		 * eventpoll instance stored there at creation time.
		 */
		struct eventpoll *ep = epf.file->private_data;

		/* Time to fish for events ... */
		ret = ep_poll(ep, events, maxevents, timeout);
	} else {
		/* The descriptor the user passed is not an eventpoll file */
		ret = -EINVAL;
	}

	fdput(epf);
	return ret;
}

/* epoll_wait(2): forwards all arguments to do_epoll_wait(). */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}

epfd为epoll_create1方法返回的fd，events为用户提供的 struct epoll_event 类型的数组，用于存放有监听事件发生的那些监听对象，maxevents 表示这个数组的长度，也表示epoll_wait方法最多可返回maxevents个事件就绪的监听对象。

ep_poll

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer. (fs/eventpoll.c)
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irq(&ep->wq.lock);
		goto check_events;
	}

fetch_events:

	if (!ep_events_available(ep))
		ep_busy_loop(ep, timed_out);

	spin_lock_irq(&ep->wq.lock);

	if (!ep_events_available(ep)) {
		/*
		 * Busy poll timed out. Drop NAPI ID for now, we can add
		 * it back in when we have moved a socket with a valid NAPI
		 * ID onto the ready list.
		 */
		ep_reset_busy_poll_napi_id(ep);

		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/*
			 * Always short-circuit for fatal signals to allow
			 * threads to make a timely exit without the chance of
			 * finding more events available and fetching
			 * repeatedly.
			 */
			if (fatal_signal_pending(current)) {
				res = -EINTR;
				break;
			}
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* The lock is released across the sleep */
			spin_unlock_irq(&ep->wq.lock);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irq(&ep->wq.lock);
		}

		__remove_wait_queue(&ep->wq, &wait);
		__set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irq(&ep->wq.lock);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

判断是否有监听事件就绪，如果有则直接调用ep_send_events方法把就绪对象拷贝到events里，然后返回。如果没有，则先调用 init_waitqueue_entry 方法初始化wait变量，其中current参数为线程私有变量，线程相关的数据会放到这个变量中，同时，通过这个变量也能找到相应的线程。

/* One entry on a wait queue. */
struct wait_queue_entry {
	unsigned int		flags;
	void			*private;	/* set to the task by init_waitqueue_entry() */
	wait_queue_func_t	func;		/* set to default_wake_function by init_waitqueue_entry() */
	struct list_head	entry;
};

/*
 * Initialise @wq_entry for task @p: clear the flags, store the task in
 * ->private and use default_wake_function as the wake callback.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
	wq_entry->flags		= 0;
	wq_entry->private	= p;
	wq_entry->func		= default_wake_function;
}

/*
 * Mark @wq_entry with WQ_FLAG_EXCLUSIVE and add it to @wq_head via
 * __add_wait_queue().
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(wq_head, wq_entry);
}

wq_entry->func的 default_wake_function 方法就是用来唤醒当前进程current对应的线程的。 3. 初始化完wait变量之后，把它放到eventpoll的wq队列中，这个上面我们也有提到过。 4. 然后进入for循环，其逻辑为，检查是否有监听事件就绪，如果没有，则调用 schedule_hrtimeout_range 方法，使当前线程进入休眠状态。 5. 当各种情况，比如signal、timeout、监听事件发生，导致该线程被唤醒，则会再进入下一次for循环，并检查监听事件是否就绪，如果就绪了，则跳出for循环，同时把wait变量从eventpoll的wq队列中移除。 6. 调用 ep_send_events 方法把就绪事件的对象拷贝到用户提供的events数组中，然后返回。

ep_send_events

/*
 * Deliver ready events to the user buffer.
 *
 * Packs the destination buffer and its capacity into an
 * ep_send_events_data, scans the ready list with ep_send_events_proc as the
 * callback, and returns the result the callback left in ->res.
 */
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data send_data = {
		.maxevents = maxevents,
		.events = events,
	};

	ep_scan_ready_list(ep, ep_send_events_proc, &send_data, 0, false);

	return send_data.res;
}

ep_scan_ready_list的ep_send_events_proc参数是一个回调方法，在ep_scan_ready_list中调用

ep_scan_ready_list

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	int pwake = 0;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	spin_lock_irq(&ep->wq.lock);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irq(&ep->wq.lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	spin_lock_irq(&ep->wq.lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irq(&ep->wq.lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return res;
}

ep_scan_ready_list是将eventpoll中的rdllist列表内容转移到txlist列表中，同时把rdllist列表置为空，现在txlist就持有了所有有就绪事件的对象。然后调用上面的回调方法 ep_send_events_proc，将该列表传入其中。

ep_send_events_proc

/*
 * Scan callback used by ep_send_events(): walks the private ready list
 * @head, re-polls every item and copies the ones that still report events
 * into the user buffer described by @priv (a struct ep_send_events_data).
 *
 * The number of events stored is left in esed->res; if copying to user space
 * fails before any event was stored, esed->res is set to -EFAULT. The
 * function itself always returns 0.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	__poll_t revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;
	struct wakeup_source *ws;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	for (esed->res = 0, uevent = esed->events;
	     !list_empty(head) && esed->res < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate epi->ws
		 * below).
		 *
		 * This could be rearranged to delay the deactivation of epi->ws
		 * instead, but then epi->ws would temporarily be out of sync
		 * with ep_is_linked().
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		revents = ep_item_poll(epi, &pt, 1);

		/*
		 * If the event mask intersect the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				/* Copy failed: put the item back and stop */
				list_add(&epi->rdllink, head);
				ep_pm_stay_awake(epi);
				if (!esed->res)
					esed->res = -EFAULT;
				return 0;
			}
			esed->res++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, no one can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
				ep_pm_stay_awake(epi);
			}
		}
	}

	return 0;
}

遍历head就绪列表中的所有对象，对其调用 ep_item_poll 方法，真正的去检查我们关心的那些事件是否存在。

如果有我们感兴趣的事件，则将该事件拷贝到用户event中。

如果该监听对象是 level-triggered 模式，则会把该对象再加入到就绪列表中，这样下次再调用 epoll_wait 方法，还会检查这些对象。这也是 level-triggered 和 edge-triggered 在代码上表现出来的本质区别。

所有监听对象检查完毕后，此时满足条件的对象已经被拷贝到用户提供的events里，到这里方法就可以返回了。

ep_item_poll

/*
 * Poll a single monitored item and return the events it reports, masked by
 * the events the item is interested in.
 *
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct file *target = epi->ffd.file;
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events;

	if (is_file_epoll(target)) {
		/* Nested epoll file: register on it, then scan its ready list */
		ep = target->private_data;
		poll_wait(target, &ep->poll_wait, pt);
		locked = pt && (pt->_qproc == ep_ptable_queue_proc);

		return ep_scan_ready_list(ep, ep_read_events_proc, &depth,
					  depth, locked) & epi->event.events;
	}

	/* Any other file: ask the file itself */
	return vfs_poll(target, pt) & epi->event.events;
}

如果被监听的文件本身是epoll文件，则先对其调用poll_wait，再通过ep_scan_ready_list（回调为ep_read_events_proc）扫描它的就绪列表；否则直接调用vfs_poll。

vfs_poll

/*
 * Poll @file through its ->poll file operation. A file whose f_op has no
 * ->poll callback reports DEFAULT_POLLMASK.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
	return unlikely(!file->f_op->poll) ? DEFAULT_POLLMASK
					   : file->f_op->poll(file, pt);
}

如果是我们自定义的设备文件或者是socket，则执行vfs_poll。对于tcp socket对象，这个方法最终会调用 tcp_poll 方法；由于该方法涉及的都是tcp相关的内容，这里不再展开。

最新回复(0)