本文主要关注中断如何从硬件上报到用户态,以及用户态开关中断的相关处理。
本文基于hns3 PMD driver,选取DPDK的Rx interrupt特性,使用DPDK工程中的l3fwd-power测试程序来进行说明。
一. 简短概括:
熟悉DPDK的小伙伴可以直接看本节下面的总结部分;如果需要了解更多细节,可以进一步查看第二节。
1)用户态执行设备驱动的.dev_start函数时,为网卡的每个硬件队列创建一个eventfd,使用epoll来关联监听所有的eventfd;
2)当网卡硬件中断上报到CPU后,内核态vfio-pci驱动进行中断处理,并在中断处理函数中调用eventfd_signal向用户态上报事件信息;
3)在用户态,当通过epoll_wait监听到有事件发生后,通过read从相应的eventfd读取相关事件,应用程序以此可以进一步做相关处理。
二. 详细处理:
以hns3 PMD driver为例,说明网卡PMD驱动部分处理:
驱动初始化及.dev_start函数:(.dev_init钩子函数)hns3_dev_init -> hns3_init_pf
/*
 * Abridged excerpt of the hns3 PF initialisation routine (reached from
 * hns3_dev_init). Only the interrupt-related steps are shown: the
 * declarations of hw, pci_dev and ret, and the err_intr_callback_register
 * label, are not part of this excerpt.
 */
static int
hns3_init_pf(struct rte_eth_dev *eth_dev)
{
// Register the handler for the vector0 interrupt, then enable the vector0 interrupt
/* Clear pending event causes, then register hns3_interrupt_handler on the PCI device's interrupt handle (eth_dev is the callback argument) */
hns3_clear_all_event_cause(hw); ret = rte_intr_callback_register(&pci_dev->intr_handle, hns3_interrupt_handler, eth_dev); if (ret) { PMD_INIT_LOG(ERR, "Failed to register intr: %d", ret); goto err_intr_callback_register; }/* Enable interrupt */
/* Enable the interrupt through the EAL interrupt handle, then through the driver's own irq0 enable routine */
rte_intr_enable(&pci_dev->intr_handle); hns3_pf_enable_irq0(hw);}
.dev_start函数
(.dev_start钩子函数)hns3_dev_start -> hns3_map_rx_interrupt
static int
hns3_map_rx_interrupt(struct rte_eth_dev *dev)
{
/* disable uio/vfio intr/eventfd mapping */ rte_intr_disable(intr_handle); intr_vector = hw->used_rx_queues; /* creates event fd for each intr vector when MSIX is used */ if (rte_intr_efd_enable(intr_handle, intr_vector)) return -EINVAL; }for (q_id = 0; q_id < hw->used_rx_queues; q_id++) {
ret = hns3_bind_ring_with_vector(hw, vec, true,,);}
rte_intr_enable(intr_handle);
}
DPDK相关API实现:
/*
 * Abridged excerpt of rte_intr_efd_enable(): create the per-vector event
 * fds of an interrupt handle. Declarations of i and fd, error handling and
 * the return statement are not part of this excerpt.
 *
 * intr_handle: interrupt handle whose efds[] array receives the new fds.
 * nb_efd:      number of event fds requested by the caller.
 */
int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
/* n is the requested count capped at RTE_MAX_RXTX_INTR_VEC_ID */
uint32_t n = RTE_MIN(nb_efd,(uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
/* For VFIO MSI-X handles, create one non-blocking, close-on-exec eventfd per vector and store it in efds[i] */
if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { for (i = 0; i < n; i++) { fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); intr_handle->efds[i] = fd; }}
}
rte_intr_enable -> vfio_enable_msix
/*
 * enable MSI-X interrupts
 *
 * Abridged excerpt: builds a struct vfio_irq_set describing the event fds
 * of intr_handle and passes it to the VFIO device fd with the
 * VFIO_DEVICE_SET_IRQS ioctl. Declarations of irq_set, irq_set_buf, len,
 * fd_ptr and ret, and the handling of the ioctl result, are not shown.
 */
static int
vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
/* count is max_intr clamped to RTE_MAX_RXTX_INTR_VEC_ID + 1, or 1 when max_intr is 0; the payload carries event fds and the action is "trigger" */
irq_set = (struct vfio_irq_set *) irq_set_buf; irq_set->argsz = len; /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ irq_set->count = intr_handle->max_intr ? (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |VFIO_IRQ_SET_ACTION_TRIGGER;
/* The fd array starts at MSI-X index 0: slot RTE_INTR_VEC_ZERO_OFFSET holds intr_handle->fd, and the nb_efd per-queue event fds are copied in starting at slot RTE_INTR_VEC_RXTX_OFFSET */
irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; irq_set->start = 0; fd_ptr = (int *) &irq_set->data; /* INTR vector offset 0 reserve for non-efds mapping */ fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, sizeof(*intr_handle->efds) * intr_handle->nb_efd); ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS,irq_set);
}
l3fwd-power中的用法
收发包处理:
/*
 * main processing loop
 *
 * Abridged excerpt of the l3fwd-power per-lcore loop, reduced to the Rx
 * interrupt handling: register the lcore's Rx queues for interrupt events,
 * then repeatedly enable the interrupts, sleep until one fires, disable
 * them again and go back to receiving. Declarations of qconf and intr_en
 * and the packet receive/forward code are omitted.
 */
static int
main_loop(__rte_unused void *dummy)
{
	/* add into event wait list */
	if (event_register(qconf) == 0)
		intr_en = 1;

	/* <snip> */

	while (1) {
start_rx:
		/* enable the NIC hardware interrupts */
		turn_on_off_intr(qconf, 1);
		sleep_until_rx_interrupt(qconf->n_rx_queue);
		/* disable the NIC hardware interrupts */
		turn_on_off_intr(qconf, 0);
		/* start receiving packets immediately */
		goto start_rx;
	}
}
/*
 * Register every Rx queue of this lcore with the per-thread epoll instance
 * so that its Rx interrupt can wake the lcore up.
 *
 * The user data attached to each event packs the port id above the low
 * CHAR_BIT bits and the queue id into the low CHAR_BIT bits.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * rte_eth_dev_rx_intr_ctl_q(). Local declarations are omitted in this
 * excerpt.
 */
static int event_register(struct lcore_conf *qconf)
{
	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		portid = rx_queue->port_id;
		queueid = rx_queue->queue_id;
		data = (portid << CHAR_BIT) | queueid;

		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
						RTE_EPOLL_PER_THREAD,
						RTE_INTR_EVENT_ADD,
						(void *)((uintptr_t)data));
		if (ret != 0)
			return ret;
	}

	return 0;
}
/*
 * Enable (on is true) or disable (on is false) the Rx interrupt of every
 * queue in qconf's Rx queue list. Each enable/disable call is made while
 * holding the spinlock of the queue's port.
 */
static void turn_on_off_intr(struct lcore_conf *qconf, bool on)
{
	struct lcore_rx_queue *rxq;
	uint16_t port;
	uint8_t queue;
	int idx;

	for (idx = 0; idx < qconf->n_rx_queue; ++idx) {
		rxq = &qconf->rx_queue_list[idx];
		port = rxq->port_id;
		queue = rxq->queue_id;

		rte_spinlock_lock(&locks[port]);
		if (!on)
			rte_eth_dev_rx_intr_disable(port, queue);
		else
			rte_eth_dev_rx_intr_enable(port, queue);
		rte_spinlock_unlock(&locks[port]);
	}
}
/**
 * Force the polling thread to sleep until a one-shot rx interrupt triggers.
 *
 * Abridged excerpt: declarations of event, n, i, data, port_id and queue_id
 * are elided (see the snip marker).
 *
 * @param num
 *   Maximum number of events to fetch from the per-thread epoll instance.
 * @return
 *   Always 0 in the code shown.
 */
static int
sleep_until_rx_interrupt(int num)
{
/* Wait on the per-thread epoll instance with timeout -1 (presumably "block indefinitely", as with epoll_wait - confirm against rte_epoll_wait docs), then for each returned event unpack the port id (bits above CHAR_BIT) and queue id (low CHAR_BIT bits) from the user data and log the wake-up */
<snip> n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, -1); for (i = 0; i < n; i++) { data = event[i].epdata.data; port_id = ((uintptr_t)data) >> CHAR_BIT; queue_id = ((uintptr_t)data) & RTE_LEN2MASK(CHAR_BIT, uint8_t); RTE_LOG(INFO, L3FWD_POWER, "lcore %u is waked up from rx interrupt on" " port %d queue %d\n", rte_lcore_id(), port_id, queue_id); } return 0;}
DPDK相关API实现:
rte_eth_dev_rx_intr_ctl_q -> rte_intr_rx_ctl
eal_intr_proc_rxtx_intr -> read
rte_epoll_ctl -> epoll_ctl
/*
 * Abridged excerpt of rte_intr_rx_ctl(): add the event fd of one interrupt
 * vector to an epoll instance. The return type, local declarations, the
 * computation of efd_idx and the other switch cases are elided (see the
 * snip markers).
 *
 * intr_handle: interrupt handle owning the efds[] and elist[] arrays.
 * epfd:        epoll instance the event fd is registered with.
 * op:          requested operation; only RTE_INTR_EVENT_ADD is shown.
 * vec:         interrupt vector number (its use is in the elided part).
 * data:        caller's user data, stored in the epoll event.
 */
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
int op, unsigned int vec, void *data){
/* ADD case: take the epoll event slot for this vector, request edge-triggered EPOLLIN | EPOLLPRI, store the caller's data and install eal_intr_proc_rxtx_intr as the event callback */
<snip> switch (op) { case RTE_INTR_EVENT_ADD: epfd_op = EPOLL_CTL_ADD; rev = &intr_handle->elist[efd_idx]; <snip> /* attach to intr vector fd */ epdata = &rev->epdata; epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; epdata->data = data; epdata->cb_fun =(rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
/* The callback receives intr_handle as its argument; finally register the vector's event fd with epfd */
epdata->cb_arg = (void *)intr_handle; rc = rte_epoll_ctl(epfd, epfd_op, intr_handle->efds[efd_idx], rev);}
}
3.内核态vfio-pci处理
略