- 1. はじめに
- 2. 調査対象の環境
- 3. ユーザ空間ドライバとカーネル空間ドライバ
- 4. PCIデバイスの復習
- 5. PCI コンフィグレーション空間からの読み出し
- 6. PCI I/O空間の読み書き
- 7. PCI メモリ空間の読み書き
- 8. 割り込みの取り扱い
- 9. 今回取り上げなかったこと
- 10. まとめ
執筆者: 岡部 究
1. はじめに
DPDKを触っていると気になることがあります。それは、カーネルのEthernetドライバの代わりにユーザ空間でEthernetドライバを作っているというが、実際のPCIデバイスにどうやってユーザ空間からアクセスしているのだろうか?というものです。本記事ではDPDKのソースコードを読み解くことで、上記の疑問を(部分的にですが)解決します。
2. 調査対象の環境
- Debian GNU/Linux 11.6
- DPDK 22.11.0
3. ユーザ空間ドライバとカーネル空間ドライバ
DPDKのユーザ空間ドライバはPoll Mode Driver (PMD)と呼ばれ、様々なEthernetを対象としています。 dpdk-devbind.pyスクリプトを見ることでサポートしているこれらのEthernetのリストを知ることができます。 本記事ではQEMU上でお手軽に試験できるようにnet_virtioドライバを調査します。
さらにPMDはユーザ空間に単に配置すれば動くのではなく、実際のPCIデバイスと繋ぐカーネル空間ドライバが必要になります。
以前はigb_uioなどのカーネル空間ドライバが使用されるケースがありましたが、IOMMUをサポートしているvfio-pciドライバを今回は使います。
本来ハードウェアとして提供されるPCIデバイスとカーネル空間で動くEthernetドライバが実現していた機能が、どう上記2つのドライバで実現されているか、本記事では調べます。
4. PCIデバイスの復習
復習のためにPCIデバイスとはどのようなものだったのか振り返りましょう。 PCIデバイスは、以下のメモリ空間を持っていました。
- PCI コンフィグレーション空間
- PCI I/O空間
- PCI メモリ空間
さらにPCIデバイスは割り込みピンを持っており、通常コンテキストの実行中の任意のタイミングで、設定した割り込みハンドラを実行させることができました。
上記4つの要素がnet_virtioドライバとvfio-pciドライバでどのように扱われているか調べましょう。
5. PCI コンフィグレーション空間からの読み出し
いきなり天下り式ですが、DPDKにはPCI コンフィグレーション空間のベースアドレスレジスタを読んで、PCI I/O空間かを判定する以下のコードがあります。
/* Returns 1 if the given BAR is an I/O port BAR, 0 if it is a memory BAR,
 * or -1 if the read from the device fd fails.  The BAR register is read
 * directly from the PCI configuration space region of the VFIO device fd. */
static int pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) {
    uint32_t ioport_bar;
    int ret;
    /* Read the 32-bit BAR register from the config-space region: the region
     * base comes from VFIO_GET_REGION_ADDR(), the BAR's byte offset is
     * PCI_BASE_ADDRESS_0 + 4 bytes per BAR index. */
    ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
            VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
            + PCI_BASE_ADDRESS_0 + bar_index*4); // <= read from PCI configuration space
    if (ret != sizeof(ioport_bar)) {
        RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
            PCI_BASE_ADDRESS_0 + bar_index*4);
        return -1;
    }
    /* Bit 0 of a BAR distinguishes I/O space (1) from memory space (0). */
    return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
}
上記のようにファイルディスクリプタvfio_dev_fdから指定のオフセットを読み出せばPCI コンフィグレーション空間を読み出せます。
vfio_dev_fdは固定幅のリージョンごとに分割されていて、VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)といったオフセット指定で読み出せます。
ここではVFIO_PCI_CONFIG_REGION_INDEXを指定しているので、PCI コンフィグレーション空間にアクセスしています。
ではこのvfio_dev_fdは何処からやってきたのでしょうか。
このファイルディスクリプタはrte_vfio_setup_device()関数でオープンされています。
これらの初期化コードは少し長いので先に要点を解説します。
- シンボリックリンク/sys/bus/pci/devices/PCIアドレス/iommu_groupを読んでIOMMUグループを得ます。例えば以下の例だと18です。
$ readlink /sys/bus/pci/devices/0000\:07\:00.0/iommu_group ../../../../kernel/iommu_groups/18
- ファイル/dev/vfio/先のIOMMUグループを開いてグループファイルディスクリプタvfio_group_fdを得ます。 ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, PCIアドレス)を実行し、返り値としてvfio_dev_fdを得ます。
以下は上記解釈の根拠となるコードの抜粋です。
static int pci_vfio_map_resource_primary(struct rte_pci_device *dev) { // --snip-- ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, &vfio_dev_fd, &device_info); // --snip-- #define SYSFS_PCI_DEVICES "/sys/bus/pci/devices" const char *rte_pci_get_sysfs_path(void) { const char *path = NULL; #ifdef RTE_EXEC_ENV_LINUX path = getenv("SYSFS_PCI_DEVICES"); if (path == NULL) return SYSFS_PCI_DEVICES; #endif return path; } int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, int *vfio_dev_fd, struct vfio_device_info *device_info) { // --snip-- /* get group number */ ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); // --snip-- /* get the actual group fd */ vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); // --snip-- /* get a file descriptor for the device */ *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); // --snip-- int rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num) { // --snip-- /* try to find out IOMMU group for this device */ snprintf(linkname, sizeof(linkname), "%s/%s/iommu_group", sysfs_base, dev_addr); ret = readlink(linkname, filename, sizeof(filename)); // <= ã·ã³ããªãã¯ãªã³ã¯`/sys/bus/pci/devices/PCIã¢ãã¬ã¹/iommu_group`ãèªã // --snip-- ret = rte_strsplit(filename, sizeof(filename), tok, RTE_DIM(tok), '/'); // --snip-- /* IOMMU group is always the last token */ errno = 0; group_tok = tok[ret - 1]; end = group_tok; *iommu_group_num = strtol(group_tok, &end, 10); // <= IOMMUã°ã«ã¼ããæååããæ´æ°ã«å¤æ // --snip-- int rte_vfio_get_group_fd(int iommu_group_num) { struct vfio_config *vfio_cfg; /* get the vfio_config it belongs to */ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; return vfio_get_group_fd(vfio_cfg, iommu_group_num); } static int vfio_get_group_fd(struct vfio_config *vfio_cfg, int iommu_group_num) { // --snip-- vfio_group_fd = vfio_open_group_fd(iommu_group_num); // --snip-- return vfio_group_fd; } #define VFIO_GROUP_FMT "/dev/vfio/%u" static int vfio_open_group_fd(int iommu_group_num) { // --snip-- /* if primary, try to open the group */ if (internal_conf->process_type == RTE_PROC_PRIMARY) { /* try regular group format */ snprintf(filename, sizeof(filename), VFIO_GROUP_FMT, iommu_group_num); vfio_group_fd = open(filename, O_RDWR); // <= ãã¡ã¤ã«`/dev/vfio/å ã®IOMMUã°ã«ã¼ã`ãéã // --snip-- return vfio_group_fd;
6. PCI I/O空間の読み書き
前章でお話しした通り、ファイルディスクリプタvfio_dev_fdにはリージョンがありました。
DPDKのソースコードでVFIO_GET_REGION_ADDR()マクロを使ってPCI コンフィグレーション空間以外のリージョンにアクセスしている箇所を調べると、以下のコードが見つかります。
/* Set up access to a PCI I/O port BAR.  No mmap is involved: the stored
 * "base" is simply the VFIO region offset of the BAR inside the device fd,
 * for later pread64/pwrite64 calls. */
int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p) {
    /* Only BAR0..BAR5 are valid I/O port regions. */
    if (bar < VFIO_PCI_BAR0_REGION_INDEX || bar > VFIO_PCI_BAR5_REGION_INDEX) {
        RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
        return -1;
    }
    p->dev = dev;
    /* Offset of this BAR's region inside the VFIO device fd. */
    p->base = VFIO_GET_REGION_ADDR(bar);
    return 0;
}
/* Read from an I/O port BAR: a plain pread64 on the VFIO device fd at the
 * region base plus the caller's offset.
 * NOTE(review): this excerpt is truncated in the article ("--snip--" style);
 * the error-handling body of the final if is not shown here. */
void pci_vfio_ioport_read(struct rte_pci_ioport *p, void *data, size_t len, off_t offset) {
    const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
    /* The VFIO device fd is stashed in the interrupt handle. */
    int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
    if (vfio_dev_fd < 0)
        return;
    if (pread64(vfio_dev_fd, data, len, p->base + offset) <= 0)
しかし、今回作った環境では上記の関数は使われないようです。 virtioデバイスにはmodernとlegacyの二種類があり、今回の環境で動作しているのはmodernで、PCI I/O空間へのアクセスはlegacyの時のみのようです。
いずれにしても、PCI コンフィグレーション空間と同様に、PCI I/O空間へはファイルディスクリプタvfio_dev_fdのリージョンに読み書きすればアクセスできることがわかりました。
7. PCI メモリ空間の読み書き
またも天下り式になりますが、DPDKでメモリレジスタにアクセスしている箇所を調べてみましょう。きっとvolatileを使っているはずです。
するとrte_read16()関数のようなそれらしいAPIが見つかります。
/* Strongly-ordered 16-bit MMIO read: the relaxed volatile load followed by
 * an I/O read memory barrier (rte_io_rmb). */
static __rte_always_inline uint16_t rte_read16(const volatile void *addr) {
    uint16_t val;
    val = rte_read16_relaxed(addr);
    rte_io_rmb();
    return val;
}
/* Relaxed 16-bit MMIO read: a plain volatile load with no ordering
 * guarantee against other I/O accesses. */
static __rte_always_inline uint16_t rte_read16_relaxed(const volatile void *addr) {
    return *(const volatile uint16_t *)addr;
}
このrte_read16()関数はnet_virtioドライバでは例えば以下のように使われます。
static int modern_setup_queue(struct virtio_hw *hw, struct virtqueue *vq) { struct virtio_pci_dev *dev = virtio_pci_get_dev(hw); // --snip-- notify_off = rte_read16(&dev->common_cfg->queue_notify_off);
dev->common_cfgポインタはstruct virtio_pci_common_cfg *型で、以下のようにレジスタの定義が羅列されていました。
struct virtio_pci_common_cfg { /* About the whole device. */ uint32_t device_feature_select; /* read-write */ uint32_t device_feature; /* read-only */ uint32_t guest_feature_select; /* read-write */ uint32_t guest_feature; /* read-write */ uint16_t msix_config; /* read-write */ uint16_t num_queues; /* read-only */ uint8_t device_status; /* read-write */ uint8_t config_generation; /* read-only */ /* About a specific virtqueue. */ uint16_t queue_select; /* read-write */ uint16_t queue_size; /* read-write, power of 2. */ uint16_t queue_msix_vector; /* read-write */ uint16_t queue_enable; /* read-write */ uint16_t queue_notify_off; /* read-only */
dev->common_cfgポインタはPCIベースアドレスに配置されたPCIケーパビリティです。
このPCIケーパビリティはVIRTIO Committee Specification "4.1.4 Virtio Structure PCI Capabilities"で詳しく解説されています。
コードから追ってみましょう。PCIケーパビリティの読み出しはvirtio_read_caps()関数からはじまります。
int vtpci_init(struct rte_pci_device *pci_dev, struct virtio_pci_dev *dev) { struct virtio_hw *hw = &dev->hw; RTE_BUILD_BUG_ON(offsetof(struct virtio_pci_dev, hw) != 0); /* * Try if we can succeed reading virtio pci caps, which exists * only on modern pci device. If failed, we fallback to legacy * virtio handling. */ if (virtio_read_caps(pci_dev, hw) == 0) {
virtio_read_caps()関数はrte_pci_map_device()関数を呼び出してPCI メモリ空間をマップします。
このマップは後に調べるとして、virtio_read_caps()関数の残りの部分では、VIRTIO Committee Specificationで説明されている通りPCI コンフィグレーション空間からPCIケーパビリティを辿ります。
static int virtio_read_caps(struct rte_pci_device *pci_dev, struct virtio_hw *hw) { struct virtio_pci_dev *dev = virtio_pci_get_dev(hw); uint8_t pos; struct virtio_pci_cap cap; int ret; if (rte_pci_map_device(pci_dev)) { PMD_INIT_LOG(DEBUG, "failed to map pci device!"); return -1; } ret = rte_pci_read_config(pci_dev, &pos, 1, PCI_CAPABILITY_LIST); // --snip-- while (pos) { ret = rte_pci_read_config(pci_dev, &cap, 2, pos); // --snip-- ret = rte_pci_read_config(pci_dev, &cap, sizeof(cap), pos); if (ret != sizeof(cap)) { PMD_INIT_LOG(DEBUG, "failed to read pci cap at pos: %x ret %d", pos, ret); break; } PMD_INIT_LOG(DEBUG, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u", pos, cap.cfg_type, cap.bar, cap.offset, cap.length); switch (cap.cfg_type) { case VIRTIO_PCI_CAP_COMMON_CFG: dev->common_cfg = get_cfg_addr(pci_dev, &cap); break; // --snip-- next: pos = cap.cap_next; } /* Read PCI config space. */ int rte_pci_read_config(const struct rte_pci_device *device, void *buf, size_t len, off_t offset) { char devname[RTE_DEV_NAME_MAX_LEN] = ""; const struct rte_intr_handle *intr_handle = device->intr_handle; switch (device->kdrv) { case RTE_PCI_KDRV_IGB_UIO: case RTE_PCI_KDRV_UIO_GENERIC: return pci_uio_read_config(intr_handle, buf, len, offset); #ifdef VFIO_PRESENT case RTE_PCI_KDRV_VFIO: return pci_vfio_read_config(intr_handle, buf, len, offset); // --snip-- int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, void *buf, size_t len, off_t offs) { int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle); if (vfio_dev_fd < 0) return -1; return pread64(vfio_dev_fd, buf, len, VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); // <= 5ç« ã§èª¿ã¹ãéãPCI ã³ã³ãã£ã°ã¬ã¼ã·ã§ã³ç©ºéããã®èªã¿åºã
上記のコードではPCIケーパビリティのチェーンを辿って、VIRTIO_PCI_CAP_COMMON_CFGというタイプを見つけたら、そのケーパビリティをget_cfg_addr()関数でユーザ空間仮想アドレスのポインタに変換してdev->common_cfgポインタに代入しています。
先のmodern_setup_queue()関数はこのdev->common_cfgポインタの先を読み出していました。
具体的にget_cfg_addr()関数が何を返すのかというと、以下のようにポインタdev->mem_resource[cap->bar].addr + cap->offsetです。
/* Translate a virtio PCI capability (BAR index + offset within that BAR)
 * into a user-space virtual address, using the BAR mapping previously
 * recorded in dev->mem_resource[]. Returns NULL if the BAR is unmapped. */
static void * get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap) {
    uint8_t bar = cap->bar;
    uint32_t length = cap->length;
    uint32_t offset = cap->offset;
    uint8_t *base;
    // --snip--
    /* Mapped user-space base address of the BAR, filled in during
     * pci_vfio_map_resource_primary(). */
    base = dev->mem_resource[bar].addr;
    if (base == NULL) {
        PMD_INIT_LOG(ERR, "bar %u base addr is NULL", bar);
        return NULL;
    }
    /* mapped BAR base + capability offset */
    return base + offset;
}
cap->barとcap->offsetはPCI コンフィグレーション空間から読み出した値です。
ではdev->mem_resource[]配列はどこで初期化されるのでしょうか?
ここで、rte_pci_map_device()関数の続きを調べましょう。
vfio-pciドライバの場合、この関数は単にpci_vfio_map_resource_primary()関数を呼び出します。
int rte_pci_map_device(struct rte_pci_device *dev) { int ret = -1; /* try mapping the NIC resources using VFIO if it exists */ switch (dev->kdrv) { case RTE_PCI_KDRV_VFIO: #ifdef VFIO_PRESENT if (pci_vfio_is_enabled()) ret = pci_vfio_map_resource(dev); #endif // --snip-- int pci_vfio_map_resource(struct rte_pci_device *dev) { if (rte_eal_process_type() == RTE_PROC_PRIMARY) return pci_vfio_map_resource_primary(dev); // --snip--
pci_vfio_map_resource_primary()関数はpci_vfio_get_region_info()関数を経由してvfio_dev_fdにioctl(2)を呼び出してリージョンの情報を読み出します。この情報の中にはリージョンのオフセットreg->offsetとサイズreg->sizeが入っています。
これらオフセットとサイズと共に、PCI メモリ空間をマップするアドレスをhugepageの後ろから決めて、vfio_res->maps[]配列に書き込みます。
void *pci_map_addr = NULL; static int pci_vfio_map_resource_primary(struct rte_pci_device *dev) { // --snip-- /* map BARs */ maps = vfio_res->maps; // --snip-- for (i = 0; i < vfio_res->nb_maps; i++) { struct vfio_region_info *reg = NULL; void *bar_addr; ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); // --snip-- /* try mapping somewhere close to the end of hugepages */ if (pci_map_addr == NULL) pci_map_addr = pci_find_max_end_va(); bar_addr = pci_map_addr; pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); pci_map_addr = RTE_PTR_ALIGN(pci_map_addr, sysconf(_SC_PAGE_SIZE)); maps[i].addr = bar_addr; maps[i].offset = reg->offset; maps[i].size = reg->size; maps[i].path = NULL; /* vfio doesn't have per-resource paths */ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); if (ret < 0) { RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", pci_addr, i, strerror(errno)); free(reg); goto err_vfio_res; } dev->mem_resource[i].addr = maps[i].addr; free(reg); } // --snip-- static int pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, int region) { struct vfio_region_info *ri; size_t argsz = sizeof(*ri); int ret; ri = malloc(sizeof(*ri)); if (ri == NULL) { RTE_LOG(ERR, EAL, "Cannot allocate memory for VFIO region info\n"); return -1; } again: memset(ri, 0, argsz); ri->argsz = argsz; ri->index = region; ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri); // --snip-- *info = ri; return 0; }
pci_vfio_mmap_bar()関数はvfio_res->maps[]配列に書き込まれているリージョンのオフセットとサイズでvfio_dev_fdディスクリプタをmmap(2)でメモリにマップします。
static int pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, int bar_index, int additional_flags) { struct memreg { uint64_t offset; size_t size; } memreg[2] = {}; void *bar_addr; struct pci_msix_table *msix_table = &vfio_res->msix_table; struct pci_map *bar = &vfio_res->maps[bar_index]; // --snip-- /* reserve the address using an inaccessible mapping */ bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE | MAP_ANONYMOUS | additional_flags, -1, 0); if (bar_addr != MAP_FAILED) { void *map_addr = NULL; if (memreg[0].size) { /* actual map of first part */ map_addr = pci_map_resource(bar_addr, vfio_dev_fd, memreg[0].offset, memreg[0].size, RTE_MAP_FORCE_ADDRESS); } // --snip-- bar->addr = bar_addr; return 0; } void * pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size, int additional_flags) { void *mapaddr; /* Map the PCI memory resource of device */ mapaddr = rte_mem_map(requested_addr, size, RTE_PROT_READ | RTE_PROT_WRITE, RTE_MAP_SHARED | additional_flags, fd, offset); // --snip-- void * rte_mem_map(void *requested_addr, size_t size, int prot, int flags, int fd, uint64_t offset) { // --snip-- return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset); } static void * mem_map(void *requested_addr, size_t size, int prot, int flags, int fd, uint64_t offset) { void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
結局メモリマップドI/Oといっても特殊なことはしておらず、PCI I/O空間をアクセスする時と同様にvfio_dev_fdディスクリプタを経由してリージョンにアクセスしているだけです。
PCI I/O空間との違いは、mmap(2)でメモリにマップされてから読み書きされることです。
8. 割り込みの取り扱い
リンクアップ/ダウンをQEMU上で実行するとvirtio_interrupt_handler()関数が実行されることがわかります。
この関数はrte_intr_callback_register()関数でDPDKの起動時にコールバックとしてstruct rte_intr_source_list intr_sourcesに登録します。
int rte_intr_callback_register(const struct rte_intr_handle *intr_handle, rte_intr_callback_fn cb, void *cb_arg) { int ret, wake_thread; struct rte_intr_source *src; struct rte_intr_callback *callback; // --snip-- callback->cb_fn = cb; // <= ã³ã¼ã«ããã¯é¢æ°ãã¤ã³ã¿ callback->cb_arg = cb_arg; callback->pending_delete = 0; callback->ucb_fn = NULL; rte_spinlock_lock(&intr_lock); /* check if there is at least one callback registered for the fd */ TAILQ_FOREACH(src, &intr_sources, next) { if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) { /* we had no interrupts for this */ if (TAILQ_EMPTY(&src->callbacks)) wake_thread = 1; TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); ret = 0; break; } }
以下のように、専用のスレッドでepoll待ち合わせを行い、起きたら先のリストに登録されたコールバックを実行します。
static __rte_noreturn void * eal_intr_thread_main(__rte_unused void *arg) { /* host thread, never break out */ for (;;) { /* build up the epoll fd with all descriptors we are to * wait on then pass it to the handle_interrupts function */ static struct epoll_event pipe_event = { .events = EPOLLIN | EPOLLPRI, }; struct rte_intr_source *src; unsigned numfds = 0; /* create epoll fd */ int pfd = epoll_create(1); // --snip-- rte_spinlock_lock(&intr_lock); TAILQ_FOREACH(src, &intr_sources, next) { struct epoll_event ev; if (src->callbacks.tqh_first == NULL) continue; /* skip those with no callbacks */ memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; ev.data.fd = rte_intr_fd_get(src->intr_handle); // <= å²ãè¾¼ã¿ãã¡ã¤ã«ãã£ã¹ã¯ãªãã¿åå¾ /** * add all the uio device file descriptor * into wait list. */ if (epoll_ctl(pfd, EPOLL_CTL_ADD, rte_intr_fd_get(src->intr_handle), &ev) < 0) { rte_panic("Error adding fd %d epoll_ctl, %s\n", rte_intr_fd_get(src->intr_handle), strerror(errno)); } else numfds++; } rte_spinlock_unlock(&intr_lock); /* serve the interrupt */ eal_intr_handle_interrupts(pfd, numfds); // --snip-- static void eal_intr_handle_interrupts(int pfd, unsigned totalfds) { struct epoll_event events[totalfds]; int nfds = 0; for(;;) { nfds = epoll_wait(pfd, events, totalfds, EAL_INTR_EPOLL_WAIT_FOREVER); // <= æå®ãããã¡ã¤ããã£ã¹ã¯ãªãã¿ã§epollå¾ ã¡åãã // --snip-- /* epoll_wait has at least one fd ready to read */ if (eal_intr_process_interrupts(events, nfds) < 0) // --snip-- static int eal_intr_process_interrupts(struct epoll_event *events, int nfds) { // --snip-- for (n = 0; n < nfds; n++) { // --snip-- rte_spinlock_lock(&intr_lock); TAILQ_FOREACH(src, &intr_sources, next) if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd) break; // --snip-- if (bytes_read > 0) { /** * read out to clear the ready-to-be-read flag * for epoll_wait. 
*/ bytes_read = read(events[n].data.fd, &buf, bytes_read); if (bytes_read < 0) { // --snip-- } else if (bytes_read == 0) RTE_LOG(ERR, EAL, "Read nothing from file " "descriptor %d\n", events[n].data.fd); else call = true; } /* grab a lock, again to call callbacks and update status. */ rte_spinlock_lock(&intr_lock); if (call) { /* Finally, call all callbacks. */ TAILQ_FOREACH(cb, &src->callbacks, next) { /* make a copy and unlock. */ active_cb = *cb; rte_spinlock_unlock(&intr_lock); /* call the actual callback */ active_cb.cb_fn(active_cb.cb_arg); // <= ç»é²ããã¦ããã³ã¼ã«ããã¯ãå¼ã³åºã /*get the lock back. */ rte_spinlock_lock(&intr_lock); }
rte_intr_fd_get(src->intr_handle)で取得して得られるファイルディスクリプタはpci_vfio_setup_interrupts()関数で設定しています。
static int pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) { // --snip-- /* set up an eventfd for interrupts */ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); if (fd < 0) { RTE_LOG(ERR, EAL, "Cannot set up eventfd, error " "%i (%s)\n", errno, strerror(errno)); return -1; } if (rte_intr_fd_set(dev->intr_handle, fd)) return -1;
このeventfdはvfio_enable_msix()関数でVFIOドライバに渡しています。
static int vfio_enable_msix(const struct rte_intr_handle *intr_handle) { int len, ret; char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; struct vfio_irq_set *irq_set; int *fd_ptr, vfio_dev_fd, i; len = sizeof(irq_set_buf); irq_set = (struct vfio_irq_set *) irq_set_buf; irq_set->argsz = len; /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ irq_set->count = rte_intr_max_intr_get(intr_handle) ? (rte_intr_max_intr_get(intr_handle) > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 : rte_intr_max_intr_get(intr_handle)) : 1; irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; irq_set->start = 0; fd_ptr = (int *) &irq_set->data; /* INTR vector offset 0 reserve for non-efds mapping */ fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle); for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) { fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = rte_intr_efds_index_get(intr_handle, i); } vfio_dev_fd = rte_intr_dev_fd_get(intr_handle); ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
VFIOドライバが割り込みハンドラでeventfd_signal(9)を呼び出すことで、先述のepollが起床します。
9. 今回取り上げなかったこと
今回の記事ではDPDKのユーザ空間と、Linuxのカーネル空間の境界を知るためにPCIデバイスがどうDPDKに見えているか深掘りしました。 しかし今回の記事では都合上、以下については扱いませんでした。
- PMDにおけるパケット送受信の詳しい仕組み
- vfio-pciドライバと実際のPCIデバイスの責任分担
これらは機会があれば、次回以降の記事で明らかにしたいと思います。
10. まとめ
本記事ではDPDKのソースコードを読みながらPMDとPCIデバイスの関係を調べました。 ソースコードを読むまではPMDがどう実現されているのかはっきりとは理解できていませんでしたが、調査の結果、以下で実現されていることがわかりました。
- epoll_wait(2)
- eventfd(2)
- ioctl(2)
- mmap(2)
- pthread
- sysfs
これらは特殊な仕組みではなく、Linuxではありふれた技術です。 「ユーザ空間ドライバを作る」といったこれまでの枠組みと離れた機能を実現しようとした時、まずは既存の仕組みを使えないか考えるべきだと言えるでしょう。
一方で今回の調査では、割り込みハンドラが実行されるまでは比較的時間がかかるように読めました。 リンクアップ/ダウンを伝える程度の割り込みであればこのPMDの実装で良いかもしれませんが、割り込み応答がパフォーマンスに大きく影響を与える場合には別の方法を検討した方が良さそうです。