Guide to writing an iflib driver - 0xffffffRabbit/NextBsd GitHub Wiki
-
#include <net/iflib.h>
-
#include "ifdi_if.h"
-
Add a uint64_t paddr variable to both the transmit ring and the receive ring structures to store each ring's physical address
-
Recommend removing variables that iflib provides - next_avail_desc, next_to_clean, next_to_refresh, num_desc, buf ring, dma & lock related variables
-
Separate the driver queue struct that is an interrupt container for the associated tx and rx ring into 2 separate structures. The receive queue should contain a struct if_irq for interrupts.
-
Adapter structure should contain if_ctx_t & if_softc_ctx_t variables and a struct if_irq.
-
Change struct task to struct grouptask
-
Remove struct task link_task - Handled by iflib
-
Create the PCI DEVICE ID Table – This table is used by probe to select devices to load. Each entry contains your PCI IDs and a device description string using the format: PVID($Vendor_Id, $Device_Id, "$Description_String"), with PVID_END as the last entry. Troubleshooting: If your driver will not load, re-check this section.
/*
 * PCI ID table used by probe to select the devices this driver attaches to.
 * One PVID() entry per supported device; PVID_END terminates the table.
 */
static pci_vendor_info_t my_vendor_info_array[] = {
	PVID(MY_VENDOR_ID, MY_DEVICE_ID_1, "MegaCorp(R) v1 Network Driver"),
	PVID(MY_VENDOR_ID, MY_DEVICE_ID_2, "MegaCorp(R) v2 Network Driver"),
	...
	PVID_END
};
-
Create Device Method Table - Maps standard device methods to iflib and custom driver functions. A developer will need to write a mandatory my_register_function to register a driver with the iflib framework. Note: If the driver is non-IOV then the my_register_function is the only function that needs to be written.
/* FreeBSD Device Interface Entry Points */ static device_method_t xyz_if_methods[] = { /* Device interface */ DEVMETHOD(device_register, my_register_function), DEVMETHOD(device_probe, iflib_device_probe), DEVMETHOD(device_attach, iflib_device_attach), DEVMETHOD(device_detach, iflib_device_detach), DEVMETHOD(device_shutdown, iflib_device_shutdown), DEVMETHOD(device_suspend, iflib_device_suspend), DEVMETHOD(device_resume, iflib_device_resume), #ifdef PCI_IOV DEVMETHOD(pci_iov_init, portal_init_iov), DEVMETHOD(pci_iov_uninit, portal_uninit_iov), DEVMETHOD(pci_iov_add_vf, portal_add_vf), #endif /* PCI_IOV */ DEVMETHOD_END }; -
Complete _if_methods table to declare device independent functions and add iflib to MODULE_DEPEND. Troubleshooting: This module depends on iflib being in the kernel.
/* The driver module cannot load unless iflib is present in the kernel. */
MODULE_DEPEND(xyz, iflib, 1, 1, 1);

/* ifdi method table: the device-independent callbacks iflib invokes. */
static device_method_t xyz_if_methods[] = {
	DEVMETHOD(ifdi_attach_pre, xyz_if_attach_pre),
	....
	DEVMETHOD(ifdi_timer, xyz_if_timer),
	DEVMETHOD_END
};

/* Driver description tying the ifdi methods to the softc size. */
static driver_t xyz_if_driver = {
	"xyz_if",
	xyz_if_methods,
	sizeof(struct xyz_softc)
};
-
The iflib shared context structure if_shared_ctx contains initialization values. The following fields must be completed prior to driver registration:
static struct if_shared_ctx my_sctx_init = { .isc_magic = IFLIB_MAGIC, /* DON'T CHANGE ME */ .isc_q_align = DBA_ALIGN, /* Alignment value-DBA_ALIGN,PAGE_SIZE,etc */ .isc_tx_maxsize = TSO_MAX_SIZE, /* Maximum Transfer Size */ .isc_tx_maxsegsize = TSO_MAX_SEG,/* Maximum Transfer Segment Size */ .isc_rx_maxsize = PAGE_SIZE*4, /* Maximum Receive Size */ .isc_rx_nsegments = 1, /* Number of Receive Segments - Default 1 */ .isc_rx_maxsegsize = PAGE_SIZE*4,/* Maximum Receive Segment Size */ .isc_ntxd = DEFAULT_TXD, /* Number of Transmit Desc in transmit ring */ .isc_nrxd = DEFAULT_RXD, /* Number of Receive Desc in receive ring */ .isc_nfl = 2, /* Number of Free Lists - Default 1 */ /* Set size of each transmit queue - Default single tx queue */ /* Troubleshooting: Queue size parameters are entirely dependent on each individual driver. For example + sizeof(32) is unique to this example driver. */ .isc_txqsizes[0] = roundup2((DEFAULT_TXD * sizeof(union portal_tx_desc)) + sizeof(u32), DBA_ALIGN), .isc_txqsizes[1] = roundup2(TX_BD_NUM_PAGES * PAGE_SIZE, DBA_ALIGN), /* Set size of each receive queue - Default single rx queue */ .isc_rxqsizes[0] = roundup2(DEFAULT_RXD * sizeof(union portal_rx_desc), DBA_ALIGN), .isc_rxqsizes[1] = roundup2(RX_BD_NUM_PAGES * PAGE_SIZE, DBA_ALIGN), .isc_rxqsizes[2] = roundup2(RCQ_NUM_PAGES * PAGE_SIZE, DBA_ALIGN), .isc_ntxqs = 2, /* Number of Transmit Queues */ .isc_nrxqs = 3, /* Number of Receive Queues */ .isc_admin_intrcnt = 1 /* Number of Admin Interrupts - Default 1 */ .isc_vendor_info = my_vendor_info_array, /* Vendor Info Table */ .isc_driver_version = my_driver_version, /* Driver Version */ .isc_txrx = &my_txrx, /* Structure containing transmit/receive functions */ .isc_driver = &my_if_driver, ........ }; -
Set the my_sctx iflib shared context variable to point at the initialized context values.
/* Shared context handle; points at the statically initialized my_sctx_init. */
if_shared_ctx_t my_sctx = &my_sctx_init;
-
Create a my_register_function. This function will do any necessary runtime checking on kenv tunable variables. It will return the shared context my_sctx.
/*
 * device_register method: the place for any runtime checking of kenv
 * tunables.  Returns the shared context (my_sctx).
 */
static void *
my_register_function(device_t dev)
{
	...
	return (my_sctx);
}
-
Implement the Attach_pre function:
/* Per-device softc context; attach_pre fills in the fields below. */
typedef struct if_softc_ctx {
	int isc_msix_bar;		/* MSIX Base Address Register */
	int isc_tx_nsegments;		/* Number of Transmit Segments */
	int isc_tx_tso_segments_max;	/* Max Transmit TSO Segments */
	int isc_tx_tso_size_max;	/* Max Transmit TSO Size */
	int isc_tx_tso_segsize_max;	/* Max Transmit TSO Segment Size */
	int isc_rss_table_size;		/* RSS Table Size - Required for Multiple Queues */
	...
} *if_softc_ctx_t;
-
Initialize certain fields in if_softc_ctx_t
-
Initialize the struct xyz_softc
-
Initialize .isc_nrxd, .isc_ntxd, .isc_txqsizes, .isc_rxqsizes, and .isc_max_txqsets / .isc_max_rxsets
-
Allocate pci resources, multicast array memory, and initialize shared code.
-
Setup some sysctls for the tunable interrupt delays
-
Notify iflib of the driver’s mac address by using iflib_set_mac
It may include the following:
-
Setup OS specific network interface
-
Initialization of Statistics
-
Set an initial default control flow & dmac value
-
Let the hardware know that the driver is loaded
/* ifdi method table entries for detach, shutdown, suspend and resume. */
DEVMETHOD(ifdi_detach, xyz_if_detach), DEVMETHOD(ifdi_shutdown, xyz_if_shutdown), DEVMETHOD(ifdi_suspend, xyz_if_suspend), DEVMETHOD(ifdi_resume, xyz_if_resume)
/* ifdi method table entries for allocating and freeing the tx/rx queues. */
DEVMETHOD(ifdi_tx_queues_alloc, xyz_if_tx_queues_alloc),
DEVMETHOD(ifdi_rx_queues_alloc, xyz_if_rx_queues_alloc),
DEVMETHOD(ifdi_queues_free, xyz_if_queues_free)
/*
 * ifdi_tx_queues_alloc method: allocate the driver's per-queue-set transmit
 * state and record the addresses of the hardware queues handed in by iflib.
 *
 * ctx      - iflib context
 * vaddrs   - virtual addresses of the hardware queues; iflib passes ntxqs
 *            entries for each queue set
 * paddrs   - physical addresses of the same queues, in the same order
 * ntxqs    - number of hardware queues in each transmit queue set
 * ntxqsets - number of transmit queue sets
 *
 * Returns 0 on success, or ENOMEM if an allocation fails.
 */
static int
xyz_if_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs,
    int ntxqs, int ntxqsets)
{
	....
	/* Allocate queue memory structure */
	xyz_softc->tx_queues = (struct xyz_tx_queue *)malloc(
	    sizeof(struct xyz_tx_queue) * ntxqsets, M_DEVBUF,
	    M_NOWAIT | M_ZERO);
	/* M_NOWAIT allocations can fail, so the result must be checked. */
	if (xyz_softc->tx_queues == NULL)
		return (ENOMEM);
	.....
	/* Iterate through queue list */
	for (i = 0, my_queue = xyz_softc->tx_queues; i < ntxqsets;
	    i++, my_queue++) {
		struct tx_ring *txr = &my_queue->txr;

		/* Allocate memory for transmit buffers */
		txr->tx_buffers = (struct xyz_tx_buf *)malloc(
		    sizeof(struct xyz_tx_buf) * xyz_sctx->isc_ntxd, M_DEVBUF,
		    M_NOWAIT | M_ZERO);
		if (txr->tx_buffers == NULL) {
			/* Release whatever was allocated for earlier sets. */
			xyz_if_queues_free(ctx);
			return (ENOMEM);
		}
		....
		/*
		 * Get the virtual and physical address of the hardware queues.
		 * Each set owns ntxqs consecutive entries, so the descriptor
		 * ring of set i is found at index i * ntxqs.
		 */
		txr->tx_base = (union xyz_tx_desc *)vaddrs[i * ntxqs];
		txr->tx_paddr = paddrs[i * ntxqs];
		....
		/* If required set eop to -1 */
		for (j = 0; j < xyz_sctx->isc_ntxd; j++) {
			txr->tx_buffers[j].eop = -1;
		}
	}
	....
	/* Initialize and attach task groups */
	iflib_config_gtask_init(ctx, &xyz_softc->mod_task, xyz_handle_mod, "mod_task");
	iflib_config_gtask_init(ctx, &xyz_softc->phy_task, xyz_handle_phy, "phy_task");
	....
	return (0);
}
Troubleshooting: You don’t need iflib_config_gtask_init for link task.
DEVMETHOD(ifdi_mtu_set, xyz_if_mtu_set), DEVMETHOD(ifdi_init, xyz_if_init), DEVMETHOD(ifdi_media_status, xyz_if_media_status) DEVMETHOD(ifdi_media_change, xyz_if_media_change) DEVMETHOD(ifdi_promisc_set, xyz_if_promisc_set) DEVMETHOD(ifdi_multi_set, xyz_if_multi_set) DEVMETHOD(ifdi_timer, xyz_if_timer) DEVMETHOD(ifdi_stop, xyz_if_stop) DEVMETHOD(ifdi_intr_enable, xyz_if_intr_enable) DEVMETHOD(ifdi_intr_disable, xyz_if_intr_disable) DEVMETHOD(ifdi_vlan_register, xyz_if_vlan_register) DEVMETHOD(ifdi_vlan_unregister, xyz_if_vlan_unregister)
Notes: You should be able to remove most instances of these functions from your code: callout functions, probe function, code referencing locks or mutexes, dma related functions, bus alloc/teardown/setup_intr functions, vlan attach/detach, buf_ring_alloc, and mbuf related code. Troubleshooting: Substitute iflib_link_state_change function for any existing link_state_change function.
12. Ioctl - There should be no ioctl function as iflib calls individual functions (ie xyz_if_init, xyz_if_mtu_set, etc) directly to configure the interface depending upon the case (ie. case SIOCSIFMTU, case SIOCADDMULTI, etc).
-
Setup MSIX/MSI/Legacy interrupt routines and handlers
Troubleshooting: The admin queue will be allocated either first or last, depending upon driver specifications.
/*
 * ifdi_msix_intr_assign method: hook up one interrupt per receive queue, a
 * soft interrupt per transmit queue, and finally the admin (link) interrupt.
 *
 * ctx  - iflib context
 * msix - not used by the code shown here
 *
 * Returns 0 on success, or the error from iflib_irq_alloc_generic.
 */
static int
xyz_if_msix_intr_assign(if_ctx_t ctx, int msix)
{
	struct xyz_rx_queue *rx_que = xyz_softc->rx_queues;
	struct xyz_tx_queue *tx_que;
	int error;
	....
	/*
	 * Walk the receive queues with the local cursor rx_que; the softc's
	 * rx_queues pointer must keep pointing at the start of the array.
	 */
	for (int i = 0; i < xyz_softc->num_rx_queues; i++, vector++, rx_que++) {
		rid = vector + 1;
		snprintf(buf, sizeof(buf), "rxq%d", i);
		error = iflib_irq_alloc_generic(ctx, &rx_que->que_irq, rid,
		    IFLIB_INTR_RX, xyz_msix_que, rx_que, rx_que->index, buf);
		if (error != 0)
			return (error);
	}
	....
	for (int i = 0; i < xyz_softc->num_tx_queues; i++) {
		snprintf(buf, sizeof(buf), "txq%d", i);
		tx_que = &xyz_softc->tx_queues[i];
		iflib_softirq_alloc_generic(ctx, rid, IFLIB_INTR_TX, tx_que,
		    tx_que->txr.me, buf);
	}

	/* The admin interrupt takes the vector after the last rx queue. */
	rid = vector + 1;
	error = iflib_irq_alloc_generic(ctx, &xyz_softc->irq, rid,
	    IFLIB_INTR_ADMIN, xyz_msix_link, xyz_softc, 0, "aq");
	if (error != 0)
		return (error);

	return (0);
}
static int xyz_msix_que(void *arg)
static int xyz_msix_link(void *arg)
The MSIX queue and link interrupt service routines are required. Each takes a void *arg, which is the argument that was registered with iflib_irq_alloc_generic (in the example above, the receive queue for xyz_msix_que and the softc structure for xyz_msix_link). A routine returns FILTER_HANDLED when there is no further work to be done, for example if the interrupt is not ours or it only required stats collection. It returns FILTER_SCHEDULE_THREAD to schedule the thread for additional work. Troubleshooting: IFLIB handles all disabling and enabling of interrupts.
Recommend: Place transmit & receive code in another file ie. xyz_txrx.c and then reference it in the main file
extern struct if_txrx xyz_txrx;
/* In xyz_txrx.c */
/*
 * Transmit/receive method table handed to iflib.  The initializer is
 * positional, so the entries must stay in the order struct if_txrx declares
 * its members.
 */
struct if_txrx xyz_txrx = {
xyz_isc_txd_encap,		/* set up tx descriptors for a packet */
xyz_isc_txd_flush,		/* advance the transmit tail */
xyz_isc_txd_credits_update,	/* reclaim completed transmit descriptors */
xyz_isc_rxd_available,		/* count received descriptors */
xyz_isc_rxd_pkt_get,		/* fetch the next received packet */
xyz_isc_rxd_refill,		/* repopulate the receive free list */
xyz_isc_rxd_flush,		/* advance the receive tail */
xyz_intr			/* NOTE(review): presumably the legacy interrupt handler - confirm */
};
Recommend: Look at how other iflib drivers implement their txrx functions.
static int xyz_isc_txd_encap(void *arg, if_pkt_info_t pi)
The isc_txd_encap routine maps the mbufs to tx descriptors, allowing the TX engine to transmit the packets. The code in isc_txd_encap should incorporate the original driver's xmit function code that is not handled in iflib. This should include setting up the appropriate context descriptor for VLAN, CSUM, or TSO, setting up the transmit descriptors, and marking the end of packet. IFLIB handles all bus mapping and mbuf management. IFLIB provides if_pkt_info_t pi, a packet information structure containing information on the mbuf to be sent. Troubleshooting: pi->ipi_pidx is the index of the first descriptor to use for the packet being sent. You must set pi->ipi_new_pidx in the encap function to the index where the next packet will start.
/* Example Code */
/*
 * isc_txd_flush method: write pidx to the transmit tail register of queue
 * txqid so the hardware picks up the newly prepared descriptors.
 *
 * arg   - the driver softc, passed as an opaque pointer
 * txqid - index of the transmit queue
 * pidx  - value to write to the tail register
 */
static void
xyz_isc_txd_flush(void *arg, uint16_t txqid, uint32_t pidx)
{
	/* arg is a void pointer and must be converted before it is used. */
	struct xyz_softc *sc = arg;
	struct tx_ring *txr = &sc->tx_queues[txqid].txr;

	IXGBE_WRITE_REG(&sc->hw, txr->tail, pidx);
}
isc_txd_flush advances the Transmit Descriptor Tail, telling the hardware that this frame is available to transmit. Similarly, isc_rxd_flush advances the Receive Descriptor Tail. arg refers to the softc adapter, txqid is the id number of the particular transmit queue, and pidx is the producer index to write to the tail register.
/* Example Code */
/*
 * isc_txd_credits_update method: report how many transmit descriptors lie
 * between cidx and the head value obtained from ixl_get_tx_head().
 *
 * Returns that descriptor count.  The clear argument is not referenced by
 * the code shown here.
 */
static int
ixl_isc_txd_credits_update(void *arg, uint16_t qid, uint32_t cidx, bool clear) {
.....
/* Get the Head WB value */
head = ixl_get_tx_head(que);
credits = head - cidx;
/*
 * A negative difference means head has wrapped past the end of the ring.
 * NOTE(review): credits must be a signed type for this test to work - its
 * declaration is in the elided code above.
 */
if (credits < 0)
credits += scctx->isc_ntxd;
return (credits);
}
isc_txd_credits_update reclaims buffers for completed transmissions. If the hardware is done processing the packet then put the associated buffer on the free queue. If the routine is called by iflib_tx_credits_update the clear flag will be true, otherwise if it is called by iflib_txq_can_drain the flag will be false.
/* Example Code - isc_rxd_refill */
/*
 * isc_rxd_refill method: store count buffer addresses from paddrs into
 * consecutive receive descriptors, starting at index pidx.
 * flid and vaddrs are unused.
 */
static void xyz_isc_rxd_refill(void *arg, uint16_t rxqid, uint8_t flid __unused,
uint32_t pidx, uint64_t *paddrs, caddr_t *vaddrs __unused, uint16_t count) {
....
for (i = 0, next_pidx = pidx; i < count; i++) {
/* The address is stored in the descriptor in little-endian form. */
rxr->rx_base[next_pidx].read.pkt_addr = htole64(paddrs[i]);
/* Wrap back to the first descriptor at the end of the ring. */
if (++next_pidx == xyz_sctx->isc_nrxd)
next_pidx = 0;
}
}
isc_rxd_refill repopulates a receive queue free-buffer list with count new buffers. The routine simply writes the physical addresses of the new buffers, provided in paddrs, into the driver’s receive descriptors.
/* Example Code - isc_rxd_available */
/*
 * isc_rxd_available method: starting at descriptor idx, count descriptors
 * whose status has the end-of-packet bit set.
 *
 * Returns the count, which the loop condition caps at isc_nrxd.
 */
static int xyz_isc_rxd_available(void *arg, uint16_t rxqid, uint32_t idx) {
....
for (cnt = 0, i = idx; cnt < scctx->isc_nrxd;) {
rxd = &rxr->rx_base[i];
/* The status word is converted from little-endian before testing. */
staterr = le32toh(rxd->wb.upper.status_error);
/*
 * NOTE(review): nothing visible here advances i or leaves the loop early;
 * presumably the elided code below stops at a descriptor the hardware has
 * not completed and moves i to the next descriptor - confirm.
 */
.....
if (staterr & E1000_RXD_STAT_EOP)
cnt++;
}
return (cnt);
}
isc_rxd_available returns a count of new descriptors available. It should iterate through the descriptors and should not increment the count for empty descriptors. This count is used by iflib_rxeof to determine how many times to call isc_rxd_pkt_get.
isc_rxd_pkt_get returns the next packet and fills in the if_rxd_info_t structure.
-
Fill in ri->iri_len, ri->iri_vtag, and ri->iri_flags
-
Handle any receive checksum
-
Handle packet fragments - Fill in the struct if_rxd_frag