2 * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * netmap modifications for ixgbe
31 * This file is meant to be a reference on how to implement
32 * netmap support for a network driver.
33 * This file contains code but only static or inline functions
34 * that are used by a single driver. To avoid replication of
35 * code we just #include it near the beginning of the
39 #include <net/netmap.h>
40 #include <sys/selinfo.h>
42 * Some drivers may need the following headers. Others
43 * already include them by default
49 #include <dev/netmap/netmap_kern.h>
52 * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
53 * During regular operations the CRC is stripped, but on some
54 * hardware reception of frames whose size is not a multiple of 64 is slower,
55 * so using crcstrip=0 helps in benchmarks.
57 * ix_rx_miss, ix_rx_miss_bufs:
58 * count packets that might be missed due to lost interrupts.
61 * use the dd bit for completed tx transmissions.
62 * This is tricky, much better to use TDH for now.
/*
 * Driver-private netmap tunables and counters.
 * The four ints are file-static (hence zero-initialized), and each
 * SYSCTL_INT below attaches one of them to the _dev_netmap node with
 * CTLFLAG_RW, i.e. readable and writable at run time.
 */
64 SYSCTL_DECL(_dev_netmap);
65 static int ix_rx_miss, ix_rx_miss_bufs, ix_use_dd, ix_crcstrip;
/* ix_crcstrip is read by set_crcstrip() and by the rx path when computing slot lengths */
66 SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip,
67 CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames");
/* NOTE(review): no reader of ix_use_dd is visible in this listing -- confirm where it is tested */
68 SYSCTL_INT(_dev_netmap, OID_AUTO, ix_use_dd,
69 CTLFLAG_RW, &ix_use_dd, 0, "use dd instead of tdh to detect tx frames");
70 SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss,
71 CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr");
/* ix_rx_miss_bufs is incremented by the rx sync routine */
72 SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs,
73 CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs");
76 * wrapper to export locks to the generic netmap code.
/*
 * Lock dispatcher installed as na.nm_lock by ixgbe_netmap_attach().
 *
 * _a:      interface; its if_softc is used as the driver's struct adapter.
 * what:    one of the NETMAP_*_LOCK / NETMAP_*_UNLOCK selectors.
 * queueid: ring index; only the per-ring TX/RX lock operations use it,
 *          the core lock operations take the adapter alone.
 *
 * NOTE(review): several short lines of this function (return type,
 * braces, the switch header and some case labels) are not present in
 * this listing.
 */
79 ixgbe_netmap_lock_wrapper(struct ifnet *_a, int what, u_int queueid)
81 struct adapter *adapter = _a->if_softc;
/* queueid indexes tx_rings[] and rx_rings[] below, so it is checked first */
83 ASSERT(queueid < adapter->num_queues);
85 case NETMAP_CORE_LOCK:
86 IXGBE_CORE_LOCK(adapter);
88 case NETMAP_CORE_UNLOCK:
89 IXGBE_CORE_UNLOCK(adapter);
/* NOTE(review): the label for this arm is missing here; presumably NETMAP_TX_LOCK -- confirm */
92 IXGBE_TX_LOCK(&adapter->tx_rings[queueid]);
94 case NETMAP_TX_UNLOCK:
95 IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]);
/* NOTE(review): the label for this arm is missing here; presumably NETMAP_RX_LOCK -- confirm */
98 IXGBE_RX_LOCK(&adapter->rx_rings[queueid]);
100 case NETMAP_RX_UNLOCK:
101 IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]);
/*
 * Program the CRC-strip bits in both IXGBE_HLREG0 and IXGBE_RDRXCTL.
 *
 * hw:    handle passed to IXGBE_READ_REG / IXGBE_WRITE_REG.
 * onoff: non-zero when entering netmap mode, zero when leaving it
 *        (the debug messages print "enter" / "exit" accordingly).
 *
 * Both registers are read, modified in local copies and written back.
 * NOTE(review): the declarations of hl/rxc, the else line and any guard
 * around the D() calls are not present in this listing.
 */
108 set_crcstrip(struct ixgbe_hw *hw, int onoff)
110 /* crc stripping is set in two places:
111 * IXGBE_HLREG0 (modified on init_locked and hw reset)
112 * IXGBE_RDRXCTL (set by the original driver in
113 * ixgbe_setup_hw_rsc() called in init_locked.
114 * We disable the setting when netmap is compiled in).
115 * We update the values here, but also in ixgbe.c because
116 * init_locked sometimes is called outside our control.
120 hl = IXGBE_READ_REG(hw, IXGBE_HLREG0);
121 rxc = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
123 D("%s read HLREG 0x%x rxc 0x%x",
124 onoff ? "enter" : "exit", hl, rxc);
125 /* hw requirements ... */
/* these two RDRXCTL updates precede the onoff test, so they apply in both modes */
126 rxc &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
127 rxc |= IXGBE_RDRXCTL_RSCACKC;
/* the strip bits are cleared only when entering netmap mode with ix_crcstrip == 0 */
128 if (onoff && !ix_crcstrip) {
129 /* keep the crc. Fast rx */
130 hl &= ~IXGBE_HLREG0_RXCRCSTRP;
131 rxc &= ~IXGBE_RDRXCTL_CRCSTRIP;
133 /* reset default mode */
134 hl |= IXGBE_HLREG0_RXCRCSTRP;
135 rxc |= IXGBE_RDRXCTL_CRCSTRIP;
138 D("%s write HLREG 0x%x rxc 0x%x",
139 onoff ? "enter" : "exit", hl, rxc);
/* commit the updated copies to the hardware */
140 IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hl);
141 IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc);
145 * Register/unregister. We are already under core lock.
146 * Only called on the first register or the last unregister.
/*
 * Switch the interface into netmap mode (onoff != 0) or back to
 * standard mode (onoff == 0). Installed as na.nm_register.
 *
 * Visible sequence: interrupts are disabled and the RUNNING/OACTIVE
 * driver flags are cleared, CRC stripping is programmed, then either
 *  - IFCAP_NETMAP is set, if_transmit is saved in na->if_transmit and
 *    replaced by netmap_start, and the adapter is reinitialized; or
 *  - if_transmit is restored, IFCAP_NETMAP is cleared and the adapter
 *    is reinitialized in standard mode.
 * set_crcstrip() is called once more near the end.
 *
 * Returns EINVAL on the "no netmap support" path; the other return
 * values are on lines not present in this listing.
 */
149 ixgbe_netmap_reg(struct ifnet *ifp, int onoff)
151 struct adapter *adapter = ifp->if_softc;
152 struct netmap_adapter *na = NA(ifp);
/* NOTE(review): the guard for this return is not visible; presumably a missing netmap adapter (na) -- confirm */
156 return EINVAL; /* no netmap support here */
158 ixgbe_disable_intr(adapter);
160 /* Tell the stack that the interface is no longer active */
161 ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
163 set_crcstrip(&adapter->hw, onoff);
164 if (onoff) { /* enable netmap mode */
165 ifp->if_capenable |= IFCAP_NETMAP;
167 /* save if_transmit and replace with our routine */
168 na->if_transmit = ifp->if_transmit;
169 ifp->if_transmit = netmap_start;
172 * reinitialize the adapter, now with netmap flag set,
173 * so the rings will be set accordingly.
175 ixgbe_init_locked(adapter);
/* neither RUNNING nor OACTIVE is set after init; the handling of this case is on lines not shown here */
176 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
180 } else { /* reset normal mode (explicit request or netmap failed) */
182 /* restore if_transmit */
183 ifp->if_transmit = na->if_transmit;
184 ifp->if_capenable &= ~IFCAP_NETMAP;
185 /* initialize the card, this time in standard mode */
186 ixgbe_init_locked(adapter); /* also enables intr */
/* reapplied after the init calls above; see the note in set_crcstrip() about init_locked */
188 set_crcstrip(&adapter->hw, onoff);
194 * Reconcile kernel and user view of the transmit ring.
195 * This routine might be called frequently so it must be efficient.
197 * ring->cur holds the userspace view of the current ring index. Userspace
198 * has filled the tx slots from the previous call's ring->cur up to but not
199 * including ring->cur for this call. In this function the kernel updates
200 * kring->nr_hwcur to ring->cur, thus slots [kring->nr_hwcur, ring->cur) are
201 * now ready to transmit. At the last interrupt kring->nr_hwavail slots were
204 * This function runs under lock (acquired from the caller or internally).
205 * It must first update ring->avail to what the kernel knows,
206 * subtract the newly used slots (ring->cur - kring->nr_hwcur)
207 * from both avail and nr_hwavail, and set kring->nr_hwcur = ring->cur
208 * issuing a dmamap_sync on all slots.
210 * Since ring comes from userspace, its content must be read only once,
211 * and validated before being used to update the kernel's structures.
212 * (this is also true for every use of ring in the kernel).
214 * ring->avail is never used, only checked for bogus values.
216 * do_lock is set iff the function is called from the ioctl handler.
217 * In this case, grab a lock around the body, and also reclaim transmitted
218 * buffers irrespective of interrupt mitigation.
/*
 * Netmap transmit-sync callback for ixgbe (installed as na.nm_txsync).
 *
 * ifp:     interface; if_softc gives the adapter.
 * ring_nr: index into adapter->tx_rings and na->tx_rings.
 * do_lock: see the description preceding this function.
 *
 * Visible phases:
 *  1. slots from kring->nr_hwcur up to ring->cur (excluded) are written
 *     into the NIC ring and the TDT register is updated;
 *  2. a decision is taken on whether to reclaim completed descriptors;
 *  3. on reclaim, txr->next_to_clean is advanced and kring->nr_hwavail
 *     is increased by the number of completed slots.
 * Finally ring->avail is refreshed from kring->nr_hwavail.
 *
 * NOTE(review): several short lines (declarations, braces, some
 * conditions and the final return) are absent from this listing.
 */
221 ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
223 struct adapter *adapter = ifp->if_softc;
224 struct tx_ring *txr = &adapter->tx_rings[ring_nr];
225 struct netmap_adapter *na = NA(adapter->ifp);
226 struct netmap_kring *kring = &na->tx_rings[ring_nr];
227 struct netmap_ring *ring = kring->ring;
/* ring->cur is read once into k; lim is the highest valid ring index */
229 u_int const k = ring->cur, lim = kring->nkr_num_slots - 1;
232 * ixgbe can generate an interrupt on every tx packet, but it
233 * seems very expensive, so we interrupt once every half ring,
234 * or when requested with NS_REPORT
236 u_int report_frequency = kring->nkr_num_slots >> 1;
/* NOTE(review): the guard for this return is not visible; presumably an out-of-range k -- confirm */
239 return netmap_ring_reinit(kring);
243 bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
244 BUS_DMASYNC_POSTREAD);
247 * Process new packets to send. j is the current index in the
248 * netmap ring, l is the corresponding index in the NIC ring.
249 * The two numbers differ because upon a *_init() we reset
250 * the NIC ring but leave the netmap ring unchanged.
251 * For the transmit ring, we have
253 * j = kring->nr_hwcur
254 * l = IXGBE_TDT (not tracked in the driver)
256 * j == (l + kring->nkr_hwofs) % ring_size
258 * In this driver kring->nkr_hwofs >= 0, but for other
259 * drivers it might be negative as well.
262 if (j != k) { /* we have new packets to send */
263 prefetch(&ring->slot[j]);
264 l = netmap_idx_k2n(kring, j); /* NIC index */
265 prefetch(&txr->tx_buffers[l]);
/* n counts the slots handed to the NIC in this call */
266 for (n = 0; j != k; n++) {
268 * Collect per-slot info.
269 * Note that txbuf and curr are indexed by l.
271 * In this driver we collect the buffer address
272 * (using the PNMB() macro) because we always
273 * need to rewrite it into the NIC ring.
274 * Many other drivers preserve the address, so
275 * we only need to access it if NS_BUF_CHANGED
277 * XXX note, on this device the dmamap* calls are
278 * not necessary because tag is 0, however just accessing
279 * the per-packet tag kills 1Mpps at 900 MHz.
281 struct netmap_slot *slot = &ring->slot[j];
282 union ixgbe_adv_tx_desc *curr = &txr->tx_base[l];
283 struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[l];
285 // XXX type for flags and len ?
/* request ReportStatus when the slot asks for it, or at netmap index 0 and at the half-ring index */
286 int flags = ((slot->flags & NS_REPORT) ||
287 j == 0 || j == report_frequency) ?
288 IXGBE_TXD_CMD_RS : 0;
289 u_int len = slot->len;
290 void *addr = PNMB(slot, &paddr);
/* advance both indexes, wrapping to 0 after lim, and prefetch the next entries */
292 j = (j == lim) ? 0 : j + 1;
293 l = (l == lim) ? 0 : l + 1;
294 prefetch(&ring->slot[j]);
295 prefetch(&txr->tx_buffers[l]);
298 * Quick check for valid addr and len.
299 * NMB() returns netmap_buffer_base for invalid
300 * buffer indexes (but the address is still a
301 * valid one to be used in a ring). slot->len is
302 * unsigned so no need to check for negative values.
304 if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
/* bad slot: drop the tx lock and have netmap reinitialize the ring */
307 IXGBE_TX_UNLOCK(txr);
308 return netmap_ring_reinit(kring);
311 if (slot->flags & NS_BUF_CHANGED) {
312 /* buffer has changed, unload and reload map */
313 netmap_reload_map(txr->txtag, txbuf->map, addr);
314 slot->flags &= ~NS_BUF_CHANGED;
316 slot->flags &= ~NS_REPORT;
318 * Fill the slot in the NIC ring.
319 * In this driver we need to rewrite the buffer
320 * address in the NIC ring. Other drivers do not
322 * Use legacy descriptor, it is faster.
324 curr->read.buffer_addr = htole64(paddr);
325 curr->read.olinfo_status = 0;
326 curr->read.cmd_type_len = htole32(len | flags |
327 IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP);
329 /* make sure changes to the buffer are synced */
330 bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE);
332 kring->nr_hwcur = k; /* the saved ring->cur */
333 /* decrease avail by number of packets sent */
334 kring->nr_hwavail -= n;
336 /* synchronize the NIC ring */
337 bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
338 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
339 /* (re)start the transmitter up to slot l (excluded) */
340 IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), l);
344 * Reclaim buffers for completed transmissions.
345 * Because this is expensive (we read a NIC register etc.)
346 * we only do it in specific cases (see below).
347 * In all cases kring->nr_kflags indicates which slot will be
348 * checked upon a tx interrupt (nkr_num_slots means none).
/* NOTE(review): from here j appears to be reused as a "reclaim now" flag; the test on it is not visible -- confirm */
351 j = 1; /* forced reclaim, ignore interrupts */
352 kring->nr_kflags = kring->nkr_num_slots;
353 } else if (kring->nr_hwavail > 0) {
354 j = 0; /* buffers still available: no reclaim, ignore intr. */
355 kring->nr_kflags = kring->nkr_num_slots;
358 * no buffers available, locate a slot for which we request
359 * ReportStatus (approximately half ring after next_to_clean)
360 * and record it in kring->nr_kflags.
361 * If the slot has DD set, do the reclaim looking at TDH,
362 * otherwise we go to sleep (in netmap_poll()) and will be
363 * woken up when slot nr_kflags will be ready.
365 struct ixgbe_legacy_tx_desc *txd =
366 (struct ixgbe_legacy_tx_desc *)txr->tx_base;
368 j = txr->next_to_clean + kring->nkr_num_slots/2;
369 if (j >= kring->nkr_num_slots)
370 j -= kring->nkr_num_slots;
371 // round to the closest with dd set
/* snap to 0 or report_frequency, the two fixed indexes tested when setting the RS flag above */
372 j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ?
373 0 : report_frequency;
374 kring->nr_kflags = j; /* the slot to check */
375 j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ?
381 * Record completed transmissions.
382 * We (re)use the driver's txr->next_to_clean to keep
383 * track of the most recently completed transmission.
385 * The datasheet discourages the use of TDH to find out the
386 * number of sent packets. We should rather check the DD
387 * status bit in a packet descriptor. However, we only set
388 * the "report status" bit for some descriptors (a kind of
389 * interrupt mitigation), so we can only check on those.
390 * For the time being we use TDH, as we do it infrequently
391 * enough not to pose performance problems.
/* NOTE(review): the statement that L400 below continues, and the choice between the DD scan and the TDH read, are on lines not shown here */
394 struct ixgbe_legacy_tx_desc *txd =
395 (struct ixgbe_legacy_tx_desc *)txr->tx_base;
396 u_int k1 = netmap_idx_k2n(kring, kring->nr_hwcur);
397 l = txr->next_to_clean;
400 txd[l].upper.fields.status & IXGBE_TXD_STAT_DD) {
402 l = (l == lim) ? 0 : l + 1;
405 l = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr));
406 if (l >= kring->nkr_num_slots) { /* XXX can happen */
408 l -= kring->nkr_num_slots;
/* distance from the previous next_to_clean to the new position l */
410 delta = l - txr->next_to_clean;
413 /* some tx completed, increment avail */
/* NOTE(review): the guard for this correction is not visible; presumably applied when delta is negative (wraparound) -- confirm */
415 delta += kring->nkr_num_slots;
416 txr->next_to_clean = l;
417 kring->nr_hwavail += delta;
418 if (kring->nr_hwavail > lim)
422 /* update avail to what the kernel knows */
423 ring->avail = kring->nr_hwavail;
426 IXGBE_TX_UNLOCK(txr);
432 * Reconcile kernel and user view of the receive ring.
433 * Same as for the txsync, this routine must be efficient and
434 * avoid races in accessing the shared regions.
436 * When called, userspace has read data from slots kring->nr_hwcur
437 * up to ring->cur (excluded).
439 * The last interrupt reported kring->nr_hwavail slots available
440 * after kring->nr_hwcur.
441 * We must subtract the newly consumed slots (cur - nr_hwcur)
442 * from nr_hwavail, make the descriptors available for the next reads,
443 * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail.
445 * do_lock has a special meaning: please refer to txsync.
/*
 * Netmap receive-sync callback for ixgbe (installed as na.nm_rxsync).
 *
 * ifp:     interface; if_softc gives the adapter.
 * ring_nr: index into adapter->rx_rings and na->rx_rings.
 * do_lock: see the description preceding this function.
 *
 * Visible phases:
 *  1. descriptors with the DD bit set are imported into the netmap
 *     ring (slot len and flags), advancing rxr->next_to_check and
 *     kring->nr_hwavail;
 *  2. slots released by userspace are given back to the NIC by
 *     rewriting the descriptor address and updating the RDT register;
 *  3. ring->avail is refreshed as nr_hwavail minus the reserved count.
 * The last two lines are a separate exit that unlocks and calls
 * netmap_ring_reinit().
 *
 * NOTE(review): several short lines (loop headers, braces, break/goto
 * statements, labels and returns) are absent from this listing.
 */
448 ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
450 struct adapter *adapter = ifp->if_softc;
451 struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
452 struct netmap_adapter *na = NA(adapter->ifp);
453 struct netmap_kring *kring = &na->rx_rings[ring_nr];
454 struct netmap_ring *ring = kring->ring;
455 u_int j, l, n, lim = kring->nkr_num_slots - 1;
/* force_update is set when do_lock is non-zero or NKR_PENDINTR is flagged on the kring */
456 int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR;
/* the userspace-owned fields cur and reserved are read once into locals */
457 u_int k = ring->cur, resvd = ring->reserved;
/* NOTE(review): the guard for this return is not visible; presumably an out-of-range k -- confirm */
460 return netmap_ring_reinit(kring);
464 /* XXX check sync modes */
465 bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
466 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
469 * First part, import newly received packets into the netmap ring.
471 * j is the index of the next free slot in the netmap ring,
472 * and l is the index of the next received packet in the NIC ring,
473 * and they may differ in case if_init() has been called while
474 * in netmap mode. For the receive ring we have
476 * j = (kring->nr_hwcur + kring->nr_hwavail) % ring_size
477 * l = rxr->next_to_check;
479 * j == (l + kring->nkr_hwofs) % ring_size
481 * rxr->next_to_check is set to 0 on a ring reinit
483 if (netmap_no_pendintr || force_update) {
/* when the CRC is kept (ix_crcstrip == 0) 4 bytes are subtracted from the reported length */
484 int crclen = ix_crcstrip ? 0 : 4;
485 uint16_t slot_flags = kring->nkr_slot_flags;
487 l = rxr->next_to_check;
488 j = netmap_idx_n2k(kring, l);
491 union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l];
492 uint32_t staterr = le32toh(curr->wb.upper.status_error);
/* DD clear: the action taken is on a line not shown; presumably the scan stops here -- confirm */
494 if ((staterr & IXGBE_RXD_STAT_DD) == 0)
496 ring->slot[j].len = le16toh(curr->wb.upper.length) - crclen;
497 ring->slot[j].flags = slot_flags;
498 bus_dmamap_sync(rxr->ptag,
499 rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD);
500 j = (j == lim) ? 0 : j + 1;
501 l = (l == lim) ? 0 : l + 1;
503 if (n) { /* update the state variables */
/* slots found although no interrupt was pending are accounted as possibly missed */
504 if (netmap_no_pendintr && !force_update) {
507 ix_rx_miss_bufs += n;
509 rxr->next_to_check = l;
510 kring->nr_hwavail += n;
512 kring->nr_kflags &= ~NKR_PENDINTR;
516 * Skip past packets that userspace has released
517 * (from kring->nr_hwcur to ring->cur - ring->reserved excluded),
518 * and make the buffers available for reception.
519 * As usual j is the index in the netmap ring, l is the index
520 * in the NIC ring, and j == (l + kring->nkr_hwofs) % ring_size
/* reserved + avail must stay below the ring size; otherwise the reservation is dropped */
524 if (resvd + ring->avail >= lim + 1) {
525 D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
526 ring->reserved = resvd = 0; // XXX panic...
/* move k back by resvd slots, wrapping around the ring */
528 k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
530 if (j != k) { /* userspace has released some packets. */
531 l = netmap_idx_k2n(kring, j);
532 for (n = 0; j != k; n++) {
533 /* collect per-slot info, with similar validations
534 * and flag handling as in the txsync code.
536 * NOTE curr and rxbuf are indexed by l.
537 * Also, this driver needs to update the physical
538 * address in the NIC ring, but other drivers
539 * may not have this requirement.
541 struct netmap_slot *slot = &ring->slot[j];
542 union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l];
543 struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[l];
545 void *addr = PNMB(slot, &paddr);
547 if (addr == netmap_buffer_base) /* bad buf */
550 if (slot->flags & NS_BUF_CHANGED) {
551 netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
552 slot->flags &= ~NS_BUF_CHANGED;
/* clear the write-back status and store the buffer address for the next reception */
554 curr->wb.upper.status_error = 0;
555 curr->read.pkt_addr = htole64(paddr);
556 bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
557 BUS_DMASYNC_PREREAD);
558 j = (j == lim) ? 0 : j + 1;
559 l = (l == lim) ? 0 : l + 1;
/* n slots were returned to the NIC */
561 kring->nr_hwavail -= n;
563 bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
564 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
565 /* IMPORTANT: we must leave one free slot in the ring,
566 * so move l back by one unit
568 l = (l == 0) ? lim : l - 1;
569 IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), l);
571 /* tell userspace that there are new packets */
572 ring->avail = kring->nr_hwavail - resvd;
575 IXGBE_RX_UNLOCK(rxr);
/* separate exit: unlock and let netmap reinitialize the ring. NOTE(review): its label and guards are not visible */
580 IXGBE_RX_UNLOCK(rxr);
581 return netmap_ring_reinit(kring);
586 * The attach routine, called near the end of ixgbe_attach(),
587 * fills the parameters for netmap_attach() and calls it.
588 * It cannot fail, in the worst case (such as no memory)
589 * netmap mode will be disabled and the driver will only
590 * operate in standard mode.
/*
 * Describe this adapter to netmap.
 *
 * A struct netmap_adapter is built on the stack, zeroed, and filled
 * with the interface pointer, the tx/rx descriptor counts and the four
 * driver callbacks defined in this file (txsync, rxsync, lock wrapper,
 * register). It is then passed to netmap_attach() together with the
 * number of queues.
 *
 * adapter: driver softc supplying ifp, num_tx_desc, num_rx_desc and
 *          num_queues.
 *
 * NOTE(review): the return type and braces are not present in this
 * listing; no use of netmap_attach()'s result is visible.
 */
593 ixgbe_netmap_attach(struct adapter *adapter)
595 struct netmap_adapter na;
/* start from an all-zero descriptor so that fields not assigned below are 0 */
597 bzero(&na, sizeof(na));
599 na.ifp = adapter->ifp;
600 na.separate_locks = 1; /* this card has separate rx/tx locks */
601 na.num_tx_desc = adapter->num_tx_desc;
602 na.num_rx_desc = adapter->num_rx_desc;
603 na.nm_txsync = ixgbe_netmap_txsync;
604 na.nm_rxsync = ixgbe_netmap_rxsync;
605 na.nm_lock = ixgbe_netmap_lock_wrapper;
606 na.nm_register = ixgbe_netmap_reg;
607 netmap_attach(&na, adapter->num_queues);