/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"
#define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
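/* A queue is topped up once its fill level drops below 90% of
 * EFX_RXQ_LIMIT(entries); see the refill_threshold check in
 * sfxge_rx_qcomplete() below.
 */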
/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
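/* For example, an untagged TCP/IPv4 stream has l2_id == 0, the same stream
 * on VLAN 5 has l2_id == (5 | SFXGE_LRO_L2_ID_VLAN), and TCP/IPv6 streams
 * additionally carry SFXGE_LRO_L2_ID_IPV6; see sfxge_lro() below.
 */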
/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
    const struct in6_addr *right)
{
#if LONG_BIT == 64
        const uint64_t *left64 = (const uint64_t *)left;
        const uint64_t *right64 = (const uint64_t *)right;

        return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
        return (left->s6_addr32[0] - right->s6_addr32[0]) |
               (left->s6_addr32[1] - right->s6_addr32[1]) |
               (left->s6_addr32[2] - right->s6_addr32[2]) |
               (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}
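/* Note: the subtraction/OR trick above yields zero if and only if the two
 * addresses are identical; the LRO lookup in sfxge_lro() uses the result
 * purely as an equality test.
 */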
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{
        rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{
        rxq->flush_state = SFXGE_FLUSH_FAILED;
}
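/* Toeplitz hash key used for receive-side scaling (RSS); it is programmed
 * into the controller by sfxge_rx_start() below.  The byte values appear to
 * be the well-known example key from the Microsoft RSS specification.
 */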
static uint8_t toep_key[] = {
        0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
        0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
        0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
        0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
        0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
static void
sfxge_rx_post_refill(void *arg)
{
        struct sfxge_rxq *rxq = arg;
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        uint16_t magic;

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];

        magic = SFXGE_MAGIC_RX_QREFILL | index;

        /* This is guaranteed due to the start/stop order of rx and ev */
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq not started"));
        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));
        efx_ev_qpost(evq->common, magic);
}
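/* Rather than touching the RX ring from the callout directly, the refill
 * request is posted as a software ("magic") event on the queue's event
 * queue, so the refill itself runs in the event-processing context that
 * serialises access to the queue.
 */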
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
        /* Initially retry after 100 ms, but back off in case of
         * repeated failures as we probably have to wait for the
         * administrator to raise the pool limit. */
        if (retrying)
                rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
        else
                rxq->refill_delay = hz / 10;

        callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
            sfxge_rx_post_refill, rxq);
}
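/* With hz ticks per second this schedules the first retry 100 ms out and
 * then doubles the delay on every further failure (200 ms, 400 ms, ...) up
 * to a ceiling of 10 seconds.
 */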
static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
        struct mb_args args;
        struct mbuf *m;

        /* Allocate mbuf structure */
        args.flags = M_PKTHDR;
        args.type = MT_DATA;
        m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

        /* Allocate (and attach) packet buffer */
        if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
                uma_zfree(zone_mbuf, m);
                m = NULL;
        }

        return (m);
}
#define SFXGE_REFILL_BATCH  64
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];

        prefetch_read_many(sc->enp);
        prefetch_read_many(rxq->common);

        mtx_assert(&evq->lock, MA_OWNED);

        if (rxq->init_state != SFXGE_RXQ_STARTED)
                return;

        rxfill = rxq->added - rxq->completed;
        KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
            ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
        ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
        KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
            ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

        if (ntodo == 0)
                return;

        batch = 0;
        mblksize = sc->rx_buffer_size;
        while (ntodo-- > 0) {
                struct mbuf *m;
                struct sfxge_rx_sw_desc *rx_desc;
                bus_dma_segment_t seg;
                unsigned int id;

                id = (rxq->added + batch) & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

                rx_desc->flags = EFX_DISCARD;
                m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
                if (m == NULL)
                        break;
                sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
                addr[batch++] = seg.ds_addr;

                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(rxq->common, addr, mblksize, batch,
                            rxq->completed, rxq->added);
                        rxq->added += batch;
                        batch = 0;
                }
        }

        if (ntodo != 0)
                sfxge_rx_schedule_refill(rxq, retrying);

        if (batch != 0) {
                efx_rx_qpost(rxq->common, addr, mblksize, batch,
                    rxq->completed, rxq->added);
                rxq->added += batch;
        }

        /* Make the descriptors visible to the hardware */
        bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
            BUS_DMASYNC_PREWRITE);

        efx_rx_qpush(rxq->common, rxq->added);
}
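/* DMA addresses are posted to the controller in batches of up to
 * SFXGE_REFILL_BATCH and the ring pointer is pushed once at the end; the
 * idea is to amortise the cost of the doorbell write over many descriptors.
 */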
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
        if (rxq->init_state != SFXGE_RXQ_STARTED)
                return;

        /* Make sure the queue is full */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
        struct ifnet *ifp = sc->ifnet;

        m->m_pkthdr.rcvif = ifp;
        m->m_pkthdr.csum_data = 0xffff;
        ifp->if_input(ifp, m);
}
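/* Note: together with the CSUM_DATA_VALID | CSUM_PSEUDO_HDR flags set by
 * the callers, csum_data == 0xffff tells the stack that the TCP/UDP
 * checksum has already been verified, so it is not recomputed in software.
 */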
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
        struct mbuf *m = rx_desc->mbuf;
        int csum_flags;

        /* Convert checksum flags */
        csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
            (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
        if (rx_desc->flags & EFX_CKSUM_TCPUDP)
                csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

        /* The hash covers a 4-tuple for TCP only */
        if (rx_desc->flags & EFX_PKT_TCP) {
                m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
                    mtod(m, uint8_t *));
                m->m_flags |= M_FLOWID;
        }

        m->m_data += sc->rx_prefix_size;
        m->m_len = rx_desc->size - sc->rx_prefix_size;
        m->m_pkthdr.len = m->m_len;
        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, rx_desc->mbuf);

        rx_desc->flags = EFX_DISCARD;
        rx_desc->mbuf = NULL;
}
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
        struct sfxge_softc *sc = st->sc;
        struct mbuf *m = c->mbuf;

        KASSERT(m, ("no mbuf to deliver"));

        /* Finish off packet munging and recalculate IP header checksum. */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len = htons(iph->ip_len);
                iph->ip_sum = in_cksum_hdr(iph);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                    CSUM_IP_CHECKED | CSUM_IP_VALID);
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen = htons(iph->ip6_plen);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

        c_th->th_win = c->th_last->th_win;
        c_th->th_ack = c->th_last->th_ack;
        if (c_th->th_off == c->th_last->th_off) {
                /* Copy TCP options (take care to avoid going negative). */
                int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
                memcpy(c_th + 1, c->th_last + 1, optlen);

        m->m_pkthdr.flowid = c->conn_hash;
        m->m_flags |= M_FLOWID;

        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, m);
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
        KASSERT(!c->mbuf, ("found orphaned mbuf"));

        if (c->next_buf.mbuf != NULL) {
                sfxge_rx_deliver(rxq->sc, &c->next_buf);
                LIST_REMOVE(c, active_link);

        bucket = c->conn_hash & rxq->lro.conns_mask;
        KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
        --rxq->lro.conns_n[bucket];
        TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
        TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
        struct sfxge_lro_conn *c;

        KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
            ("found active connections"));

        rxq->lro.last_purge_ticks = now;
        for (i = 0; i <= rxq->lro.conns_mask; ++i) {
                if (TAILQ_EMPTY(&rxq->lro.conns[i]))
                        continue;

                c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
                if (now - c->last_pkt_ticks > lro_idle_ticks) {
                        ++rxq->lro.n_drop_idle;
                        sfxge_lro_drop(rxq, c);
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
        /* Tack the new mbuf onto the chain. */
        KASSERT(!mbuf->m_next, ("mbuf already chained"));
        c->mbuf_tail->m_next = mbuf;

        /* Increase length appropriately */
        c->mbuf->m_pkthdr.len += mbuf->m_len;

        /* Update the connection state flags */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        c_th->th_flags |= (th->th_flags & TH_PUSH);
        /* Pass packet up now if another segment could overflow the IP
         * length.
         */
        if (c->mbuf->m_pkthdr.len > 65536 - 9200)
                sfxge_lro_deliver(st, c);
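/* NB: the check above stops merging before the 16-bit IP total length could
 * overflow; 9200 appears to be the driver's assumed maximum size of one
 * further jumbo-frame segment.
 */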
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
        /* Start the chain */
        c->mbuf_tail = c->mbuf;

        mbuf->m_pkthdr.len = mbuf->m_len;

        /* Mangle header fields for later processing */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                iph->ip_len = ntohs(iph->ip_len);
                struct ip6_hdr *iph = nh;
                iph->ip6_plen = ntohs(iph->ip6_plen);
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
        struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
        char *eh = c->next_eh;
        int data_length, hdr_length, dont_merge;
        unsigned th_seq, pkt_length;
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
                struct ip6_hdr *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;

        hdr_length = (char *) th + th->th_off * 4 - eh;
        data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
            hdr_length);
        th_seq = ntohl(th->th_seq);
        dont_merge = ((data_length <= 0)
            | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

        /* Check for options other than aligned timestamp. */
        if (th->th_off != 5) {
                const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
                if (th->th_off == 8 &&
                    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
                    (TCPOPT_NOP << 16) |
                    (TCPOPT_TIMESTAMP << 8) |
                    TCPOLEN_TIMESTAMP)) {
                        /* timestamp option -- okay */
        if (__predict_false(th_seq != c->next_seq)) {
                /* Out-of-order, so start counting again. */
                sfxge_lro_deliver(&rxq->lro, c);
                c->n_in_order_pkts -= lro_loss_packets;
                c->next_seq = th_seq + data_length;
                ++rxq->lro.n_misorder;
                goto deliver_buf_out;
        c->next_seq = th_seq + data_length;

        if (now - c->last_pkt_ticks > lro_idle_ticks) {
                ++rxq->lro.n_drop_idle;
                sfxge_lro_deliver(&rxq->lro, c);
                sfxge_lro_drop(rxq, c);
        c->last_pkt_ticks = ticks;

        if (c->n_in_order_pkts < lro_slow_start_packets) {
                /* May be in slow-start, so don't merge. */
                ++rxq->lro.n_slow_start;
                ++c->n_in_order_pkts;
                goto deliver_buf_out;

        if (__predict_false(dont_merge)) {
                sfxge_lro_deliver(&rxq->lro, c);
                if (th->th_flags & (TH_FIN | TH_RST)) {
                        ++rxq->lro.n_drop_closed;
                        sfxge_lro_drop(rxq, c);
                goto deliver_buf_out;

        rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

        if (__predict_true(c->mbuf != NULL)) {
                /* Remove headers and any padding */
                rx_buf->mbuf->m_data += hdr_length;
                rx_buf->mbuf->m_len = data_length;
                sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
                /* Remove any padding */
                rx_buf->mbuf->m_len = pkt_length;
                sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);

        sfxge_rx_deliver(rxq->sc, rx_buf);
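/* In short: out-of-order segments, idle timeouts, slow-start and
 * loss-recovery heuristics, unexpected TCP flags or options, and empty
 * payloads all cause the buffered packet to be delivered as-is (and may
 * drop the connection from LRO tracking); only clean, in-order payload is
 * merged into, or starts, an LRO super-packet.
 */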
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
        unsigned bucket = conn_hash & st->conns_mask;
        struct sfxge_lro_conn *c;

        if (st->conns_n[bucket] >= lro_chain_max) {

        if (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
                c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);

        c->next_buf.mbuf = NULL;

        /* Create the connection tracking data */
        ++st->conns_n[bucket];
        TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
        c->conn_hash = conn_hash;
        c->source = th->th_sport;
        c->dest = th->th_dport;
        c->n_in_order_pkts = 0;
        c->last_pkt_ticks = *(volatile int *)&ticks;

        /* NB. We don't initialise c->next_seq, and it doesn't matter what
         * value it has.  Most likely the next packet received for this
         * connection will not match -- no harm done.
         */
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_buf->mbuf;
        struct ether_header *eh;
        struct sfxge_lro_conn *c;
        /* Get the hardware hash */
        conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
            mtod(m, uint8_t *));

        eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
                l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
                    SFXGE_LRO_L2_ID_VLAN;
                l3_proto = veh->evl_proto;
                l3_proto = eh->ether_type;
        /* Check whether this is a suitable packet (unfragmented
         * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
         * length, and compute a hash if necessary.  If not, return.
         */
        if (l3_proto == htons(ETHERTYPE_IP)) {
                if ((iph->ip_p - IPPROTO_TCP) |
                    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
                    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
                th = (struct tcphdr *)(iph + 1);
        } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
                struct ip6_hdr *iph = nh;
                if (iph->ip6_nxt != IPPROTO_TCP)
                l2_id |= SFXGE_LRO_L2_ID_IPV6;
                th = (struct tcphdr *)(iph + 1);
        bucket = conn_hash & rxq->lro.conns_mask;

        TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
                if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
                if ((c->source - th->th_sport) | (c->dest - th->th_dport))
                if (c->mbuf != NULL) {
                        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                                struct ip *c_iph, *iph = nh;
                                if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
                                    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
                                struct ip6_hdr *c_iph, *iph = nh;
                                if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
                                    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))

                /* Re-insert at head of list to reduce lookup time. */
                TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
                TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

                if (c->next_buf.mbuf != NULL) {
                        if (!sfxge_lro_try_merge(rxq, c))
                        LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
                            active_link);
                c->next_buf = *rx_buf;

        rx_buf->flags = EFX_DISCARD;

        sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
        sfxge_rx_deliver(sc, rx_buf);
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;

        while (!LIST_EMPTY(&st->active_conns)) {
                c = LIST_FIRST(&st->active_conns);
                if (!c->delivered && c->mbuf != NULL)
                        sfxge_lro_deliver(st, c);
                if (sfxge_lro_try_merge(rxq, c)) {
                        sfxge_lro_deliver(st, c);
                        LIST_REMOVE(c, active_link);

        t = *(volatile int *)&ticks;
        if (__predict_false(t != st->last_purge_ticks))
                sfxge_lro_purge_idle(rxq, t);
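/* Idle-connection purging is rate-limited to at most once per tick by the
 * last_purge_ticks comparison above, so the sweep over the hash table does
 * not run on every burst.
 */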
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
        struct sfxge_softc *sc = rxq->sc;
        int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
        struct sfxge_evq *evq;
        unsigned int completed;
        struct sfxge_rx_sw_desc *prev = NULL;

        evq = sc->evq[index];

        mtx_assert(&evq->lock, MA_OWNED);

        completed = rxq->completed;
        while (completed != rxq->pending) {
                struct sfxge_rx_sw_desc *rx_desc;

                id = completed++ & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];

                if (rxq->init_state != SFXGE_RXQ_STARTED)

                if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))

                prefetch_read_many(mtod(m, caddr_t));

                /* Check for loopback packets */
                if (!(rx_desc->flags & EFX_PKT_IPV4) &&
                    !(rx_desc->flags & EFX_PKT_IPV6)) {
                        struct ether_header *etherhp;

                        etherhp = mtod(m, struct ether_header *);

                        if (etherhp->ether_type ==
                            htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                                EFSYS_PROBE(loopback);

                /* Pass packet up the stack or into LRO (pipelined) */
                sfxge_lro(rxq, prev);
                sfxge_rx_deliver(sc, prev);

                /* Return the packet to the pool */
                rx_desc->mbuf = NULL;

        rxq->completed = completed;

        level = rxq->added - rxq->completed;

        /* Pass last packet up the stack or into LRO */
        sfxge_lro(rxq, prev);
        sfxge_rx_deliver(sc, prev);

        /*
         * If there are any pending flows and this is the end of the
         * poll then they must be completed.
         */
        sfxge_lro_end_of_burst(rxq);

        /* Top up the queue if necessary */
        if (level < rxq->refill_threshold)
                sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
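/* Packets are handed up one iteration behind ("prev"), so the prefetch
 * issued above has time to pull the next packet's headers into cache before
 * they are inspected.
 */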
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;

        rxq = sc->rxq[index];
        evq = sc->evq[index];

        mtx_lock(&evq->lock);

        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        callout_stop(&rxq->refill_callout);

        rxq->flush_state = SFXGE_FLUSH_PENDING;

        /* Flush the receive queue */
        efx_rx_qflush(rxq->common);

        mtx_unlock(&evq->lock);

        /* Spin for 100 ms */
                if (rxq->flush_state != SFXGE_FLUSH_PENDING)
        } while (++count < 20);

        mtx_lock(&evq->lock);

        if (rxq->flush_state == SFXGE_FLUSH_FAILED)

        rxq->flush_state = SFXGE_FLUSH_DONE;

        rxq->pending = rxq->added;
        sfxge_rx_qcomplete(rxq, B_TRUE);

        KASSERT(rxq->completed == rxq->pending,
            ("rxq->completed != rxq->pending"));

        /* Destroy the common code receive queue. */
        efx_rx_qdestroy(rxq->common);

        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));

        mtx_unlock(&evq->lock);
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;

        rxq = sc->rxq[index];
        evq = sc->evq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq->init_state != SFXGE_EVQ_STARTED"));

        /* Program the buffer table. */
        if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
            EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)

        /* Create the common code receive queue. */
        if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
            esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,

        mtx_lock(&evq->lock);

        /* Enable the receive queue. */
        efx_rx_qenable(rxq->common);

        rxq->init_state = SFXGE_RXQ_STARTED;

        /* Try to fill the queue from the pool. */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

        mtx_unlock(&evq->lock);

        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));
sfxge_rx_stop(struct sfxge_softc *sc)
        struct sfxge_intr *intr;

        /* Stop the receive queue(s) */
        index = intr->n_alloc;
        sfxge_rx_qstop(sc, index);

        sc->rx_prefix_size = 0;
        sc->rx_buffer_size = 0;

        efx_rx_fini(sc->enp);
sfxge_rx_start(struct sfxge_softc *sc)
        struct sfxge_intr *intr;

        /* Initialize the common code receive module. */
        if ((rc = efx_rx_init(sc->enp)) != 0)

        /* Calculate the receive packet buffer size. */
        sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
        sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
            sc->rx_prefix_size);

        /* Select zone for packet buffers */
        if (sc->rx_buffer_size <= MCLBYTES)
                sc->rx_buffer_zone = zone_clust;
        else if (sc->rx_buffer_size <= MJUMPAGESIZE)
                sc->rx_buffer_zone = zone_jumbop;
        else if (sc->rx_buffer_size <= MJUM9BYTES)
                sc->rx_buffer_zone = zone_jumbo9;
        else
                sc->rx_buffer_zone = zone_jumbo16;
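        /* These are the standard mbuf cluster zones: MCLBYTES is 2 KB,
         * MJUMPAGESIZE is one page (typically 4 KB), MJUM9BYTES is 9 KB and
         * zone_jumbo16 holds 16 KB clusters; the smallest cluster that fits
         * the MTU-derived buffer (including the RX prefix) is chosen.
         */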
        /*
         * Set up the scale table.  Enable all hash types and hash insertion.
         */
        for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
                sc->rx_indir_table[index] = index % sc->intr.n_alloc;
        if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
            SFXGE_RX_SCALE_MAX)) != 0)
        (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
            (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
            (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
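        /* Because hash insertion is enabled, the controller prepends the
         * computed Toeplitz hash to each received packet; sfxge_rx_deliver()
         * and sfxge_lro() above read it back with EFX_RX_HASH_VALUE() to
         * obtain the mbuf flowid and the LRO connection hash.
         */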
        if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
            sizeof(toep_key))) != 0)

        /* Start the receive queue(s). */
        for (index = 0; index < intr->n_alloc; index++) {
                if ((rc = sfxge_rx_qstart(sc, index)) != 0)

        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

        efx_rx_fini(sc->enp);
static void sfxge_lro_init(struct sfxge_rxq *rxq)
        struct sfxge_lro_state *st = &rxq->lro;

        st->conns_mask = lro_table_size - 1;
        KASSERT(!((st->conns_mask + 1) & st->conns_mask),
            ("lro_table_size must be a power of 2"));
        st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
        st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
        for (i = 0; i <= st->conns_mask; ++i) {
                TAILQ_INIT(&st->conns[i]);
        LIST_INIT(&st->active_conns);
        TAILQ_INIT(&st->free_conns);
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;

        /* Return cleanly if sfxge_lro_init() has not been called. */
        if (st->conns == NULL)

        KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

        for (i = 0; i <= st->conns_mask; ++i) {
                while (!TAILQ_EMPTY(&st->conns[i])) {
                        c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
                        sfxge_lro_drop(rxq, c);

        while (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
                KASSERT(!c->mbuf, ("found orphaned mbuf"));

        free(st->conns_n, M_SFXGE);
        free(st->conns, M_SFXGE);
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
        struct sfxge_rxq *rxq;

        rxq = sc->rxq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

        /* Free the context array and the flow table. */
        free(rxq->queue, M_SFXGE);
        sfxge_lro_fini(rxq);

        /* Release DMA memory. */
        sfxge_dma_free(&rxq->mem);

        sc->rxq[index] = NULL;
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;

        KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

        rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
        rxq->entries = sc->rxq_entries;
        rxq->ptr_mask = rxq->entries - 1;
        rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

        sc->rxq[index] = rxq;

        evq = sc->evq[index];

        /* Allocate and zero DMA space. */
        if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
        (void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(sc->rxq_entries));

        /* Allocate buffer table entries. */
        sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),

        /* Allocate the context array and the flow table. */
        rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
            M_SFXGE, M_WAITOK | M_ZERO);
        sfxge_lro_init(rxq);

        callout_init(&rxq->refill_callout, B_TRUE);

        rxq->init_state = SFXGE_RXQ_INITIALIZED;
static const struct {
        const char *name;
        size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
        { #name, offsetof(struct sfxge_rxq, member) }
        SFXGE_RX_STAT(lro_merges, lro.n_merges),
        SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
        SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
        SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
        SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
        SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
        SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
        SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
        struct sfxge_softc *sc = arg1;
        unsigned int id = arg2;
        unsigned int sum, index;

        /* Sum across all RX queues */
        for (index = 0; index < sc->intr.n_alloc; index++)
                sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
                    sfxge_rx_stats[id].offset);

        return (SYSCTL_OUT(req, &sum, sizeof(sum)));
sfxge_rx_stat_init(struct sfxge_softc *sc)
        struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
        struct sysctl_oid_list *stat_list;

        stat_list = SYSCTL_CHILDREN(sc->stats_node);

            id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
                    OID_AUTO, sfxge_rx_stats[id].name,
                    CTLTYPE_UINT|CTLFLAG_RD,
                    sc, id, sfxge_rx_stat_handler, "IU",
sfxge_rx_fini(struct sfxge_softc *sc)
        struct sfxge_intr *intr;

        index = intr->n_alloc;
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);
sfxge_rx_init(struct sfxge_softc *sc)
        struct sfxge_intr *intr;

        if (lro_idle_ticks == 0)
                lro_idle_ticks = hz / 10 + 1;   /* 100 ms */
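        /* hz / 10 ticks is roughly 100 ms regardless of the configured
         * kernel HZ; the "+ 1" presumably just keeps the value non-zero even
         * for very small HZ settings.
         */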
        KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
            ("intr->state != SFXGE_INTR_INITIALIZED"));

        /* Initialize the receive queue(s) - one per interrupt. */
        for (index = 0; index < intr->n_alloc; index++) {
                if ((rc = sfxge_rx_qinit(sc, index)) != 0)

        sfxge_rx_stat_init(sc);

        /* Tear down the receive queue(s). */
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);