/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>
#include <net/ethernet.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include <net/rss_config.h>

#include "common/efx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
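
/*
 * The RX queue is topped up whenever the number of outstanding descriptors
 * falls below 90% of EFX_RXQ_LIMIT(): sfxge_rx_qcomplete() compares the
 * current fill level against rxq->refill_threshold (set from this macro in
 * sfxge_rx_qinit()) and calls sfxge_rx_qfill() when it drops below it.
 */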
SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
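/*
 * The result is zero if and only if the addresses are equal, so callers can
 * OR several comparison results together and test them once (as sfxge_lro()
 * does when matching a connection). The two return statements below are
 * alternative implementations, selected by a compile-time conditional: a
 * two-word comparison where 64-bit longs are available and a four-word
 * comparison otherwise.
 */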
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);

#endif /* SFXGE_LRO */
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_DONE;

sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_FAILED;

static uint8_t toep_key[RSS_KEYSIZE];

static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
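
/*
 * The two toep_key definitions above are alternatives chosen at build time:
 * with the kernel RSS option the key is sized by the stack and filled in by
 * rss_getkey() from sfxge_rx_start(), so the adapter and the stack agree on
 * flow placement; otherwise the fixed default Toeplitz key is used.
 */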
sfxge_rx_post_refill(void *arg)
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;

	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);

sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);

#define	SFXGE_REFILL_BATCH 64
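
/*
 * Refill descriptors are accumulated in the addr[] array and handed to
 * efx_rx_qpost() in batches of SFXGE_REFILL_BATCH, so the hardware queue is
 * updated at most once per batch rather than once per mbuf.
 */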
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;
	unsigned int mblksize;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);

		/* m_len specifies length of area to be mapped for DMA */
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;
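		/*
		 * The buffer start is cache-line aligned first and then offset
		 * by rx_buffer_align (computed in sfxge_rx_start()) so that,
		 * after the hardware RX prefix and Ethernet header, the IP
		 * header ends up 32-bit aligned.
		 */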
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);

	sfxge_rx_schedule_refill(rxq, retrying);

	efx_rx_qpost(rxq->common, addr, mblksize, batch,
	    rxq->completed, rxq->added);

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);

sfxge_rx_qrefill(struct sfxge_rxq *rxq)
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
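	/*
	 * csum_data of 0xffff together with the CSUM_DATA_VALID and
	 * CSUM_PSEUDO_HDR flags set by the callers tells the stack that the
	 * TCP/UDP checksum has already been verified.
	 */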
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);

sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		efx_psuedo_hdr_hash_get(sc->enp,
		    EFX_RX_HASHALG_TOEPLITZ,
		/* The hash covers a 4-tuple for TCP only */
		    (flags & EFX_PKT_IPV4) ?
		    ((flags & EFX_PKT_TCP) ?
		     M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
		    ((flags & EFX_PKT_TCP) ?
		     M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));

	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;

	KASSERT(m, ("no mbuf to deliver"));

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);

	m->m_pkthdr.flowid = c->conn_hash;
	SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
	    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
	struct sfxge_lro_conn *c;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	c_th->th_flags |= (th->th_flags & TH_PUSH);

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);

sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
	/* Start the chain */
	c->mbuf_tail = c->mbuf;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		iph->ip_len = ntohs(iph->ip_len);
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	c->next_seq = th_seq + data_length;

	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;

	if (__predict_false(dont_merge)) {
		sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
		goto deliver_buf_out;
	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);

	sfxge_rx_deliver(rxq->sc, rx_buf);
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
	c->next_buf.mbuf = NULL;

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;

	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
	    EFX_RX_HASHALG_TOEPLITZ,

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		l3_proto = eh->ether_type;

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);

	bucket = conn_hash & rxq->lro.conns_mask;
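	/*
	 * Connections are looked up by masking the Toeplitz hash into the
	 * table and then matched on VLAN/address-family id, TCP ports and,
	 * when a packet is already held, the IP addresses; packets with no
	 * matching entry fall through to sfxge_lro_new_conn() below.
	 */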
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
				struct ip6_hdr *c_iph, *iph = nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
		c->next_buf = *rx_buf;

		rx_buf->flags = EFX_DISCARD;

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
	sfxge_rx_deliver(sc, rx_buf);
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			sfxge_lro_deliver(st, c);
		LIST_REMOVE(c, active_link);

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);

#else /* !SFXGE_LRO */

sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)

sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)

#endif /* SFXGE_LRO */
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	struct sfxge_evq *evq;
	unsigned int completed;
	struct sfxge_rx_sw_desc *prev = NULL;

	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
			if (~if_capenable & IFCAP_RXCSUM)
				~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;

			/* Check for loopback packets */
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

			    ("Rx descriptor with both IPv4 and IPv6 flags"));

		/* Pass packet up the stack or into LRO (pipelined) */
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
			sfxge_rx_deliver(sc, prev);

		/* Return the packet to the pool */
		rx_desc->mbuf = NULL;

	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
	     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
		sfxge_lro(rxq, prev);
		sfxge_rx_deliver(sc, prev);

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			rxq->flush_state = SFXGE_FLUSH_FAILED;

			/* Spin for 100 ms */
			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
		} while (++count < 20);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;

	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
sfxge_rx_stop(struct sfxge_softc *sc)
	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
sfxge_rx_start(struct sfxge_softc *sc)
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;
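	/*
	 * Worked example: with no hardware RX prefix and a 14-byte Ethernet
	 * header, hdrlen = 14 and P2ROUNDUP(14, 4) = 16, so rx_buffer_align
	 * is 2; sfxge_rx_qfill() offsets the buffer start by those 2 bytes,
	 * leaving the IP header on a 4-byte boundary.
	 */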
	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
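	/*
	 * The indirection table spreads hash values round-robin across the
	 * RX queues, so received flows are distributed over all of them.
	 */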
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)

	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	rss_getkey(toep_key);
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
	    sc->intr.n_alloc > 1);

	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	efx_rx_fini(sc->enp);
static void sfxge_lro_init(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);

	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);

sfxge_lro_init(struct sfxge_rxq *rxq)

sfxge_lro_fini(struct sfxge_rxq *rxq)

#endif /* SFXGE_LRO */
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

static const struct {
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
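/*
 * Each entry maps a sysctl leaf name to the offset of an LRO counter inside
 * struct sfxge_rxq; sfxge_rx_stat_handler() adds up the counter at that
 * offset across all RX queues before reporting it.
 */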
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)

sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
		    sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));

sfxge_rx_stat_init(struct sfxge_softc *sc)
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		    OID_AUTO, sfxge_rx_stats[id].name,
		    CTLTYPE_UINT|CTLFLAG_RD,
		    sc, id, sfxge_rx_stat_handler, "IU",

sfxge_rx_fini(struct sfxge_softc *sc)
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
sfxge_rx_init(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be a power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		goto fail_lro_table_size;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)

	sfxge_rx_stat_init(sc);

	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

fail_lro_table_size: