 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>
#include <net/ethernet.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include <net/rss_config.h>

#include "common/efx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");
/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");
/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN	0x4000
#define	SFXGE_LRO_L2_ID_IPV6	0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;

	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}
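/*
 * NB: a zero result means the addresses are equal; any nonzero value means
 * they differ.  Callers OR several such subtractions together so that the
 * connection-lookup fast path stays free of conditional branches.
 */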
#endif	/* SFXGE_LRO */

sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_DONE;

sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_FAILED;
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
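/*
 * NB: the static array above is the well-known sample Toeplitz key from the
 * Microsoft RSS specification; when the kernel is built with the RSS option
 * the system-wide key is fetched with rss_getkey() in sfxge_rx_start()
 * instead.
 */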
sfxge_rx_post_refill(void *arg)
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;

	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
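	/*
	 * NB: posting a software event, rather than refilling directly from
	 * the callout, defers the refill to the event queue handler, which
	 * runs with the EVQ lock that sfxge_rx_qfill() asserts.
	 */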
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
#define	SFXGE_REFILL_BATCH	64
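/*
 * Buffer DMA addresses are accumulated into a local array of this size and
 * handed to efx_rx_qpost() one batch at a time; the doorbell is only written
 * once per refill, by the final efx_rx_qpush().
 */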
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;
	unsigned int mblksize;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);

		/* m_len specifies length of area to be mapped for DMA */
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);

		sfxge_rx_schedule_refill(rxq, retrying);

		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);

sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		    efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
		/* The hash covers a 4-tuple for TCP only */
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));

	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;

	KASSERT(m, ("no mbuf to deliver"));

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);

	m->m_pkthdr.flowid = c->conn_hash;
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
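	/* NB: dropped connections are recycled via the free list; their
	 * memory is only released in sfxge_lro_fini(). */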
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
	struct sfxge_lro_conn *c;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
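	/* NB: the 9200-byte margin below appears to leave room for one more
	 * jumbo-MTU segment before the 16-bit IP total length would
	 * overflow. */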
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
	/* Start the chain */
	c->mbuf_tail = c->mbuf;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
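	/* NB: ip_len/ip6_plen are kept in host byte order while segments are
	 * being merged; sfxge_lro_deliver() converts them back to network
	 * order before the packet is handed to the stack. */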
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
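	/* NB: bitwise OR rather than || keeps this branch-free; a segment
	 * with no payload or with URG, SYN, RST or FIN set is never merged. */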
	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);

	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	sfxge_rx_deliver(rxq, rx_buf);
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
	}
	c->next_buf.mbuf = NULL;

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;

	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
/* Process mbuf and decide whether to dispatch it to the stack now or
 * defer it until later so that further segments can be merged into it.
 */
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;

		l3_proto = eh->ether_type;

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))

		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))

		if ((c->source - th->th_sport) | (c->dest - th->th_dport))

		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;

				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))

				struct ip6_hdr *c_iph, *iph = nh;

				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))

			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,

		c->next_buf = *rx_buf;

		rx_buf->flags = EFX_DISCARD;

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);

	sfxge_rx_deliver(rxq, rx_buf);
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
				sfxge_lro_deliver(st, c);
		LIST_REMOVE(c, active_link);

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
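	/* NB: idle connections are purged at most once per tick. */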
#else	/* !SFXGE_LRO */

sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)

sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)

#endif	/* SFXGE_LRO */
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	struct sfxge_evq *evq;
	unsigned int completed;
	struct sfxge_rx_sw_desc *prev = NULL;

	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
			if (~if_capenable & IFCAP_RXCSUM)
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;

			/* Check for loopback packets */
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

			    ("Rx descriptor with both IPv4 and IPv6 flags"));

		/* Pass packet up the stack or into LRO (pipelined) */
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
			sfxge_rx_deliver(rxq, prev);

		/* Return the packet to the pool */
		rx_desc->mbuf = NULL;

	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
	     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
		sfxge_lro(rxq, prev);
		sfxge_rx_deliver(rxq, prev);

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			rxq->flush_state = SFXGE_FLUSH_FAILED;

			/* Spin for 100 ms */

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)

		} while (++count < 20);

	if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
		/* Flush timeout - neither done nor failed */
		log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;

	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);

sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];

	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

sfxge_rx_stop(struct sfxge_softc *sc)
	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
sfxge_rx_start(struct sfxge_softc *sc)
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;
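	/*
	 * NB: with a standard 1500-byte MTU the reserved size (MAC PDU plus
	 * Rx prefix, alignment padding and a cache line of slack) normally
	 * fits in the 2 KB MCLBYTES zone; jumbo MTUs fall through to the
	 * page-sized or 9/16 KB jumbo zones.
	 */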
	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)

	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);

	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	efx_rx_fini(sc->enp);
static void sfxge_lro_init(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);

	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);

sfxge_lro_init(struct sfxge_rxq *rxq)

sfxge_lro_fini(struct sfxge_rxq *rxq)

#endif	/* SFXGE_LRO */

sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
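	/* Refills are requested from sfxge_rx_qcomplete() whenever the fill
	 * level drops below this threshold (90% of the queue limit). */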
	sc->rxq[index] = rxq;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;
static const struct {

} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)

sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));

sfxge_rx_stat_init(struct sfxge_softc *sc)
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		    OID_AUTO, sfxge_rx_stats[id].name,
		    CTLTYPE_UINT|CTLFLAG_RD,
		    sc, id, sfxge_rx_stat_handler, "IU",

sfxge_rx_fini(struct sfxge_softc *sc)
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

sfxge_rx_init(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)

	sfxge_rx_stat_init(sc);

	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

fail_lro_table_size: