/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"
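/* Refill the queue whenever its fill level drops below 90% of the limit. */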
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");
/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");
/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN	0x4000
#define	SFXGE_LRO_L2_ID_IPV6	0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
                                   const struct in6_addr *right)
{
#if LONG_BIT == 64
        const uint64_t *left64 = (const uint64_t *)left;
        const uint64_t *right64 = (const uint64_t *)right;

        return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
        return (left->s6_addr32[0] - right->s6_addr32[0]) |
               (left->s6_addr32[1] - right->s6_addr32[1]) |
               (left->s6_addr32[2] - right->s6_addr32[2]) |
               (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */
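/*
 * RX queue flush acknowledgements, called from event queue processing
 * when the hardware reports that a queue flush has completed or failed.
 */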
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_FAILED;
}
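/*
 * Toeplitz hash key for receive-side scaling.  With the RSS option the
 * kernel's key is fetched in sfxge_rx_start(); otherwise a fixed
 * default key is used.
 */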
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
        0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
        0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
        0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
        0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
        0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
static void
sfxge_rx_post_refill(void *arg)
{
        struct sfxge_rxq *rxq = arg;
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        uint16_t magic;

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];
        magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

        /* This is guaranteed due to the start/stop order of rx and ev */
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq not started"));
        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));
        efx_ev_qpost(evq->common, magic);
}
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
        /* Initially retry after 100 ms, but back off in case of
         * repeated failures as we probably have to wait for the
         * administrator to raise the pool limit. */
        if (retrying)
                rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
        else
                rxq->refill_delay = hz / 10;

        callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
            sfxge_rx_post_refill, rxq);
}
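/* Number of mbuf DMA addresses posted to the hardware per efx_rx_qpost() */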
#define	SFXGE_REFILL_BATCH	64
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];

        prefetch_read_many(sc->enp);
        prefetch_read_many(rxq->common);

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        rxfill = rxq->added - rxq->completed;
        KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
            ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
        ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
        KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
            ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

        if (ntodo == 0)
                return;

        batch = 0;
        mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
        while (ntodo-- > 0) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;
                bus_dma_segment_t seg;
                struct mbuf *m;

                id = (rxq->added + batch) & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

                rx_desc->flags = EFX_DISCARD;
                m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
                    sc->rx_cluster_size);
                if (m == NULL)
                        break;

                /* m_len specifies length of area to be mapped for DMA */
                m->m_len = mblksize;
                m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
                    CACHE_LINE_SIZE);
                m->m_data += sc->rx_buffer_align;

                sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
                addr[batch++] = seg.ds_addr;

                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(rxq->common, addr, mblksize, batch,
                            rxq->completed, rxq->added);
                        rxq->added += batch;
                        batch = 0;
                }
        }

        if (ntodo != 0)
                sfxge_rx_schedule_refill(rxq, retrying);

        if (batch != 0) {
                efx_rx_qpost(rxq->common, addr, mblksize, batch,
                    rxq->completed, rxq->added);
                rxq->added += batch;
        }

        /* Make the descriptors visible to the hardware */
        bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
            BUS_DMASYNC_PREWRITE);

        efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

        /* The queue could still be empty if no descriptors were actually
         * pushed, in which case there will be no event to cause the next
         * refill, so we must schedule a refill ourselves.
         */
        if (rxq->pushed == rxq->completed) {
                sfxge_rx_schedule_refill(rxq, retrying);
        }
}
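/* Refill the queue to its limit; invoked via the refill software event. */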
static void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        /* Make sure the queue is full */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
        struct ifnet *ifp = sc->ifnet;

        m->m_pkthdr.rcvif = ifp;
        m->m_pkthdr.csum_data = 0xffff;
        ifp->if_input(ifp, m);
}
static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_desc->mbuf;
        int flags = rx_desc->flags;
        int csum_flags;

        /* Convert checksum flags */
        csum_flags = (flags & EFX_CKSUM_IPV4) ?
            (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
        if (flags & EFX_CKSUM_TCPUDP)
                csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

        if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                m->m_pkthdr.flowid =
                    efx_pseudo_hdr_hash_get(rxq->common,
                        EFX_RX_HASHALG_TOEPLITZ,
                        mtod(m, uint8_t *));
                /* The hash covers a 4-tuple for TCP only */
                M_HASHTYPE_SET(m,
                    (flags & EFX_PKT_IPV4) ?
                    ((flags & EFX_PKT_TCP) ?
                    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
                    ((flags & EFX_PKT_TCP) ?
                    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
        }
        m->m_data += sc->rx_prefix_size;
        m->m_len = rx_desc->size - sc->rx_prefix_size;
        m->m_pkthdr.len = m->m_len;
        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, rx_desc->mbuf);

        rx_desc->flags = EFX_DISCARD;
        rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO
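/*
 * Deliver a coalesced LRO packet: rewrite the synthesized IP and TCP
 * headers, set the checksum and RSS hash metadata, and pass the mbuf
 * chain to the interface input routine.
 */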
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
        struct sfxge_softc *sc = st->sc;
        struct mbuf *m = c->mbuf;
        struct tcphdr *c_th;
        int csum_flags;

        KASSERT(m, ("no mbuf to deliver"));

        ++st->n_bursts;

        /* Finish off packet munging and recalculate IP header checksum. */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len = htons(iph->ip_len);
                iph->ip_sum = 0;
                iph->ip_sum = in_cksum_hdr(iph);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                    CSUM_IP_CHECKED | CSUM_IP_VALID);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen = htons(iph->ip6_plen);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        }

        c_th->th_win = c->th_last->th_win;
        c_th->th_ack = c->th_last->th_ack;
        if (c_th->th_off == c->th_last->th_off) {
                /* Copy TCP options (take care to avoid going negative). */
                int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
                memcpy(c_th + 1, c->th_last + 1, optlen);
        }

        m->m_pkthdr.flowid = c->conn_hash;
        M_HASHTYPE_SET(m,
            SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
            M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, m);

        c->mbuf = NULL;
        c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        unsigned bucket;

        KASSERT(!c->mbuf, ("found orphaned mbuf"));

        if (c->next_buf.mbuf != NULL) {
                sfxge_rx_deliver(rxq, &c->next_buf);
                LIST_REMOVE(c, active_link);
        }

        bucket = c->conn_hash & rxq->lro.conns_mask;
        KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
        --rxq->lro.conns_n[bucket];
        TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
        TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
        struct sfxge_lro_conn *c;
        unsigned i;

        KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
            ("found active connections"));

        rxq->lro.last_purge_ticks = now;
        for (i = 0; i <= rxq->lro.conns_mask; ++i) {
                if (TAILQ_EMPTY(&rxq->lro.conns[i]))
                        continue;

                c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
                if (now - c->last_pkt_ticks > lro_idle_ticks) {
                        ++rxq->lro.n_drop_idle;
                        sfxge_lro_drop(rxq, c);
                }
        }
}
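/*
 * Merge a new in-order segment into an existing LRO chain, updating
 * the coalesced IP and TCP headers to cover the added payload.
 */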
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
{
        struct tcphdr *c_th;

        /* Tack the new mbuf onto the chain. */
        KASSERT(!mbuf->m_next, ("mbuf already chained"));
        c->mbuf_tail->m_next = mbuf;
        c->mbuf_tail = mbuf;

        /* Increase length appropriately */
        c->mbuf->m_pkthdr.len += mbuf->m_len;

        /* Update the connection state flags */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        }
        c_th->th_flags |= (th->th_flags & TH_PUSH);
        c->th_last = th;

        ++st->n_merges;

        /* Pass packet up now if another segment could overflow the IP
         * length.
         */
        if (c->mbuf->m_pkthdr.len > 65536 - 9200)
                sfxge_lro_deliver(st, c);
}
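/* Begin a new coalesced chain with this packet as its head. */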
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
        /* Start the chain */
        c->mbuf = mbuf;
        c->mbuf_tail = c->mbuf;
        c->nh = nh;
        c->th_last = th;

        mbuf->m_pkthdr.len = mbuf->m_len;

        /* Mangle header fields for later processing */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = nh;
                iph->ip_len = ntohs(iph->ip_len);
        } else {
                struct ip6_hdr *iph = nh;
                iph->ip6_plen = ntohs(iph->ip6_plen);
        }
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
        char *eh = c->next_eh;
        int data_length, hdr_length, dont_merge;
        unsigned th_seq, pkt_length;
        struct tcphdr *th;
        unsigned now;

        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
        } else {
                struct ip6_hdr *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
        }

        hdr_length = (char *) th + th->th_off * 4 - eh;
        data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
            hdr_length);
        th_seq = ntohl(th->th_seq);
        dont_merge = ((data_length <= 0)
            | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

        /* Check for options other than aligned timestamp. */
        if (th->th_off != 5) {
                const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
                if (th->th_off == 8 &&
                    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
                        (TCPOPT_NOP << 16) |
                        (TCPOPT_TIMESTAMP << 8) |
                        TCPOLEN_TIMESTAMP)) {
                        /* timestamp option -- okay */
                } else {
                        dont_merge = 1;
                }
        }

        if (__predict_false(th_seq != c->next_seq)) {
                /* Out-of-order, so start counting again. */
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                c->n_in_order_pkts -= lro_loss_packets;
                c->next_seq = th_seq + data_length;
                ++rxq->lro.n_misorder;
                goto deliver_buf_out;
        }
        c->next_seq = th_seq + data_length;

        now = ticks;
        if (now - c->last_pkt_ticks > lro_idle_ticks) {
                ++rxq->lro.n_drop_idle;
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                sfxge_lro_drop(rxq, c);
                return (0);
        }
        c->last_pkt_ticks = ticks;

        if (c->n_in_order_pkts < lro_slow_start_packets) {
                /* May be in slow-start, so don't merge. */
                ++rxq->lro.n_slow_start;
                ++c->n_in_order_pkts;
                goto deliver_buf_out;
        }

        if (__predict_false(dont_merge)) {
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                if (th->th_flags & (TH_FIN | TH_RST)) {
                        ++rxq->lro.n_drop_closed;
                        sfxge_lro_drop(rxq, c);
                        return (0);
                }
                goto deliver_buf_out;
        }

        rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

        if (__predict_true(c->mbuf != NULL)) {
                /* Remove headers and any padding */
                rx_buf->mbuf->m_data += hdr_length;
                rx_buf->mbuf->m_len = data_length;

                sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
        } else {
                /* Remove any padding */
                rx_buf->mbuf->m_len = pkt_length;

                sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
        }

        rx_buf->mbuf = NULL;
        return (1);

 deliver_buf_out:
        sfxge_rx_deliver(rxq, rx_buf);
        return (1);
}
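/*
 * Start tracking a new connection for LRO, reusing a descriptor from
 * the free list when one is available.
 */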
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
                               uint16_t l2_id, void *nh, struct tcphdr *th)
{
        unsigned bucket = conn_hash & st->conns_mask;
        struct sfxge_lro_conn *c;

        if (st->conns_n[bucket] >= lro_chain_max) {
                ++st->n_too_many;
                return;
        }

        if (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
        } else {
                c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
                if (c == NULL)
                        return;
                c->mbuf = NULL;
                c->next_buf.mbuf = NULL;
        }

        /* Create the connection tracking data */
        ++st->conns_n[bucket];
        TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
        c->l2_id = l2_id;
        c->conn_hash = conn_hash;
        c->source = th->th_sport;
        c->dest = th->th_dport;
        c->n_in_order_pkts = 0;
        c->last_pkt_ticks = *(volatile int *)&ticks;
        c->delivered = 0;
        ++st->n_new_stream;
        /* NB. We don't initialise c->next_seq, and it doesn't matter what
         * value it has.  Most likely the next packet received for this
         * connection will not match -- no harm done.
         */
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_buf->mbuf;
        struct ether_header *eh;
        struct sfxge_lro_conn *c;
        uint16_t l2_id;
        uint16_t l3_proto;
        void *nh;
        struct tcphdr *th;
        uint32_t conn_hash;
        unsigned bucket;

        /* Get the hardware hash */
        conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
            EFX_RX_HASHALG_TOEPLITZ,
            mtod(m, uint8_t *));

        eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
                l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
                    SFXGE_LRO_L2_ID_VLAN;
                l3_proto = veh->evl_proto;
                nh = veh + 1;
        } else {
                l2_id = 0;
                l3_proto = eh->ether_type;
                nh = eh + 1;
        }

        /* Check whether this is a suitable packet (unfragmented
         * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
         * length, and compute a hash if necessary.  If not, return.
         */
        if (l3_proto == htons(ETHERTYPE_IP)) {
                struct ip *iph = nh;

                KASSERT(iph->ip_p == IPPROTO_TCP,
                    ("IPv4 protocol is not TCP, but packet marker is set"));
                if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
                    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
                        goto deliver_now;
                th = (struct tcphdr *)(iph + 1);
        } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
                struct ip6_hdr *iph = nh;

                KASSERT(iph->ip6_nxt == IPPROTO_TCP,
                    ("IPv6 next header is not TCP, but packet marker is set"));
                l2_id |= SFXGE_LRO_L2_ID_IPV6;
                th = (struct tcphdr *)(iph + 1);
        } else {
                goto deliver_now;
        }

        bucket = conn_hash & rxq->lro.conns_mask;

        TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
                if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
                        continue;
                if ((c->source - th->th_sport) | (c->dest - th->th_dport))
                        continue;
                if (c->mbuf != NULL) {
                        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                                struct ip *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
                                    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
                                        continue;
                        } else {
                                struct ip6_hdr *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
                                    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
                                        continue;
                        }
                }

                /* Re-insert at head of list to reduce lookup time. */
                TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
                TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

                if (c->next_buf.mbuf != NULL) {
                        if (!sfxge_lro_try_merge(rxq, c))
                                goto deliver_now;
                } else {
                        LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
                            active_link);
                }
                c->next_buf = *rx_buf;
                c->next_eh = eh;
                c->next_nh = nh;

                rx_buf->mbuf = NULL;
                rx_buf->flags = EFX_DISCARD;
                return;
        }

        sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
        sfxge_rx_deliver(rxq, rx_buf);
}
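/*
 * At the end of an event batch, deliver or re-buffer the packets held
 * on active connections and periodically purge idle connections.
 */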
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned t;

        while (!LIST_EMPTY(&st->active_conns)) {
                c = LIST_FIRST(&st->active_conns);
                if (!c->delivered && c->mbuf != NULL)
                        sfxge_lro_deliver(st, c);
                if (sfxge_lro_try_merge(rxq, c)) {
                        if (c->mbuf != NULL)
                                sfxge_lro_deliver(st, c);
                        LIST_REMOVE(c, active_link);
                }
                c->delivered = 0;
        }

        t = *(volatile int *)&ticks;
        if (__predict_false(t != st->last_purge_ticks))
                sfxge_lro_purge_idle(rxq, t);
}
#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
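/*
 * Process receive completions: fix up checksum flags and packet
 * lengths, then pass each packet into LRO or straight up the stack.
 */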
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
        struct sfxge_softc *sc = rxq->sc;
        int if_capenable = sc->ifnet->if_capenable;
        int lro_enabled = if_capenable & IFCAP_LRO;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int completed;
        unsigned int level;
        struct mbuf *m;
        struct sfxge_rx_sw_desc *prev = NULL;

        index = rxq->index;
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        completed = rxq->completed;
        while (completed != rxq->pending) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;

                id = completed++ & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                m = rx_desc->mbuf;

                if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                        goto discard;

                if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
                        goto discard;

                /* Read the length from the pseudo header if required */
                if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
                        uint16_t tmp_size;
                        int rc;

                        rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
                            mtod(m, uint8_t *), &tmp_size);
                        KASSERT(rc == 0, ("cannot get packet length: %d", rc));
                        rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
                }

                prefetch_read_many(mtod(m, caddr_t));

                switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                case EFX_PKT_IPV4:
                        if (~if_capenable & IFCAP_RXCSUM)
                                rx_desc->flags &=
                                    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
                        break;
                case EFX_PKT_IPV6:
                        if (~if_capenable & IFCAP_RXCSUM_IPV6)
                                rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
                        break;
                case 0:
                        /* Check for loopback packets */
                        {
                                struct ether_header *etherhp;

                                /*LINTED*/
                                etherhp = mtod(m, struct ether_header *);

                                if (etherhp->ether_type ==
                                    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                                        EFSYS_PROBE(loopback);

                                        rxq->loopback++;
                                        goto discard;
                                }
                        }
                        break;
                default:
                        KASSERT(B_FALSE,
                            ("Rx descriptor with both IPv4 and IPv6 flags"));
                        goto discard;
                }

                /* Pass packet up the stack or into LRO (pipelined) */
                if (prev != NULL) {
                        if (lro_enabled &&
                            ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                             (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                                sfxge_lro(rxq, prev);
                        else
                                sfxge_rx_deliver(rxq, prev);
                }
                prev = rx_desc;
                continue;

discard:
                /* Return the packet to the pool */
                m_free(m);
                rx_desc->mbuf = NULL;
        }
        rxq->completed = completed;

        level = rxq->added - rxq->completed;

        /* Pass last packet up the stack or into LRO */
        if (prev != NULL) {
                if (lro_enabled &&
                    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                        sfxge_lro(rxq, prev);
                else
                        sfxge_rx_deliver(rxq, prev);
        }

        /*
         * If there are any pending flows and this is the end of the
         * poll then they must be completed.
         */
        if (eop)
                sfxge_lro_end_of_burst(rxq);

        /* Top up the queue if necessary */
        if (level < rxq->refill_threshold)
                sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
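/*
 * Stop a started RX queue: flush it (retrying on failure or timeout),
 * drain outstanding descriptors and destroy the common code queue.
 */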
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        unsigned int count;
        unsigned int retry = 3;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK(evq);

        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        callout_stop(&rxq->refill_callout);

        while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
                rxq->flush_state = SFXGE_FLUSH_PENDING;

                SFXGE_EVQ_UNLOCK(evq);

                /* Flush the receive queue */
                if (efx_rx_qflush(rxq->common) != 0) {
                        SFXGE_EVQ_LOCK(evq);
                        rxq->flush_state = SFXGE_FLUSH_FAILED;
                        break;
                }

                count = 0;
                do {
                        /* Spin for 100 ms */
                        DELAY(100000);

                        if (rxq->flush_state != SFXGE_FLUSH_PENDING)
                                break;

                } while (++count < 20);

                SFXGE_EVQ_LOCK(evq);

                if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
                        /* Flush timeout - neither done nor failed */
                        log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
                            device_get_nameunit(sc->dev), index);
                        rxq->flush_state = SFXGE_FLUSH_DONE;
                }
                retry--;
        }
        if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
                log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
                    device_get_nameunit(sc->dev), index);
                rxq->flush_state = SFXGE_FLUSH_DONE;
        }

        rxq->pending = rxq->added;
        sfxge_rx_qcomplete(rxq, B_TRUE);

        KASSERT(rxq->completed == rxq->pending,
            ("rxq->completed != rxq->pending"));

        rxq->added = 0;
        rxq->pushed = 0;
        rxq->pending = 0;
        rxq->completed = 0;

        /* Destroy the common code receive queue. */
        efx_rx_qdestroy(rxq->common);

        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));

        SFXGE_EVQ_UNLOCK(evq);
}
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        efsys_mem_t *esmp;
        struct sfxge_evq *evq;
        int rc;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        esmp = &rxq->mem;
        evq = sc->evq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq->init_state != SFXGE_EVQ_STARTED"));

        /* Program the buffer table. */
        if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
            EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
                return (rc);

        /* Create the common code receive queue. */
        if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
            esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
            evq->common, &rxq->common)) != 0)
                goto fail;

        SFXGE_EVQ_LOCK(evq);

        /* Enable the receive queue. */
        efx_rx_qenable(rxq->common);

        rxq->init_state = SFXGE_RXQ_STARTED;
        rxq->flush_state = SFXGE_FLUSH_REQUIRED;

        /* Try to fill the queue from the pool. */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

        SFXGE_EVQ_UNLOCK(evq);

        return (0);

fail:
        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));
        return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
        int index;

        efx_mac_filter_default_rxq_clear(sc->enp);

        /* Stop the receive queue(s) */
        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

        sc->rx_prefix_size = 0;
        sc->rx_buffer_size = 0;

        efx_rx_fini(sc->enp);
}
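/*
 * Bring up the common RX module, size the packet buffers, program the
 * RSS indirection table and hash key, and start all receive queues.
 */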
int
sfxge_rx_start(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        const efx_nic_cfg_t *encp;
        size_t hdrlen, align, reserved;
        int index;
        int rc;

        intr = &sc->intr;

        /* Initialize the common code receive module. */
        if ((rc = efx_rx_init(sc->enp)) != 0)
                return (rc);

        encp = efx_nic_cfg_get(sc->enp);
        sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

        /* Calculate the receive packet buffer size. */
        sc->rx_prefix_size = encp->enc_rx_prefix_size;

        /* Ensure IP headers are 32bit aligned */
        hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
        sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

        sc->rx_buffer_size += sc->rx_buffer_align;

        /* Align end of packet buffer for RX DMA end padding */
        align = MAX(1, encp->enc_rx_buf_align_end);
        EFSYS_ASSERT(ISP2(align));
        sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

        /*
         * Standard mbuf zones only guarantee pointer-size alignment;
         * we need extra space to align to the cache line
         */
        reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

        /* Select zone for packet buffers */
        if (reserved <= MCLBYTES)
                sc->rx_cluster_size = MCLBYTES;
        else if (reserved <= MJUMPAGESIZE)
                sc->rx_cluster_size = MJUMPAGESIZE;
        else if (reserved <= MJUM9BYTES)
                sc->rx_cluster_size = MJUM9BYTES;
        else
                sc->rx_cluster_size = MJUM16BYTES;

        /*
         * Set up the scale table.  Enable all hash types and hash insertion.
         */
        for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
                sc->rx_indir_table[index] =
                    rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
                sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
        if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            sc->rx_indir_table, nitems(sc->rx_indir_table))) != 0)
                goto fail;
        (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            EFX_RX_HASHALG_TOEPLITZ,
            EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
            EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
        rss_getkey(toep_key);
#endif
        if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            toep_key, sizeof(toep_key))) != 0)
                goto fail;

        /* Start the receive queue(s). */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qstart(sc, index)) != 0)
                        goto fail2;
        }

        rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
            sc->intr.n_alloc > 1);
        if (rc != 0)
                goto fail2;

        return (0);

fail2:
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

fail:
        efx_rx_fini(sc->enp);

        return (rc);
}
#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        unsigned i;

        st->conns_mask = lro_table_size - 1;
        KASSERT(!((st->conns_mask + 1) & st->conns_mask),
            ("lro_table_size must be a power of 2"));
        st->sc = rxq->sc;
        st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
            M_SFXGE, M_WAITOK);
        st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
            M_SFXGE, M_ZERO | M_WAITOK);
        for (i = 0; i <= st->conns_mask; ++i) {
                TAILQ_INIT(&st->conns[i]);
                st->conns_n[i] = 0;
        }
        LIST_INIT(&st->active_conns);
        TAILQ_INIT(&st->free_conns);
}
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned i;

        /* Return cleanly if sfxge_lro_init() has not been called. */
        if (st->conns == NULL)
                return;

        KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

        for (i = 0; i <= st->conns_mask; ++i) {
                while (!TAILQ_EMPTY(&st->conns[i])) {
                        c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
                        sfxge_lro_drop(rxq, c);
                }
        }

        while (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
                KASSERT(!c->mbuf, ("found orphaned mbuf"));
                free(c, M_SFXGE);
        }

        free(st->conns_n, M_SFXGE);
        free(st->conns, M_SFXGE);
        st->conns = NULL;
}
#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;

        rxq = sc->rxq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

        /* Free the context array and the flow table. */
        free(rxq->queue, M_SFXGE);
        sfxge_lro_fini(rxq);

        /* Release DMA memory. */
        sfxge_dma_free(&rxq->mem);

        sc->rxq[index] = NULL;

        free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        efsys_mem_t *esmp;
        int rc;

        KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

        rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
        rxq->sc = sc;
        rxq->index = index;
        rxq->entries = sc->rxq_entries;
        rxq->ptr_mask = rxq->entries - 1;
        rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

        sc->rxq[index] = rxq;
        esmp = &rxq->mem;

        evq = sc->evq[index];

        /* Allocate and zero DMA space. */
        if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
                return (rc);

        /* Allocate buffer table entries. */
        sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
            &rxq->buf_base_id);

        /* Allocate the context array and the flow table. */
        rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
            M_SFXGE, M_WAITOK | M_ZERO);
        sfxge_lro_init(rxq);

        callout_init(&rxq->refill_callout, 1);

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        return (0);
}
static const struct {
        const char *name;
        size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
        SFXGE_RX_STAT(lro_merges, lro.n_merges),
        SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
        SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
        SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
        SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
        SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
        SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
        SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
        struct sfxge_softc *sc = arg1;
        unsigned int id = arg2;
        unsigned int sum, index;

        /* Sum across all RX queues */
        sum = 0;
        for (index = 0; index < sc->rxq_count; index++)
                sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
                    sfxge_rx_stats[id].offset);

        return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
        struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
        struct sysctl_oid_list *stat_list;
        unsigned int id;

        stat_list = SYSCTL_CHILDREN(sc->stats_node);

        for (id = 0; id < nitems(sfxge_rx_stats); id++) {
                SYSCTL_ADD_PROC(ctx, stat_list,
                    OID_AUTO, sfxge_rx_stats[id].name,
                    CTLTYPE_UINT|CTLFLAG_RD,
                    sc, id, sfxge_rx_stat_handler, "IU",
                    "");
        }
}
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
        int index;

        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;
}
int
sfxge_rx_init(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        int index;
        int rc;

#ifdef SFXGE_LRO
        if (!ISP2(lro_table_size)) {
                log(LOG_ERR, "%s=%u must be power of 2",
                    SFXGE_LRO_PARAM(table_size), lro_table_size);
                rc = EINVAL;
                goto fail_lro_table_size;
        }

        if (lro_idle_ticks == 0)
                lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

        intr = &sc->intr;

        sc->rxq_count = intr->n_alloc;

        KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
            ("intr->state != SFXGE_INTR_INITIALIZED"));

        /* Initialize the receive queue(s) - one per interrupt. */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qinit(sc, index)) != 0)
                        goto fail;
        }

        sfxge_rx_stat_init(sc);

        return (0);

fail:
        /* Tear down the receive queue(s). */
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
        return (rc);
}