/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;

	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */
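
/*
 * Callbacks from event processing: record the outcome of an RX queue flush
 * request so that sfxge_rx_qstop() can observe it.
 */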
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
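
/*
 * Toeplitz key used for receive-side scaling: taken from the kernel RSS
 * subsystem when the RSS option is enabled, otherwise a fixed default key.
 */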
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
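
/*
 * Callout handler: post a software event to the owning event queue so that
 * the deferred refill is retried in event-processing context.
 */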
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}
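
/*
 * Bring the RX queue up towards the requested fill level: allocate mbuf
 * clusters, map them for DMA and post them to the hardware in batches of up
 * to SFXGE_REFILL_BATCH descriptors.  If allocation fails, a retry is
 * scheduled via the refill callout.
 */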
#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;
		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}
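
/* Refill the RX queue to its limit; used when a deferred refill was scheduled. */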
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
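
/* Hand a completed packet to the network stack via the interface input routine. */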
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}
static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_pseudo_hdr_hash_get(rxq->common,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO
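
/*
 * Deliver a coalesced LRO packet: finish rewriting the IP and TCP headers
 * of the merged chain and pass it up to the stack.
 */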
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
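
/*
 * Append a new in-order segment to an existing LRO chain, updating the IP
 * length and TCP flags of the coalesced packet.
 */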
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}
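
/* Begin a new LRO chain with this mbuf as the first (header-bearing) segment. */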
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}
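
/*
 * Allocate (or recycle from the free list) a connection-tracking entry and
 * insert it at the head of its hash bucket.
 */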
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}
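
/*
 * At the end of an event-queue poll, push out any held packets and, if the
 * tick has changed, purge idle connection state.
 */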
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
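
/*
 * Process received packets up to the current pending index: convert hardware
 * checksum and hash results, filter out discards and loopback packets, and
 * pass each packet into LRO or directly to the stack.
 */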
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
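
/*
 * Stop an RX queue: request a hardware flush, wait briefly for it to
 * complete or fail, then reclaim any buffers still on the ring and destroy
 * the common-code queue.
 */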
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;
		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
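
/*
 * Bring up the receive path: size the packet buffers, program the RSS
 * indirection table and hash key, start every RX queue and make queue 0 the
 * default filter target.
 */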
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}
#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
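
/* Tear down the software state for one RX queue: context array, LRO state
 * and DMA memory. */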
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}
int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}