 * Copyright (c) 2010-2015 Solarflare Communications Inc.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>
#include <net/ethernet.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10)
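/*
 * The refill threshold is 90% of the queue's usable size: once the number
 * of posted-but-not-completed buffers falls below this level, the queue is
 * topped back up towards EFX_RXQ_LIMIT(_entries).  (EFX_RXQ_LIMIT() is
 * slightly smaller than the ring size itself so that the hardware never
 * sees a completely full ring.)
 */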
SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
    "Large receive offload (LRO) parameters");

#define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    "Size of the LRO hash table (must be a power of 2)");
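/*
 * Because the table size is a power of 2, a connection's bucket can be
 * chosen with a mask instead of a modulo, for example:
 *
 *	bucket = conn_hash & (lro_table_size - 1);
 *
 * which is exactly how conns_mask is computed and used by the LRO code
 * below.
 */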
/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    "The maximum length of a hash chain");
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
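/*
 * l2_id packs the 12-bit VLAN ID (zero for untagged traffic) into the bits
 * covered by EVL_VLID_MASK and uses the two flag bits defined above, which
 * sit outside that mask, to record whether the connection is
 * VLAN-encapsulated and whether it is TCP/IPv6 rather than TCP/IPv4.
 */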
/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
    const struct in6_addr *right)
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	    (left->s6_addr32[1] - right->s6_addr32[1]) |
	    (left->s6_addr32[2] - right->s6_addr32[2]) |
	    (left->s6_addr32[3] - right->s6_addr32[3]);
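/*
 * The two return statements above are alternative bodies selected by a
 * preprocessor conditional in the full source: a 64-bit path comparing the
 * address as two 64-bit words and a 32-bit path comparing four 32-bit
 * words.  Either way the result is zero if and only if the addresses are
 * equal; this is a branch-free equality test rather than an ordering
 * comparison, and callers only ever test the result against zero.
 */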
#endif /* SFXGE_LRO */

sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_DONE;

sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_FAILED;

static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
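/*
 * This appears to be the well-known 40-byte example Toeplitz key from the
 * Microsoft RSS specification.  It is programmed into the adapter by
 * sfxge_rx_start() via efx_rx_scale_key_set(), so the same hash the
 * hardware uses for receive-side scaling is also available for LRO
 * connection lookup.
 */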
sfxge_rx_post_refill(void *arg)
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;

	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);

sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
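	/*
	 * The two refill_delay assignments above are the branches of an
	 * if/else on the "retrying" argument (the conditional itself is not
	 * shown here): the first attempt is retried hz/10 ticks (100 ms)
	 * later, and each subsequent retry doubles the delay up to a cap of
	 * 10 * hz (10 seconds).
	 */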
#define SFXGE_REFILL_BATCH 64

sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;
	unsigned int mblksize;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);

		/* m_len specifies length of area to be mapped for DMA */
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);

	sfxge_rx_schedule_refill(rxq, retrying);

	efx_rx_qpost(rxq->common, addr, mblksize, batch,
	    rxq->completed, rxq->added);

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
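	/*
	 * Buffer addresses are accumulated in addr[] and handed to the
	 * common code in batches of SFXGE_REFILL_BATCH (64) descriptors; any
	 * partial batch left when the loop exits is posted by the second
	 * efx_rx_qpost() call, and efx_rx_qpush() then tells the hardware
	 * about everything added so far.  If nothing could be pushed at all
	 * (for example because mbuf cluster allocation failed), the driver
	 * falls back to the callout-driven retry scheduled by
	 * sfxge_rx_schedule_refill() above.
	 */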
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);

sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		efx_psuedo_hdr_hash_get(sc->enp,
		    EFX_RX_HASHALG_TOEPLITZ,
		/* The hash covers a 4-tuple for TCP only */
		    (flags & EFX_PKT_IPV4) ?
		    ((flags & EFX_PKT_TCP) ?
		    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
		    ((flags & EFX_PKT_TCP) ?
		    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));

	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;

	KASSERT(m, ("no mbuf to deliver"));

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);

	m->m_pkthdr.flowid = c->conn_hash;
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
	    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);

/* Stop tracking connections that have gone idle in order to keep hash
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
	struct sfxge_lro_conn *c;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	c_th->th_flags |= (th->th_flags & TH_PUSH);

	/* Pass packet up now if another segment could overflow the IP
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
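	/*
	 * The 65536 - 9200 test above guards the 16-bit IP length fields:
	 * once the coalesced packet could no longer absorb another
	 * maximum-sized (9200-byte class jumbo) segment without exceeding
	 * 65535 bytes, it is delivered up the stack instead of being merged
	 * any further.
	 */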
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
	/* Start the chain */
	c->mbuf_tail = c->mbuf;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		iph->ip_len = ntohs(iph->ip_len);
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
		    (TCPOPT_TIMESTAMP << 8) |
		    TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;

	c->next_seq = th_seq + data_length;

	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);

	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;

	if (__predict_false(dont_merge)) {
		sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
		goto deliver_buf_out;

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);

	sfxge_rx_deliver(rxq->sc, rx_buf);
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
	c->next_buf.mbuf = NULL;

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;

	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
/* Process mbuf and decide whether to dispatch it to the stack now or
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
	    EFX_RX_HASHALG_TOEPLITZ,

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		l3_proto = eh->ether_type;

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
				struct ip6_hdr *c_iph, *iph = nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
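		/*
		 * The match tests above deliberately avoid conditional
		 * branches: each "(a - b) | (c - d)" expression (and the
		 * ipv6_addr_cmp() calls) is nonzero when any field differs,
		 * so a non-matching entry is rejected with a single test.
		 * Moving a matched connection to the head of its bucket
		 * keeps frequently hit streams near the front of the hash
		 * chain.
		 */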
		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
		c->next_buf = *rx_buf;

		rx_buf->flags = EFX_DISCARD;

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
	sfxge_rx_deliver(sc, rx_buf);

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			sfxge_lro_deliver(st, c);
		LIST_REMOVE(c, active_link);

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);

#else /* !SFXGE_LRO */

sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)

sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)

#endif /* SFXGE_LRO */
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	struct sfxge_evq *evq;
	unsigned int completed;
	struct sfxge_rx_sw_desc *prev = NULL;

	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
			if (~if_capenable & IFCAP_RXCSUM)
			    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;

			/* Check for loopback packets */
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);
			    ("Rx descriptor with both IPv4 and IPv6 flags"));

		/* Pass packet up the stack or into LRO (pipelined) */
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		    (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
			sfxge_rx_deliver(sc, prev);

		/* Return the packet to the pool */
		rx_desc->mbuf = NULL;

	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
	    (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
		sfxge_lro(rxq, prev);
		sfxge_rx_deliver(sc, prev);

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			rxq->flush_state = SFXGE_FLUSH_FAILED;

		/* Spin for 100 ms */
			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
		} while (++count < 20);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;

		if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
			log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
sfxge_rx_stop(struct sfxge_softc *sc)
	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
sfxge_rx_start(struct sfxge_softc *sc)
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32-bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
		sc->rx_cluster_size = MJUM16BYTES;
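	/*
	 * The cluster zone chosen is the smallest standard mbuf cluster size
	 * (2 KB, page-sized jumbo, 9 KB or 16 KB) that can hold the receive
	 * buffer plus one extra cache line, since the buffer start is later
	 * rounded up to a cache-line boundary in sfxge_rx_qfill().
	 */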
	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)

	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
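	/*
	 * The indirection table spreads flows round-robin across the receive
	 * queues: entry i points at queue (i % rxq_count), so with four
	 * queues the table simply repeats 0, 1, 2, 3 for all
	 * SFXGE_RX_SCALE_MAX entries.  The hardware selects an entry using
	 * low-order bits of the Toeplitz hash it computes over each packet's
	 * address/port tuple, and the same hash value is made available to
	 * the driver in the packet prefix.
	 */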
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
	    sc->intr.n_alloc > 1);

	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	efx_rx_fini(sc->enp);
static void sfxge_lro_init(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);

	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);

sfxge_lro_init(struct sfxge_rxq *rxq)

sfxge_lro_fini(struct sfxge_rxq *rxq)

#endif /* SFXGE_LRO */
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;
static const struct {
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }

	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)

sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
		    sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
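/*
 * Each LRO statistic is exposed as a single sysctl even though the counters
 * live per receive queue: sfxge_rx_stats[] records the offset of each
 * counter within struct sfxge_rxq, and the handler above walks every queue,
 * summing the unsigned int found at that offset, before copying the total
 * out to userland.
 */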
sfxge_rx_stat_init(struct sfxge_softc *sc)
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		    OID_AUTO, sfxge_rx_stats[id].name,
		    CTLTYPE_UINT|CTLFLAG_RD,
		    sc, id, sfxge_rx_stat_handler, "IU",

sfxge_rx_fini(struct sfxge_softc *sc)
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

sfxge_rx_init(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		goto fail_lro_table_size;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)

	sfxge_rx_stat_init(sc);

	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

fail_lro_table_size: