2 * Copyright (c) 2010-2015 Solarflare Communications Inc.
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
11 * 1. Redistributions of source code must retain the above copyright notice,
12 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
37 #include <sys/types.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/syslog.h>
43 #include <sys/limits.h>
46 #include <net/ethernet.h>
48 #include <net/if_vlan_var.h>
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/tcp.h>
55 #include <machine/in_cksum.h>
57 #include "common/efx.h"
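/* Refill an RX queue once the number of outstanding descriptors falls
 * below 90% of the queue limit.
 */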
63 #define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10)
67 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
68 "Large receive offload (LRO) parameters");
70 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
72 /* Size of the LRO hash table. Must be a power of 2. A larger table
73 * means we can accelerate a larger number of streams.
75 static unsigned lro_table_size = 128;
76 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
77 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
79 "Size of the LRO hash table (must be a power of 2)");
81 /* Maximum length of a hash chain. If chains get too long then the lookup
82 * time increases and may exceed the benefit of LRO.
84 static unsigned lro_chain_max = 20;
85 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
88 "The maximum length of a hash chain");
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
93 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
94 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
97 "The maximum time (in ticks) that a connection can be idle "
98 "before it's LRO state is discarded");
100 /* Number of packets with payload that must arrive in-order before a
101 * connection is eligible for LRO. The idea is we should avoid coalescing
102 * segments when the sender is in slow-start because reducing the ACK rate
103 * can damage performance.
105 static int lro_slow_start_packets = 2000;
106 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
107 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
108 &lro_slow_start_packets, 0,
109 "Number of packets with payload that must arrive in-order before "
110 "a connection is eligible for LRO");
112 /* Number of packets with payload that must arrive in-order following loss
113 * before a connection is eligible for LRO. The idea is we should avoid
114 * coalescing segments when the sender is recovering from loss, because
115 * reducing the ACK rate can damage performance.
117 static int lro_loss_packets = 20;
118 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
119 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
120 &lro_loss_packets, 0,
121 "Number of packets with payload that must arrive in-order "
122 "following loss before a connection is eligible for LRO");
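/* All of the above are boot-time tunables under hw.sfxge.lro (read-only
 * sysctls at run time).  As a sketch only, with example values, they
 * could be set from loader.conf:
 *
 *   hw.sfxge.lro.table_size=256
 *   hw.sfxge.lro.chain_max=40
 */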
124 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
125 #define SFXGE_LRO_L2_ID_VLAN 0x4000
126 #define SFXGE_LRO_L2_ID_IPV6 0x8000
127 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
128 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
130 /* Compare IPv6 addresses, avoiding conditional branches */
131 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
132 const struct in6_addr *right)
#if LONG_BIT == 64
const uint64_t *left64 = (const uint64_t *)left;
const uint64_t *right64 = (const uint64_t *)right;
return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
return (left->s6_addr32[0] - right->s6_addr32[0]) |
(left->s6_addr32[1] - right->s6_addr32[1]) |
(left->s6_addr32[2] - right->s6_addr32[2]) |
(left->s6_addr32[3] - right->s6_addr32[3]);
#endif
146 #endif /* SFXGE_LRO */
149 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
152 rxq->flush_state = SFXGE_FLUSH_DONE;
156 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
159 rxq->flush_state = SFXGE_FLUSH_FAILED;
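/* RSS hash key used for Toeplitz hashing; this appears to be the
 * standard 40-byte default key from the Microsoft RSS specification.
 */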
162 static uint8_t toep_key[] = {
163 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
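/* Request a refill from event-queue context: post a software "refill"
 * event so that the queue is topped up by the event handler while it
 * holds the event queue lock.
 */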
171 sfxge_rx_post_refill(void *arg)
173 struct sfxge_rxq *rxq = arg;
174 struct sfxge_softc *sc;
176 struct sfxge_evq *evq;
181 evq = sc->evq[index];
183 magic = SFXGE_MAGIC_RX_QREFILL | index;
185 /* This is guaranteed due to the start/stop order of rx and ev */
186 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
187 ("evq not started"));
188 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
189 ("rxq not started"));
190 efx_ev_qpost(evq->common, magic);
194 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
196 /* Initially retry after 100 ms, but back off in case of
197 * repeated failures as we probably have to wait for the
198 * administrator to raise the pool limit. */
200 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
202 rxq->refill_delay = hz / 10;
204 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
205 sfxge_rx_post_refill, rxq);
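/* Allocate a receive buffer: an mbuf with a packet header plus a
 * cluster from the selected rx_buffer_zone.  Returns NULL if either
 * allocation fails; callers schedule a delayed refill in that case.
 */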
208 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
213 /* Allocate mbuf structure */
214 args.flags = M_PKTHDR;
216 m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
218 /* Allocate (and attach) packet buffer */
219 if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
220 uma_zfree(zone_mbuf, m);
227 #define SFXGE_REFILL_BATCH 64
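/* Buffers are posted to the hardware ring in batches of up to
 * SFXGE_REFILL_BATCH addresses to amortise the cost of efx_rx_qpost().
 */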
230 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
232 struct sfxge_softc *sc;
234 struct sfxge_evq *evq;
237 unsigned int mblksize;
239 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
243 evq = sc->evq[index];
245 prefetch_read_many(sc->enp);
246 prefetch_read_many(rxq->common);
248 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
250 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
253 rxfill = rxq->added - rxq->completed;
254 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
255 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
256 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
257 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
264 mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
265 while (ntodo-- > 0) {
267 struct sfxge_rx_sw_desc *rx_desc;
268 bus_dma_segment_t seg;
271 id = (rxq->added + batch) & rxq->ptr_mask;
272 rx_desc = &rxq->queue[id];
273 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
275 rx_desc->flags = EFX_DISCARD;
276 m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
280 /* m_len specifies length of area to be mapped for DMA */
282 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
283 m->m_data += sc->rx_buffer_align;
285 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
286 addr[batch++] = seg.ds_addr;
288 if (batch == SFXGE_REFILL_BATCH) {
289 efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 rxq->completed, rxq->added);
297 sfxge_rx_schedule_refill(rxq, retrying);
300 efx_rx_qpost(rxq->common, addr, mblksize, batch,
301 rxq->completed, rxq->added);
305 /* Make the descriptors visible to the hardware */
306 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
307 BUS_DMASYNC_PREWRITE);
309 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
311 /* The queue could still be empty if no descriptors were actually
312 * pushed, in which case there will be no event to cause the next
313 * refill, so we must schedule a refill ourselves.
if (rxq->pushed == rxq->completed) {
316 sfxge_rx_schedule_refill(rxq, retrying);
321 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
324 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
327 /* Make sure the queue is full */
328 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
331 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
333 struct ifnet *ifp = sc->ifnet;
335 m->m_pkthdr.rcvif = ifp;
336 m->m_pkthdr.csum_data = 0xffff;
337 ifp->if_input(ifp, m);
341 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
343 struct mbuf *m = rx_desc->mbuf;
344 int flags = rx_desc->flags;
347 /* Convert checksum flags */
348 csum_flags = (flags & EFX_CKSUM_IPV4) ?
349 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
350 if (flags & EFX_CKSUM_TCPUDP)
351 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
353 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
355 efx_psuedo_hdr_hash_get(sc->enp,
356 EFX_RX_HASHALG_TOEPLITZ,
358 /* The hash covers a 4-tuple for TCP only */
360 (flags & EFX_PKT_IPV4) ?
361 ((flags & EFX_PKT_TCP) ?
362 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
363 ((flags & EFX_PKT_TCP) ?
364 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
366 m->m_data += sc->rx_prefix_size;
367 m->m_len = rx_desc->size - sc->rx_prefix_size;
368 m->m_pkthdr.len = m->m_len;
369 m->m_pkthdr.csum_flags = csum_flags;
370 __sfxge_rx_deliver(sc, rx_desc->mbuf);
372 rx_desc->flags = EFX_DISCARD;
373 rx_desc->mbuf = NULL;
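/* Complete and deliver a coalesced LRO packet: restore the network
 * byte order of the IP length, recompute the IPv4 header checksum,
 * copy the last segment's TCP window, ACK and options into the head
 * segment, then pass the chain up the stack.
 */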
379 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
381 struct sfxge_softc *sc = st->sc;
382 struct mbuf *m = c->mbuf;
386 KASSERT(m, ("no mbuf to deliver"));
390 /* Finish off packet munging and recalculate IP header checksum. */
391 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
392 struct ip *iph = c->nh;
393 iph->ip_len = htons(iph->ip_len);
395 iph->ip_sum = in_cksum_hdr(iph);
396 c_th = (struct tcphdr *)(iph + 1);
397 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
398 CSUM_IP_CHECKED | CSUM_IP_VALID);
400 struct ip6_hdr *iph = c->nh;
401 iph->ip6_plen = htons(iph->ip6_plen);
402 c_th = (struct tcphdr *)(iph + 1);
403 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
406 c_th->th_win = c->th_last->th_win;
407 c_th->th_ack = c->th_last->th_ack;
408 if (c_th->th_off == c->th_last->th_off) {
409 /* Copy TCP options (take care to avoid going negative). */
410 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
411 memcpy(c_th + 1, c->th_last + 1, optlen);
414 m->m_pkthdr.flowid = c->conn_hash;
416 SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
417 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
419 m->m_pkthdr.csum_flags = csum_flags;
420 __sfxge_rx_deliver(sc, m);
426 /* Drop the given connection, and add it to the free list. */
427 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
431 KASSERT(!c->mbuf, ("found orphaned mbuf"));
433 if (c->next_buf.mbuf != NULL) {
434 sfxge_rx_deliver(rxq->sc, &c->next_buf);
435 LIST_REMOVE(c, active_link);
438 bucket = c->conn_hash & rxq->lro.conns_mask;
439 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
440 --rxq->lro.conns_n[bucket];
441 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
442 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
448 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
450 struct sfxge_lro_conn *c;
453 KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
454 ("found active connections"));
456 rxq->lro.last_purge_ticks = now;
457 for (i = 0; i <= rxq->lro.conns_mask; ++i) {
458 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
461 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
462 if (now - c->last_pkt_ticks > lro_idle_ticks) {
463 ++rxq->lro.n_drop_idle;
464 sfxge_lro_drop(rxq, c);
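/* Append an in-order segment to an existing LRO chain: link the mbuf
 * onto the tail, grow the packet and IP length fields, propagate
 * TH_PUSH, and deliver early if another segment could overflow the
 * 64KB IP length limit.
 */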
470 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
471 struct mbuf *mbuf, struct tcphdr *th)
475 /* Tack the new mbuf onto the chain. */
476 KASSERT(!mbuf->m_next, ("mbuf already chained"));
477 c->mbuf_tail->m_next = mbuf;
480 /* Increase length appropriately */
481 c->mbuf->m_pkthdr.len += mbuf->m_len;
483 /* Update the connection state flags */
484 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
485 struct ip *iph = c->nh;
486 iph->ip_len += mbuf->m_len;
487 c_th = (struct tcphdr *)(iph + 1);
489 struct ip6_hdr *iph = c->nh;
490 iph->ip6_plen += mbuf->m_len;
491 c_th = (struct tcphdr *)(iph + 1);
493 c_th->th_flags |= (th->th_flags & TH_PUSH);
/* Pass packet up now if another segment could overflow the IP
 * packet length.
 */
500 if (c->mbuf->m_pkthdr.len > 65536 - 9200)
501 sfxge_lro_deliver(st, c);
505 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
506 struct mbuf *mbuf, void *nh, struct tcphdr *th)
508 /* Start the chain */
510 c->mbuf_tail = c->mbuf;
514 mbuf->m_pkthdr.len = mbuf->m_len;
516 /* Mangle header fields for later processing */
517 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
519 iph->ip_len = ntohs(iph->ip_len);
521 struct ip6_hdr *iph = nh;
522 iph->ip6_plen = ntohs(iph->ip6_plen);
526 /* Try to merge or otherwise hold or deliver (as appropriate) the
527 * packet buffered for this connection (c->next_buf). Return a flag
528 * indicating whether the connection is still active for LRO purposes.
531 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
533 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
534 char *eh = c->next_eh;
535 int data_length, hdr_length, dont_merge;
536 unsigned th_seq, pkt_length;
540 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
541 struct ip *iph = c->next_nh;
542 th = (struct tcphdr *)(iph + 1);
543 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
545 struct ip6_hdr *iph = c->next_nh;
546 th = (struct tcphdr *)(iph + 1);
547 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
550 hdr_length = (char *) th + th->th_off * 4 - eh;
551 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
553 th_seq = ntohl(th->th_seq);
554 dont_merge = ((data_length <= 0)
555 | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
557 /* Check for options other than aligned timestamp. */
558 if (th->th_off != 5) {
559 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
560 if (th->th_off == 8 &&
opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
564 TCPOLEN_TIMESTAMP)) {
565 /* timestamp option -- okay */
571 if (__predict_false(th_seq != c->next_seq)) {
572 /* Out-of-order, so start counting again. */
574 sfxge_lro_deliver(&rxq->lro, c);
575 c->n_in_order_pkts -= lro_loss_packets;
576 c->next_seq = th_seq + data_length;
577 ++rxq->lro.n_misorder;
578 goto deliver_buf_out;
580 c->next_seq = th_seq + data_length;
583 if (now - c->last_pkt_ticks > lro_idle_ticks) {
584 ++rxq->lro.n_drop_idle;
586 sfxge_lro_deliver(&rxq->lro, c);
587 sfxge_lro_drop(rxq, c);
590 c->last_pkt_ticks = ticks;
592 if (c->n_in_order_pkts < lro_slow_start_packets) {
593 /* May be in slow-start, so don't merge. */
594 ++rxq->lro.n_slow_start;
595 ++c->n_in_order_pkts;
596 goto deliver_buf_out;
599 if (__predict_false(dont_merge)) {
601 sfxge_lro_deliver(&rxq->lro, c);
602 if (th->th_flags & (TH_FIN | TH_RST)) {
603 ++rxq->lro.n_drop_closed;
604 sfxge_lro_drop(rxq, c);
607 goto deliver_buf_out;
610 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
612 if (__predict_true(c->mbuf != NULL)) {
613 /* Remove headers and any padding */
614 rx_buf->mbuf->m_data += hdr_length;
615 rx_buf->mbuf->m_len = data_length;
617 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
619 /* Remove any padding */
620 rx_buf->mbuf->m_len = pkt_length;
622 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
629 sfxge_rx_deliver(rxq->sc, rx_buf);
633 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
634 uint16_t l2_id, void *nh, struct tcphdr *th)
636 unsigned bucket = conn_hash & st->conns_mask;
637 struct sfxge_lro_conn *c;
639 if (st->conns_n[bucket] >= lro_chain_max) {
644 if (!TAILQ_EMPTY(&st->free_conns)) {
645 c = TAILQ_FIRST(&st->free_conns);
646 TAILQ_REMOVE(&st->free_conns, c, link);
648 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
652 c->next_buf.mbuf = NULL;
655 /* Create the connection tracking data */
656 ++st->conns_n[bucket];
657 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
659 c->conn_hash = conn_hash;
660 c->source = th->th_sport;
661 c->dest = th->th_dport;
662 c->n_in_order_pkts = 0;
663 c->last_pkt_ticks = *(volatile int *)&ticks;
666 /* NB. We don't initialise c->next_seq, and it doesn't matter what
667 * value it has. Most likely the next packet received for this
668 * connection will not match -- no harm done.
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
676 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
678 struct sfxge_softc *sc = rxq->sc;
679 struct mbuf *m = rx_buf->mbuf;
680 struct ether_header *eh;
681 struct sfxge_lro_conn *c;
689 /* Get the hardware hash */
690 conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
691 EFX_RX_HASHALG_TOEPLITZ,
694 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
695 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
696 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
697 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
698 SFXGE_LRO_L2_ID_VLAN;
699 l3_proto = veh->evl_proto;
703 l3_proto = eh->ether_type;
707 /* Check whether this is a suitable packet (unfragmented
708 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
709 * length, and compute a hash if necessary. If not, return.
711 if (l3_proto == htons(ETHERTYPE_IP)) {
714 KASSERT(iph->ip_p == IPPROTO_TCP,
715 ("IPv4 protocol is not TCP, but packet marker is set"));
716 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
717 (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
719 th = (struct tcphdr *)(iph + 1);
720 } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
721 struct ip6_hdr *iph = nh;
723 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
724 ("IPv6 next header is not TCP, but packet marker is set"));
725 l2_id |= SFXGE_LRO_L2_ID_IPV6;
726 th = (struct tcphdr *)(iph + 1);
731 bucket = conn_hash & rxq->lro.conns_mask;
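/* Walk the hash chain for this connection.  The comparisons below use
 * subtraction and bitwise OR rather than short-circuit logic so that
 * mismatches are detected without conditional branches, in the same
 * spirit as ipv6_addr_cmp() above.
 */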
733 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
734 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
736 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
738 if (c->mbuf != NULL) {
739 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
740 struct ip *c_iph, *iph = nh;
742 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
743 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
746 struct ip6_hdr *c_iph, *iph = nh;
748 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
749 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
754 /* Re-insert at head of list to reduce lookup time. */
755 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
756 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
758 if (c->next_buf.mbuf != NULL) {
759 if (!sfxge_lro_try_merge(rxq, c))
762 LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
765 c->next_buf = *rx_buf;
770 rx_buf->flags = EFX_DISCARD;
774 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
776 sfxge_rx_deliver(sc, rx_buf);
779 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
781 struct sfxge_lro_state *st = &rxq->lro;
782 struct sfxge_lro_conn *c;
785 while (!LIST_EMPTY(&st->active_conns)) {
786 c = LIST_FIRST(&st->active_conns);
787 if (!c->delivered && c->mbuf != NULL)
788 sfxge_lro_deliver(st, c);
789 if (sfxge_lro_try_merge(rxq, c)) {
791 sfxge_lro_deliver(st, c);
792 LIST_REMOVE(c, active_link);
797 t = *(volatile int *)&ticks;
798 if (__predict_false(t != st->last_purge_ticks))
799 sfxge_lro_purge_idle(rxq, t);
802 #else /* !SFXGE_LRO */
805 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
810 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
814 #endif /* SFXGE_LRO */
817 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
819 struct sfxge_softc *sc = rxq->sc;
820 int if_capenable = sc->ifnet->if_capenable;
821 int lro_enabled = if_capenable & IFCAP_LRO;
823 struct sfxge_evq *evq;
824 unsigned int completed;
827 struct sfxge_rx_sw_desc *prev = NULL;
830 evq = sc->evq[index];
832 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
834 completed = rxq->completed;
835 while (completed != rxq->pending) {
837 struct sfxge_rx_sw_desc *rx_desc;
839 id = completed++ & rxq->ptr_mask;
840 rx_desc = &rxq->queue[id];
843 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
846 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
/* Read the length from the pseudo header if required */
850 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
853 rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
856 KASSERT(rc == 0, ("cannot get packet length: %d", rc));
857 rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
860 prefetch_read_many(mtod(m, caddr_t));
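/* Clear checksum-valid flags that the stack must not rely on when the
 * corresponding receive checksum capability is disabled.
 */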
862 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
864 if (~if_capenable & IFCAP_RXCSUM)
866 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
869 if (~if_capenable & IFCAP_RXCSUM_IPV6)
870 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
873 /* Check for loopback packets */
875 struct ether_header *etherhp;
878 etherhp = mtod(m, struct ether_header *);
880 if (etherhp->ether_type ==
881 htons(SFXGE_ETHERTYPE_LOOPBACK)) {
882 EFSYS_PROBE(loopback);
891 ("Rx descriptor with both IPv4 and IPv6 flags"));
895 /* Pass packet up the stack or into LRO (pipelined) */
898 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
899 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
900 sfxge_lro(rxq, prev);
902 sfxge_rx_deliver(sc, prev);
908 /* Return the packet to the pool */
910 rx_desc->mbuf = NULL;
912 rxq->completed = completed;
914 level = rxq->added - rxq->completed;
916 /* Pass last packet up the stack or into LRO */
919 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
920 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
921 sfxge_lro(rxq, prev);
923 sfxge_rx_deliver(sc, prev);
927 * If there are any pending flows and this is the end of the
928 * poll then they must be completed.
931 sfxge_lro_end_of_burst(rxq);
933 /* Top up the queue if necessary */
934 if (level < rxq->refill_threshold)
935 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
939 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
941 struct sfxge_rxq *rxq;
942 struct sfxge_evq *evq;
944 unsigned int retry = 3;
946 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
948 rxq = sc->rxq[index];
949 evq = sc->evq[index];
953 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
954 ("rxq not started"));
956 rxq->init_state = SFXGE_RXQ_INITIALIZED;
958 callout_stop(&rxq->refill_callout);
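/* Ask the hardware to flush the queue and poll for completion,
 * retrying a few times.  If the flush neither completes nor fails in
 * time, log an error and proceed as though it had completed.
 */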
960 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
961 rxq->flush_state = SFXGE_FLUSH_PENDING;
963 SFXGE_EVQ_UNLOCK(evq);
965 /* Flush the receive queue */
966 if (efx_rx_qflush(rxq->common) != 0) {
968 rxq->flush_state = SFXGE_FLUSH_FAILED;
974 /* Spin for 100 ms */
977 if (rxq->flush_state != SFXGE_FLUSH_PENDING)
980 } while (++count < 20);
984 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
985 /* Flush timeout - neither done nor failed */
986 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
987 device_get_nameunit(sc->dev), index);
988 rxq->flush_state = SFXGE_FLUSH_DONE;
992 if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
993 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
994 device_get_nameunit(sc->dev), index);
995 rxq->flush_state = SFXGE_FLUSH_DONE;
998 rxq->pending = rxq->added;
999 sfxge_rx_qcomplete(rxq, B_TRUE);
1001 KASSERT(rxq->completed == rxq->pending,
1002 ("rxq->completed != rxq->pending"));
1010 /* Destroy the common code receive queue. */
1011 efx_rx_qdestroy(rxq->common);
1013 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1014 EFX_RXQ_NBUFS(sc->rxq_entries));
1016 SFXGE_EVQ_UNLOCK(evq);
1020 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1022 struct sfxge_rxq *rxq;
1024 struct sfxge_evq *evq;
1027 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1029 rxq = sc->rxq[index];
1031 evq = sc->evq[index];
1033 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1034 ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1035 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1036 ("evq->init_state != SFXGE_EVQ_STARTED"));
1038 /* Program the buffer table. */
1039 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1040 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1043 /* Create the common code receive queue. */
1044 if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1045 esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1046 &rxq->common)) != 0)
1049 SFXGE_EVQ_LOCK(evq);
1051 /* Enable the receive queue. */
1052 efx_rx_qenable(rxq->common);
1054 rxq->init_state = SFXGE_RXQ_STARTED;
1055 rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1057 /* Try to fill the queue from the pool. */
1058 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1060 SFXGE_EVQ_UNLOCK(evq);
1065 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1066 EFX_RXQ_NBUFS(sc->rxq_entries));
1071 sfxge_rx_stop(struct sfxge_softc *sc)
1075 efx_mac_filter_default_rxq_clear(sc->enp);
1077 /* Stop the receive queue(s) */
1078 index = sc->rxq_count;
1079 while (--index >= 0)
1080 sfxge_rx_qstop(sc, index);
1082 sc->rx_prefix_size = 0;
1083 sc->rx_buffer_size = 0;
1085 efx_rx_fini(sc->enp);
1089 sfxge_rx_start(struct sfxge_softc *sc)
1091 struct sfxge_intr *intr;
1092 const efx_nic_cfg_t *encp;
1093 size_t hdrlen, align, reserved;
1099 /* Initialize the common code receive module. */
1100 if ((rc = efx_rx_init(sc->enp)) != 0)
1103 encp = efx_nic_cfg_get(sc->enp);
1104 sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1106 /* Calculate the receive packet buffer size. */
1107 sc->rx_prefix_size = encp->enc_rx_prefix_size;
/* Ensure IP headers are 32-bit aligned */
1110 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1111 sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1113 sc->rx_buffer_size += sc->rx_buffer_align;
1115 /* Align end of packet buffer for RX DMA end padding */
1116 align = MAX(1, encp->enc_rx_buf_align_end);
1117 EFSYS_ASSERT(ISP2(align));
1118 sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1121 * Standard mbuf zones only guarantee pointer-size alignment;
1122 * we need extra space to align to the cache line
1124 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1126 /* Select zone for packet buffers */
1127 if (reserved <= MCLBYTES)
1128 sc->rx_buffer_zone = zone_clust;
1129 else if (reserved <= MJUMPAGESIZE)
1130 sc->rx_buffer_zone = zone_jumbop;
1131 else if (reserved <= MJUM9BYTES)
1132 sc->rx_buffer_zone = zone_jumbo9;
1134 sc->rx_buffer_zone = zone_jumbo16;
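/* As a rough illustration (sizes approximate): with the default
 * 1500-byte MTU the buffer plus cache-line reserve is well under
 * MCLBYTES, so the standard 2KB cluster zone is normally selected;
 * jumbo MTUs fall through to the page-sized and larger jumbo zones.
 */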
1137 * Set up the scale table. Enable all hash types and hash insertion.
1139 for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1140 sc->rx_indir_table[index] = index % sc->rxq_count;
1141 if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1142 SFXGE_RX_SCALE_MAX)) != 0)
1144 (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1145 (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1146 (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1148 if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1149 sizeof(toep_key))) != 0)
1152 /* Start the receive queue(s). */
1153 for (index = 0; index < sc->rxq_count; index++) {
1154 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1158 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1159 sc->intr.n_alloc > 1);
1167 while (--index >= 0)
1168 sfxge_rx_qstop(sc, index);
1171 efx_rx_fini(sc->enp);
1178 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1180 struct sfxge_lro_state *st = &rxq->lro;
1183 st->conns_mask = lro_table_size - 1;
1184 KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1185 ("lro_table_size must be a power of 2"));
1187 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1189 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1191 for (i = 0; i <= st->conns_mask; ++i) {
1192 TAILQ_INIT(&st->conns[i]);
1195 LIST_INIT(&st->active_conns);
1196 TAILQ_INIT(&st->free_conns);
1199 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1201 struct sfxge_lro_state *st = &rxq->lro;
1202 struct sfxge_lro_conn *c;
1205 /* Return cleanly if sfxge_lro_init() has not been called. */
1206 if (st->conns == NULL)
1209 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1211 for (i = 0; i <= st->conns_mask; ++i) {
1212 while (!TAILQ_EMPTY(&st->conns[i])) {
1213 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1214 sfxge_lro_drop(rxq, c);
1218 while (!TAILQ_EMPTY(&st->free_conns)) {
1219 c = TAILQ_FIRST(&st->free_conns);
1220 TAILQ_REMOVE(&st->free_conns, c, link);
1221 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1225 free(st->conns_n, M_SFXGE);
1226 free(st->conns, M_SFXGE);
1233 sfxge_lro_init(struct sfxge_rxq *rxq)
1238 sfxge_lro_fini(struct sfxge_rxq *rxq)
1242 #endif /* SFXGE_LRO */
1245 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1247 struct sfxge_rxq *rxq;
1249 rxq = sc->rxq[index];
1251 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1252 ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1254 /* Free the context array and the flow table. */
1255 free(rxq->queue, M_SFXGE);
1256 sfxge_lro_fini(rxq);
1258 /* Release DMA memory. */
1259 sfxge_dma_free(&rxq->mem);
1261 sc->rxq[index] = NULL;
1267 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1269 struct sfxge_rxq *rxq;
1270 struct sfxge_evq *evq;
1274 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1276 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1279 rxq->entries = sc->rxq_entries;
1280 rxq->ptr_mask = rxq->entries - 1;
1281 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1283 sc->rxq[index] = rxq;
1286 evq = sc->evq[index];
1288 /* Allocate and zero DMA space. */
1289 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1292 /* Allocate buffer table entries. */
1293 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1296 /* Allocate the context array and the flow table. */
1297 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1298 M_SFXGE, M_WAITOK | M_ZERO);
1299 sfxge_lro_init(rxq);
1301 callout_init(&rxq->refill_callout, 1);
1303 rxq->init_state = SFXGE_RXQ_INITIALIZED;
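/* LRO statistics exported via sysctl.  Each entry names a sysctl leaf
 * and the offset of the matching per-queue counter; the handler below
 * sums that counter across all RX queues.
 */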
1308 static const struct {
1311 } sfxge_rx_stats[] = {
1312 #define SFXGE_RX_STAT(name, member) \
1313 { #name, offsetof(struct sfxge_rxq, member) }
1315 SFXGE_RX_STAT(lro_merges, lro.n_merges),
1316 SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1317 SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1318 SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1319 SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1320 SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1321 SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1322 SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1327 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1329 struct sfxge_softc *sc = arg1;
1330 unsigned int id = arg2;
1331 unsigned int sum, index;
1333 /* Sum across all RX queues */
1335 for (index = 0; index < sc->rxq_count; index++)
1336 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1337 sfxge_rx_stats[id].offset);
1339 return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1343 sfxge_rx_stat_init(struct sfxge_softc *sc)
1345 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346 struct sysctl_oid_list *stat_list;
1349 stat_list = SYSCTL_CHILDREN(sc->stats_node);
1351 for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1354 OID_AUTO, sfxge_rx_stats[id].name,
1355 CTLTYPE_UINT|CTLFLAG_RD,
1356 sc, id, sfxge_rx_stat_handler, "IU",
1362 sfxge_rx_fini(struct sfxge_softc *sc)
1366 index = sc->rxq_count;
1367 while (--index >= 0)
1368 sfxge_rx_qfini(sc, index);
1374 sfxge_rx_init(struct sfxge_softc *sc)
1376 struct sfxge_intr *intr;
1381 if (!ISP2(lro_table_size)) {
log(LOG_ERR, "%s=%u must be a power of 2",
1383 SFXGE_LRO_PARAM(table_size), lro_table_size);
1385 goto fail_lro_table_size;
1388 if (lro_idle_ticks == 0)
1389 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1394 sc->rxq_count = intr->n_alloc;
1396 KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1397 ("intr->state != SFXGE_INTR_INITIALIZED"));
1399 /* Initialize the receive queue(s) - one per interrupt. */
1400 for (index = 0; index < sc->rxq_count; index++) {
1401 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1405 sfxge_rx_stat_init(sc);
1410 /* Tear down the receive queue(s). */
1411 while (--index >= 0)
1412 sfxge_rx_qfini(sc, index);
1417 fail_lro_table_size: