/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/syslog.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"
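
/* RX queues are topped up again once the fill level drops below this
 * fraction (nine tenths) of the queue limit; see sfxge_rx_qcomplete(). */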
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;

	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}
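
/*
 * Flush-state callbacks, presumably invoked from event processing when
 * the hardware acknowledges (or fails) the queue flush requested by
 * sfxge_rx_qstop().
 */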
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{
	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{
	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
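
/* This appears to be the de facto standard 40-byte Toeplitz hash key
 * (the one given in Microsoft's RSS specification); it is programmed
 * into the controller in sfxge_rx_start(). */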
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
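
/* Callout handler armed by sfxge_rx_schedule_refill(): posts a software
 * ("magic") event to the queue's event queue so that the refill happens
 * in event-processing context. */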
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}
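
/* Arm (or re-arm) the refill callout after an mbuf allocation failure. */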
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}
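
/* Allocate an mbuf and attach a receive buffer from the selected zone;
 * returns NULL if either allocation fails. */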
static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH 64
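
/*
 * Attempt to bring the RX queue up to the given fill target by
 * allocating mbufs, DMA-mapping them and posting them to the hardware
 * in batches of SFXGE_REFILL_BATCH descriptors.
 */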
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}
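
/* Translate the hardware RX flags into mbuf metadata (checksum state,
 * flow ID) and pass the packet to the stack. */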
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}
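
/* Finish a coalesced LRO packet: restore the IP and TCP header fields
 * that were mangled during merging and hand the super-packet to the
 * stack. */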
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
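
/* Append a new in-order segment to an existing coalesced packet,
 * updating the IP length and TCP flags accordingly. */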
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}
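
/* Begin coalescing on a connection: adopt this packet as the start of a
 * new super-packet and convert header length fields to host order so
 * they are easy to update as further segments are merged. */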
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}
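
/* Allocate (or recycle from the free list) a connection-tracking entry
 * and insert it at the head of its hash bucket. */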
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}
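
/* At the end of an RX event burst, deliver or hold the packets buffered
 * on the active connections and occasionally purge idle state. */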
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}
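
/* Process completed RX descriptors: filter out discards and loopback
 * self-test packets, feed the rest to LRO or straight to the stack,
 * then top up the queue if it has drained below the refill threshold. */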
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
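
/* Stop an RX queue: initiate a hardware flush, wait (briefly) for it to
 * complete, then release any outstanding mbufs and the common-code
 * queue state. */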
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		log(LOG_ERR, "Flush of RX queue %u failed\n", index);

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
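
/* Start an RX queue: program the buffer table, create and enable the
 * common-code queue, and fill it with mbufs. */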
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}
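
/* Tear down the software state for one RX queue. */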
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
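
/* Allocate and initialise the software state for one RX queue. */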
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}
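
/* Per-queue LRO statistics, exported as read-only sysctls that sum the
 * counters across all RX queues. */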
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

fail_lro_table_size:
	return (rc);
}