/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

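/* Refill the receive ring when the fill level drops below ~90% of the
 * ring limit (checked in sfxge_rx_qcomplete()).
 */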
#define	RX_REFILL_THRESHOLD	(EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
#define	RX_REFILL_THRESHOLD_2	(RX_REFILL_THRESHOLD / 2)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

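/* Taken together, these tunables gate merging: segments are only merged
 * once a connection has seen enough in-order payload (lro_slow_start_packets
 * initially, or lro_loss_packets after a sequence gap), and connections that
 * stay idle for more than lro_idle_ticks are dropped.
 */
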
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN		0x4000
#define	SFXGE_LRO_L2_ID_IPV6		0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;

	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{
	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{
	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

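/* Key for the Toeplitz hash used by receive-side scaling; this appears to
 * be the widely used default 40-byte RSS key.
 */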
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

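/* Allocate an mbuf header and attach a receive buffer from the driver's
 * packet-buffer zone; returns NULL if either allocation fails.
 */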
static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_DONTWAIT);

	/* Allocate (and attach) packet buffer */
	if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_DONTWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

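/* Number of receive descriptors posted to the hardware per efx_rx_qpost() call */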
#define	SFXGE_REFILL_BATCH	64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

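/* Refill the queue back up to its limit when a deferred refill fires. */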
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
}

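/* Hand a fully-formed packet to the network stack via if_input(). */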
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.header = m->m_data;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}

	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

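/* Push a connection's merged segment chain up the stack, restoring the IP
 * length/checksum and the latest TCP window and ACK before delivery.
 */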
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_DONTWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

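/* At the end of an event-queue poll, deliver or hold any packets still
 * pending on active connections and periodically purge idle connections.
 */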
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

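/* Process receive completions: walk the descriptors between 'completed' and
 * 'pending', delivering packets to LRO or directly to the stack, then top up
 * the ring if it has drained below the refill threshold.
 */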
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}

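/* Stop a receive queue: flush the hardware queue (polling for up to
 * 20 x 100 ms), complete any outstanding descriptors and tear down the
 * common-code queue.
 */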
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);
	rxq->common = NULL;

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

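/* Allocate and initialise the LRO hash table (one TAILQ and a fill counter
 * per bucket) for this receive queue.
 */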
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

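/* LRO statistics exported via sysctl; each entry maps a name to the offset
 * of the corresponding counter within struct sfxge_rxq.
 */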
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return SYSCTL_OUT(req, &sum, sizeof(sum));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}