/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"
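/* Refill thresholds for the receive ring: the ring is topped back up once
 * its fill level drops below nine tenths of the queue limit (see
 * sfxge_rx_qcomplete() below).
 */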
#define RX_REFILL_THRESHOLD	(EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
#define RX_REFILL_THRESHOLD_2	(RX_REFILL_THRESHOLD / 2)
/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
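/* Note: the result above is only ever tested for zero (addresses equal)
 * versus non-zero (addresses differ); the arithmetic value of the word
 * differences is not otherwise meaningful.
 */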
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_DONE;

sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
	rxq->flush_state = SFXGE_FLUSH_FAILED;
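/* Key used for Toeplitz RSS hashing; programmed into the controller in
 * sfxge_rx_start() below.
 */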
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
sfxge_rx_post_refill(void *arg)
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;

	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
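/* Allocate an mbuf header and attach a receive buffer from the adapter's
 * packet buffer zone; returns NULL if either allocation fails.
 */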
static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
#define SFXGE_REFILL_BATCH 64
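/* Fill the RX ring up to the requested level. Buffers are posted to the
 * common code in batches of SFXGE_REFILL_BATCH; if the ring cannot be
 * filled completely (e.g. mbuf allocation fails), another refill attempt
 * is scheduled via sfxge_rx_schedule_refill().
 */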
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
	struct sfxge_softc *sc;
	struct sfxge_evq *evq;
	unsigned int mblksize;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));

	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);

	sfxge_rx_schedule_refill(rxq, retrying);

	efx_rx_qpost(rxq->common, addr, mblksize, batch,
	    rxq->completed, rxq->added);

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
	if (rxq->init_state != SFXGE_RXQ_STARTED)

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
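/* Pass a received packet up to the network stack through the interface's
 * if_input routine.
 */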
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
	struct mbuf *m = rx_desc->mbuf;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
		m->m_flags |= M_FLOWID;

	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
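/* Deliver a coalesced LRO packet: restore the IP length field to network
 * byte order, recompute the IPv4 header checksum, take the latest window
 * and ACK values from the last merged segment, then hand the chain to the
 * stack.
 */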
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;

	KASSERT(m, ("no mbuf to deliver"));

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);

	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
	struct sfxge_lro_conn *c;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
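/* Append a new in-order segment to an existing coalesced packet: chain the
 * mbuf, grow the IP payload length, carry over the PUSH flag, and deliver
 * early if one more segment could overflow the 16-bit IP length field.
 */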
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
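/* Begin a new coalesced packet for this connection: record the header
 * pointers and convert the IP length field to host byte order so later
 * merges can add to it directly; sfxge_lro_deliver() converts it back
 * before handing the packet to the stack.
 */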
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
	/* Start the chain */
	c->mbuf_tail = c->mbuf;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		iph->ip_len = ntohs(iph->ip_len);
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	    hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	c->next_seq = th_seq + data_length;

	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;

	if (__predict_false(dont_merge)) {
		sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
		goto deliver_buf_out;

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);

	sfxge_rx_deliver(rxq->sc, rx_buf);
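/* Start tracking a new connection: take an entry from the free list (or
 * allocate one) and insert it at the head of its hash bucket, unless the
 * bucket's chain has already reached lro_chain_max.
 */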
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		c->next_buf.mbuf = NULL;

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;

	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
/* Process mbuf and decide whether to dispatch it to the stack now or
 * hold it in the LRO state for possible merging with later segments.
 */
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		l3_proto = eh->ether_type;
	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
				struct ip6_hdr *c_iph, *iph = nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf) {
			if (!sfxge_lro_try_merge(rxq, c))
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
		c->next_buf = *rx_buf;

	rx_buf->flags = EFX_DISCARD;

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
	sfxge_rx_deliver(sc, rx_buf);
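/* Called at the end of an event-queue poll: merge or deliver any buffered
 * segments for connections on the active list, and periodically purge
 * connections that have been idle for longer than lro_idle_ticks.
 */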
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
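/* Process completed RX descriptors: discard errored or loopback packets,
 * pass the rest to LRO (when enabled) or straight to the stack, then top
 * the ring back up if it has drained below RX_REFILL_THRESHOLD.
 */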
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	struct sfxge_evq *evq;
	unsigned int completed;
	struct sfxge_rx_sw_desc *prev = NULL;

	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];

		if (rxq->init_state != SFXGE_RXQ_STARTED)

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

		/* Pass packet up the stack or into LRO (pipelined) */
		sfxge_lro(rxq, prev);
		sfxge_rx_deliver(sc, prev);

		/* Return the packet to the pool */
		rx_desc->mbuf = NULL;

	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	sfxge_lro(rxq, prev);
	sfxge_rx_deliver(sc, prev);

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
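/* Stop a receive queue: cancel the refill callout, initiate a hardware
 * flush and poll (in 100 ms steps, up to 20 times) for it to complete,
 * then drain any outstanding descriptors and release the common-code
 * queue and its buffer table entries.
 */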
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	/* Spin for 100 ms */
	if (rxq->flush_state != SFXGE_FLUSH_PENDING)

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
sfxge_rx_stop(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
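/* Start the receive path: size the packet buffers for the current MTU,
 * program an RSS indirection table that spreads flows across all allocated
 * queues, enable Toeplitz hashing over IPv4/IPv6 and TCP flows, and start
 * each receive queue.
 */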
sfxge_rx_start(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)

	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)

	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	efx_rx_fini(sc->enp);
static void sfxge_lro_init(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);

	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);

	sc->rxq[index] = rxq;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;
static const struct {
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
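/* Sysctl handler: sum the per-queue statistic selected by arg2 across all
 * receive queues and report the total.
 */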
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
		    sfxge_rx_stats[id].offset);

	return SYSCTL_OUT(req, &sum, sizeof(sum));
sfxge_rx_stat_init(struct sfxge_softc *sc)
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	    id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
		    OID_AUTO, sfxge_rx_stats[id].name,
		    CTLTYPE_UINT|CTLFLAG_RD,
		    sc, id, sfxge_rx_stat_handler, "IU",
sfxge_rx_fini(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
sfxge_rx_init(struct sfxge_softc *sc)
	struct sfxge_intr *intr;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)

	sfxge_rx_stat_init(sc);

	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);