/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
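/*
 * Note (editorial): the queue is topped up whenever the number of
 * outstanding buffers falls below 90% of the queue limit; see the
 * refill_threshold test at the end of sfxge_rx_qcomplete().
 */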
#ifdef SFXGE_LRO

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is abandoned.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
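/*
 * Note (editorial): because lro_table_size is a power of 2, a connection's
 * hash bucket can be selected by masking the hardware RSS hash rather than
 * by a more expensive modulo,
 *
 *	bucket = conn_hash & (lro_table_size - 1);
 *
 * which is how st->conns_mask is used throughout the code below.
 */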
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
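/*
 * Example (editorial): the VLAN ID occupies the low 12 bits of l2_id
 * (EVL_VLID_MASK is 0x0fff), so the flag bits above cannot collide with it.
 * A TCP/IPv6 stream on VLAN 5 would carry
 * l2_id == 5 | SFXGE_LRO_L2_ID_VLAN | SFXGE_LRO_L2_ID_IPV6 == 0xc005.
 */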
/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}
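/*
 * Note (editorial): the return value of ipv6_addr_cmp() is only meaningful
 * as a boolean: it is zero if and only if the two addresses are equal,
 * since OR-ing the per-word differences can only be zero when every
 * difference is zero.  This avoids data-dependent branches in the
 * connection lookup fast path.
 */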
#endif	/* SFXGE_LRO */
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
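/*
 * Editorial illustration (not used by the driver): the 40-byte key above
 * parameterises the Toeplitz hash that the controller computes in hardware.
 * For reference, a software version of the same function is sketched below;
 * an input of n bytes needs n + 4 key bytes, so 40 bytes covers the 36-byte
 * IPv6 4-tuple (and, a fortiori, the 12-byte IPv4 one).  The function name
 * and the SFXGE_TOEPLITZ_EXAMPLE guard are hypothetical.
 */
#ifdef SFXGE_TOEPLITZ_EXAMPLE
static uint32_t
toeplitz_hash(const uint8_t *key, const uint8_t *data, size_t len)
{
	/* Window holding key bits [n, n + 31] while input bit n is
	 * processed. */
	uint32_t window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
			  ((uint32_t)key[2] << 8) | (uint32_t)key[3];
	uint32_t hash = 0;
	size_t i;
	unsigned int bit;

	for (i = 0; i < len; i++) {
		for (bit = 0; bit < 8; bit++) {
			/* XOR in the key window for each set input bit. */
			if (data[i] & (0x80 >> bit))
				hash ^= window;
			/* Slide the window one key bit to the right. */
			window <<= 1;
			if (key[i + 4] & (0x80 >> bit))
				window |= 1;
		}
	}
	return (hash);
}
#endif	/* SFXGE_TOEPLITZ_EXAMPLE */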
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}
static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}
#define	SFXGE_REFILL_BATCH 64
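/*
 * Note (editorial): sfxge_rx_qfill() below posts buffer addresses to the
 * hardware in batches of SFXGE_REFILL_BATCH to amortise the cost of each
 * efx_rx_qpost() call; any partial batch is posted after the loop and a
 * single efx_rx_qpush() then publishes the new descriptors.
 */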
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}
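/*
 * Note (editorial): csum_data = 0xffff, together with the CSUM_DATA_VALID |
 * CSUM_PSEUDO_HDR flags set by the callers below, tells the stack that the
 * TCP/UDP checksum has already been verified, so it is not recomputed in
 * software.
 */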
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
		    mtod(m, uint8_t *));
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}
#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length: the 16-bit IP total length field tops out at 65535,
	 * and 9200 leaves room for one more jumbo-frame segment.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	c->next_buf.mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}
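/*
 * Worked example (editorial): for an unfragmented TCP/IPv4 segment with no
 * TCP options (th_off == 5) carrying 1448 bytes of payload, ip_len is
 * 20 + 20 + 1448 = 1488, so in sfxge_lro_try_merge():
 *
 *	pkt_length  = 1488 + 14 (Ethernet header) = 1502
 *	hdr_length  = 14 + 20 + 20 = 54
 *	data_length = 1502 - 54 = 1448
 *
 * and th_seq must equal c->next_seq for the segment to be merged.
 */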
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
	    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}
#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		uma_zfree(sc->rx_buffer_zone, m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
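	/*
	 * Note (editorial): with hash insertion enabled the controller
	 * prepends the computed Toeplitz hash to each packet, and the
	 * indirection table set up above spreads flows across queues,
	 * conceptually as
	 *
	 *	rxq_index = sc->rx_indir_table[hash % SFXGE_RX_SCALE_MAX];
	 *
	 * so flows are distributed round-robin over sc->rxq_count queues.
	 */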
	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}
#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else /* !SFXGE_LRO */

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}
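/*
 * Usage (editorial, assuming the stats node is attached under the device's
 * sysctl tree via sc->stats_node): each statistic registered below can then
 * be read with sysctl(8), e.g.
 *
 *	# sysctl dev.sfxge.0.stats.lro_merges
 */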
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}
int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
	return (rc);
}