/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/syslog.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"
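
/*
 * Refill threshold: the receive ring is topped up again once its fill
 * level drops below 90% of the usable descriptor limit (see
 * sfxge_rx_qcomplete()).
 */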
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN	0x4000
#define	SFXGE_LRO_L2_ID_IPV6	0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;

	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{
	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{
	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
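
/*
 * RSS hash key.  This appears to be the well-known sample Toeplitz key
 * from the Microsoft RSS specification; it is programmed into the
 * controller in sfxge_rx_start().
 */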
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
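
/*
 * Request a refill from within the event-queue context: post a software
 * ("magic") event tagged with the queue index; the event handler is then
 * expected to call sfxge_rx_qrefill() with the event-queue lock held.
 */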
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}
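
/*
 * Two-stage allocation: take an mbuf header from the generic zone, then
 * attach a packet buffer from the cluster zone selected at start time
 * (sc->rx_buffer_zone) to fit the configured MTU.
 */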
static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}
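
/*
 * Buffers are posted to the hardware ring in batches of up to 64 to
 * amortise the cost of efx_rx_qpost() over many descriptors.
 */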
#define	SFXGE_REFILL_BATCH	64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map,
		    m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
		    EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
		    ((flags & EFX_PKT_TCP) ?
			M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
		    ((flags & EFX_PKT_TCP) ?
			M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO
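
/*
 * Deliver a coalesced packet chain to the stack.  Header fields that were
 * kept in host byte order while segments were merged are restored to
 * network order, and the IPv4 header checksum is recomputed.
 */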
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length field.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}
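
/*
 * Begin a new coalesced chain with the held buffer as its first segment.
 * The IP length field is converted to host order here so that
 * sfxge_lro_merge() can add to it cheaply; sfxge_lro_deliver() converts
 * it back before the packet is passed up.
 */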
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *)iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *)th - eh;
	}

	hdr_length = (char *)th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
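	/*
	 * Conditions that forbid merging are collected with bitwise OR
	 * rather than logical operators to avoid conditional branches:
	 * no payload, or any of URG/SYN/RST/FIN set.
	 */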
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *)(th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * defer until the end of the current event burst.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));
673 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
674 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
675 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
676 SFXGE_LRO_L2_ID_VLAN;
677 l3_proto = veh->evl_proto;
681 l3_proto = eh->ether_type;

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;
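
	/*
	 * Look for an existing connection in this hash bucket.  As in
	 * ipv6_addr_cmp() above, the field comparisons below use
	 * subtraction and bitwise OR so that a mismatch is detected
	 * without a conditional branch per field.
	 */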
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}
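
/*
 * Flush LRO state at the end of an event-queue poll: packets that could
 * not be merged are pushed up the stack, and idle connections are purged
 * at most once per tick.
 */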
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
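
/*
 * Process receive completions.  Delivery is pipelined: each packet is
 * handed to the stack (or to LRO) one loop iteration after its descriptor
 * is examined, giving the prefetch of its payload time to complete.
 */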
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	SFXGE_EVQ_UNLOCK(evq);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;
	} while (++count < 20);

	SFXGE_EVQ_LOCK(evq);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be a power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}