/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
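
/*
 * For example, assuming 1024-entry rings, EFX_RXQ_LIMIT() reserves a
 * handful of descriptors for the hardware and the threshold above lands
 * at roughly 90% of the remainder; a refill is triggered whenever the
 * fill level of a queue drops below that mark.
 */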
#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
    "Large receive offload (LRO) parameters");

#define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");
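
/*
 * Illustrative usage (assuming SFXGE_PARAM() expands to the usual
 * "hw.sfxge." prefix): the tunables in this block can be set from
 * loader.conf, e.g.
 *
 *   hw.sfxge.lro.table_size="256"
 *
 * and inspected at runtime under the hw.sfxge.lro sysctl node.
 */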
/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is that we should avoid
 * coalescing segments when the sender is in slow-start because reducing
 * the ACK rate can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");
/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is that we should
 * avoid coalescing segments when the sender is recovering from loss,
 * because reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
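
/*
 * A VLAN ID only occupies the low 12 bits of a tag (EVL_VLID_MASK is
 * 0x0FFF), so bits 14 and 15 are free to carry the VLAN-encapsulation
 * and IPv6 markers alongside the VLAN ID in a single 16-bit l2_id.
 */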
/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
    const struct in6_addr *right)
{
#if LONG_BIT == 64
    const uint64_t *left64 = (const uint64_t *)left;
    const uint64_t *right64 = (const uint64_t *)right;
    return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
    return (left->s6_addr32[0] - right->s6_addr32[0]) |
        (left->s6_addr32[1] - right->s6_addr32[1]) |
        (left->s6_addr32[2] - right->s6_addr32[2]) |
        (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}
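
/*
 * Note on ipv6_addr_cmp(): the subtract-and-OR form is zero if and only
 * if every word matches, so callers can test for equality with a single
 * branch instead of the data-dependent branches of a field-by-field
 * compare.  The 64-bit variant assumes an in6_addr can be read as two
 * uint64_t words.
 */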

#endif  /* SFXGE_LRO */
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{
    rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{
    rxq->flush_state = SFXGE_FLUSH_FAILED;
}
static uint8_t toep_key[] = {
    0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
    0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
    0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
    0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
    0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
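
/*
 * The 40-byte key above is the well-known example Toeplitz key from the
 * Microsoft RSS specification; it is programmed into the controller by
 * sfxge_rx_start(), so the hardware hash delivered with each packet is
 * the standard RSS hash of the flow's address/port tuple.
 */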
static void
sfxge_rx_post_refill(void *arg)
{
    struct sfxge_rxq *rxq = arg;
    struct sfxge_softc *sc;
    unsigned int index;
    struct sfxge_evq *evq;
    uint16_t magic;

    sc = rxq->sc;
    index = rxq->index;
    evq = sc->evq[index];
    magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

    /* This is guaranteed due to the start/stop order of rx and ev */
    KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
        ("evq not started"));
    KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
        ("rxq not started"));
    efx_ev_qpost(evq->common, magic);
}
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
    /* Initially retry after 100 ms, but back off in case of
     * repeated failures as we probably have to wait for the
     * administrator to raise the pool limit. */
    if (retrying)
        rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
    else
        rxq->refill_delay = hz / 10;

    callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
        sfxge_rx_post_refill, rxq);
}
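
/*
 * The resulting retry cadence is roughly 100 ms, 200 ms, 400 ms, ...,
 * capped at 10 seconds (10 * hz ticks): prompt recovery when mbufs
 * reappear quickly, without hammering an exhausted pool.
 */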
#define SFXGE_REFILL_BATCH 64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
    struct sfxge_softc *sc;
    unsigned int index;
    struct sfxge_evq *evq;
    unsigned int batch;
    unsigned int rxfill;
    unsigned int mblksize;
    int ntodo;
    efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

    sc = rxq->sc;
    index = rxq->index;
    evq = sc->evq[index];

    prefetch_read_many(sc->enp);
    prefetch_read_many(rxq->common);

    SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

    if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
        return;

    rxfill = rxq->added - rxq->completed;
    KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
        ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
    ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
    KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
        ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

    if (ntodo == 0)
        return;

    batch = 0;
    mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
    while (ntodo-- > 0) {
        unsigned int id;
        struct sfxge_rx_sw_desc *rx_desc;
        bus_dma_segment_t seg;
        struct mbuf *m;

        id = (rxq->added + batch) & rxq->ptr_mask;
        rx_desc = &rxq->queue[id];
        KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

        rx_desc->flags = EFX_DISCARD;
        m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
            sc->rx_cluster_size);
        if (m == NULL)
            break;

        /* m_len specifies length of area to be mapped for DMA */
        m->m_len = mblksize;
        m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data,
            CACHE_LINE_SIZE);
        m->m_data += sc->rx_buffer_align;
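
        /*
         * At this point m_data sits on a cache-line boundary plus the
         * rx_buffer_align offset computed in sfxge_rx_start(), so the
         * IP header that follows the hardware prefix and Ethernet
         * header ends up 32-bit aligned.
         */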

        sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
        addr[batch++] = seg.ds_addr;

        if (batch == SFXGE_REFILL_BATCH) {
            efx_rx_qpost(rxq->common, addr, mblksize, batch,
                rxq->completed, rxq->added);
            rxq->added += batch;
            batch = 0;
        }
    }

    if (ntodo != 0)
        sfxge_rx_schedule_refill(rxq, retrying);

    if (batch != 0) {
        efx_rx_qpost(rxq->common, addr, mblksize, batch,
            rxq->completed, rxq->added);
        rxq->added += batch;
    }

    /* Make the descriptors visible to the hardware */
    bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
        BUS_DMASYNC_PREWRITE);

    efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

    /* The queue could still be empty if no descriptors were actually
     * pushed, in which case there will be no event to cause the next
     * refill, so we must schedule a refill ourselves.
     */
    if (rxq->pushed == rxq->completed) {
        sfxge_rx_schedule_refill(rxq, retrying);
    }
}
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
    if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
        return;

    /* Make sure the queue is full */
    sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
    struct ifnet *ifp = sc->ifnet;

    m->m_pkthdr.rcvif = ifp;
    m->m_pkthdr.csum_data = 0xffff;
    ifp->if_input(ifp, m);
}
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
    struct mbuf *m = rx_desc->mbuf;
    int flags = rx_desc->flags;
    int csum_flags;

    /* Convert checksum flags */
    csum_flags = (flags & EFX_CKSUM_IPV4) ?
        (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
    if (flags & EFX_CKSUM_TCPUDP)
        csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

    if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
        m->m_pkthdr.flowid =
            efx_psuedo_hdr_hash_get(sc->enp,
                EFX_RX_HASHALG_TOEPLITZ,
                mtod(m, uint8_t *));
        /* The hash covers a 4-tuple for TCP only */
        M_HASHTYPE_SET(m,
            (flags & EFX_PKT_IPV4) ?
            ((flags & EFX_PKT_TCP) ?
             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
            ((flags & EFX_PKT_TCP) ?
             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
    }
    m->m_data += sc->rx_prefix_size;
    m->m_len = rx_desc->size - sc->rx_prefix_size;
    m->m_pkthdr.len = m->m_len;
    m->m_pkthdr.csum_flags = csum_flags;
    __sfxge_rx_deliver(sc, rx_desc->mbuf);

    rx_desc->flags = EFX_DISCARD;
    rx_desc->mbuf = NULL;
}
#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
    struct sfxge_softc *sc = st->sc;
    struct mbuf *m = c->mbuf;
    struct tcphdr *c_th;
    int csum_flags;

    KASSERT(m, ("no mbuf to deliver"));

    /* Finish off packet munging and recalculate IP header checksum. */
    if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
        struct ip *iph = c->nh;
        iph->ip_len = htons(iph->ip_len);
        iph->ip_sum = 0;
        iph->ip_sum = in_cksum_hdr(iph);
        c_th = (struct tcphdr *)(iph + 1);
        csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
            CSUM_IP_CHECKED | CSUM_IP_VALID);
    } else {
        struct ip6_hdr *iph = c->nh;
        iph->ip6_plen = htons(iph->ip6_plen);
        c_th = (struct tcphdr *)(iph + 1);
        csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
    }

    c_th->th_win = c->th_last->th_win;
    c_th->th_ack = c->th_last->th_ack;
    if (c_th->th_off == c->th_last->th_off) {
        /* Copy TCP options (take care to avoid going negative). */
        int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
        memcpy(c_th + 1, c->th_last + 1, optlen);
    }

    m->m_pkthdr.flowid = c->conn_hash;
    M_HASHTYPE_SET(m,
        SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
        M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

    m->m_pkthdr.csum_flags = csum_flags;
    __sfxge_rx_deliver(sc, m);

    c->mbuf = NULL;
    c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
    unsigned bucket;

    KASSERT(!c->mbuf, ("found orphaned mbuf"));

    if (c->next_buf.mbuf != NULL) {
        sfxge_rx_deliver(rxq->sc, &c->next_buf);
        LIST_REMOVE(c, active_link);
    }

    bucket = c->conn_hash & rxq->lro.conns_mask;
    KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
    --rxq->lro.conns_n[bucket];
    TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
    TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
    struct sfxge_lro_conn *c;
    unsigned i;

    KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
        ("found active connections"));

    rxq->lro.last_purge_ticks = now;
    for (i = 0; i <= rxq->lro.conns_mask; ++i) {
        if (TAILQ_EMPTY(&rxq->lro.conns[i]))
            continue;

        c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
        if (now - c->last_pkt_ticks > lro_idle_ticks) {
            ++rxq->lro.n_drop_idle;
            sfxge_lro_drop(rxq, c);
        }
    }
}
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
{
    struct tcphdr *c_th;

    /* Tack the new mbuf onto the chain. */
    KASSERT(!mbuf->m_next, ("mbuf already chained"));
    c->mbuf_tail->m_next = mbuf;
    c->mbuf_tail = mbuf;

    /* Increase length appropriately */
    c->mbuf->m_pkthdr.len += mbuf->m_len;

    /* Update the connection state flags */
    if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
        struct ip *iph = c->nh;
        iph->ip_len += mbuf->m_len;
        c_th = (struct tcphdr *)(iph + 1);
    } else {
        struct ip6_hdr *iph = c->nh;
        iph->ip6_plen += mbuf->m_len;
        c_th = (struct tcphdr *)(iph + 1);
    }
    c_th->th_flags |= (th->th_flags & TH_PUSH);
    c->th_last = th;
    ++st->n_merges;

    /* Pass packet up now if another segment could overflow the IP
     * length.
     */
    if (c->mbuf->m_pkthdr.len > 65536 - 9200)
        sfxge_lro_deliver(st, c);
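
    /*
     * 9200 is a worst-case jumbo segment, so delivering once the merged
     * length passes 65536 - 9200 guarantees that appending one more
     * segment cannot overflow the 16-bit IP total-length field (max
     * 65535).
     */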
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
    /* Start the chain */
    c->mbuf = mbuf;
    c->mbuf_tail = c->mbuf;
    c->nh = nh;
    c->th_last = th;

    mbuf->m_pkthdr.len = mbuf->m_len;

    /* Mangle header fields for later processing */
    if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
        struct ip *iph = nh;
        iph->ip_len = ntohs(iph->ip_len);
    } else {
        struct ip6_hdr *iph = nh;
        iph->ip6_plen = ntohs(iph->ip6_plen);
    }
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
    struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
    char *eh = c->next_eh;
    int data_length, hdr_length, dont_merge;
    unsigned th_seq, pkt_length;
    struct tcphdr *th;
    unsigned now;

    if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
        struct ip *iph = c->next_nh;
        th = (struct tcphdr *)(iph + 1);
        pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
    } else {
        struct ip6_hdr *iph = c->next_nh;
        th = (struct tcphdr *)(iph + 1);
        pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
    }

    hdr_length = (char *) th + th->th_off * 4 - eh;
    data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
        hdr_length);
    th_seq = ntohl(th->th_seq);
    dont_merge = ((data_length <= 0)
        | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

    /* Check for options other than aligned timestamp. */
    if (th->th_off != 5) {
        const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
        if (th->th_off == 8 &&
            opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
                                (TCPOPT_NOP << 16) |
                                (TCPOPT_TIMESTAMP << 8) |
                                TCPOLEN_TIMESTAMP)) {
            /* timestamp option -- okay */
        } else {
            dont_merge = 1;
        }
    }
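
    /*
     * With th_off == 8 the option area is exactly 12 bytes, and the
     * word tested above packs NOP, NOP, TIMESTAMP kind and length --
     * the conventional padded layout of the timestamp option.  Any
     * other option layout forces delivery rather than risk merging
     * segments whose options differ.
     */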

    if (__predict_false(th_seq != c->next_seq)) {
        /* Out-of-order, so start counting again. */
        if (c->mbuf != NULL)
            sfxge_lro_deliver(&rxq->lro, c);
        c->n_in_order_pkts -= lro_loss_packets;
        c->next_seq = th_seq + data_length;
        ++rxq->lro.n_misorder;
        goto deliver_buf_out;
    }
    c->next_seq = th_seq + data_length;

    now = ticks;
    if (now - c->last_pkt_ticks > lro_idle_ticks) {
        ++rxq->lro.n_drop_idle;
        if (c->mbuf != NULL)
            sfxge_lro_deliver(&rxq->lro, c);
        sfxge_lro_drop(rxq, c);
        return (0);
    }
    c->last_pkt_ticks = ticks;

    if (c->n_in_order_pkts < lro_slow_start_packets) {
        /* May be in slow-start, so don't merge. */
        ++rxq->lro.n_slow_start;
        ++c->n_in_order_pkts;
        goto deliver_buf_out;
    }

    if (__predict_false(dont_merge)) {
        if (c->mbuf != NULL)
            sfxge_lro_deliver(&rxq->lro, c);
        if (th->th_flags & (TH_FIN | TH_RST)) {
            ++rxq->lro.n_drop_closed;
            sfxge_lro_drop(rxq, c);
            return (0);
        }
        goto deliver_buf_out;
    }
    rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

    if (__predict_true(c->mbuf != NULL)) {
        /* Remove headers and any padding */
        rx_buf->mbuf->m_data += hdr_length;
        rx_buf->mbuf->m_len = data_length;

        sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
    } else {
        /* Remove any padding */
        rx_buf->mbuf->m_len = pkt_length;

        sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
    }

    rx_buf->mbuf = NULL;
    return (1);

deliver_buf_out:
    sfxge_rx_deliver(rxq->sc, rx_buf);
    return (1);
}
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
{
    unsigned bucket = conn_hash & st->conns_mask;
    struct sfxge_lro_conn *c;

    if (st->conns_n[bucket] >= lro_chain_max) {
        ++st->n_too_many;
        return;
    }

    if (!TAILQ_EMPTY(&st->free_conns)) {
        c = TAILQ_FIRST(&st->free_conns);
        TAILQ_REMOVE(&st->free_conns, c, link);
    } else {
        c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
        if (c == NULL)
            return;
        c->mbuf = NULL;
        c->next_buf.mbuf = NULL;
    }

    /* Create the connection tracking data */
    ++st->conns_n[bucket];
    TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
    c->l2_id = l2_id;
    c->conn_hash = conn_hash;
    c->source = th->th_sport;
    c->dest = th->th_dport;
    c->n_in_order_pkts = 0;
    c->last_pkt_ticks = *(volatile int *)&ticks;
    c->delivered = 0;
    ++st->n_new_stream;
    /* NB. We don't initialise c->next_seq, and it doesn't matter what
     * value it has.  Most likely the next packet received for this
     * connection will not match -- no harm done.
     */
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * defer until later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
    struct sfxge_softc *sc = rxq->sc;
    struct mbuf *m = rx_buf->mbuf;
    struct ether_header *eh;
    struct sfxge_lro_conn *c;
    uint16_t l2_id;
    uint16_t l3_proto;
    void *nh;
    struct tcphdr *th;
    uint32_t conn_hash;
    unsigned bucket;

    /* Get the hardware hash */
    conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
        EFX_RX_HASHALG_TOEPLITZ,
        mtod(m, uint8_t *));

    eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
    if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
        struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
        l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
            SFXGE_LRO_L2_ID_VLAN;
        l3_proto = veh->evl_proto;
        nh = veh + 1;
    } else {
        l2_id = 0;
        l3_proto = eh->ether_type;
        nh = eh + 1;
    }

    /* Check whether this is a suitable packet (unfragmented
     * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
     * length, and compute a hash if necessary.  If not, return.
     */
    if (l3_proto == htons(ETHERTYPE_IP)) {
        struct ip *iph = nh;

        KASSERT(iph->ip_p == IPPROTO_TCP,
            ("IPv4 protocol is not TCP, but packet marker is set"));
        if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
            (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
            goto deliver_now;
        th = (struct tcphdr *)(iph + 1);
    } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
        struct ip6_hdr *iph = nh;

        KASSERT(iph->ip6_nxt == IPPROTO_TCP,
            ("IPv6 next header is not TCP, but packet marker is set"));
        l2_id |= SFXGE_LRO_L2_ID_IPV6;
        th = (struct tcphdr *)(iph + 1);
    } else {
        goto deliver_now;
    }

    bucket = conn_hash & rxq->lro.conns_mask;
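
    /*
     * As in ipv6_addr_cmp(), the lookup below uses subtract-and-OR
     * comparisons: each expression is non-zero exactly when some field
     * differs, so a non-matching connection is rejected with a single
     * test per line rather than a chain of equality branches.
     */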
    TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
        if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
            continue;
        if ((c->source - th->th_sport) | (c->dest - th->th_dport))
            continue;
        if (c->mbuf != NULL) {
            if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *c_iph, *iph = nh;
                c_iph = c->nh;
                if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
                    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
                    continue;
            } else {
                struct ip6_hdr *c_iph, *iph = nh;
                c_iph = c->nh;
                if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
                    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
                    continue;
            }
        }

        /* Re-insert at head of list to reduce lookup time. */
        TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
        TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

        if (c->next_buf.mbuf != NULL) {
            if (!sfxge_lro_try_merge(rxq, c))
                goto deliver_now;
        } else {
            LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
                active_link);
        }
        c->next_buf = *rx_buf;
        c->next_eh = eh;
        c->next_nh = nh;

        rx_buf->mbuf = NULL;
        rx_buf->flags = EFX_DISCARD;
        return;
    }

    sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
    sfxge_rx_deliver(sc, rx_buf);
}
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
    struct sfxge_lro_state *st = &rxq->lro;
    struct sfxge_lro_conn *c;
    unsigned t;

    while (!LIST_EMPTY(&st->active_conns)) {
        c = LIST_FIRST(&st->active_conns);
        if (!c->delivered && c->mbuf != NULL)
            sfxge_lro_deliver(st, c);
        if (sfxge_lro_try_merge(rxq, c)) {
            if (c->mbuf != NULL)
                sfxge_lro_deliver(st, c);
            LIST_REMOVE(c, active_link);
        }
        c->delivered = 0;
    }

    t = *(volatile int *)&ticks;
    if (__predict_false(t != st->last_purge_ticks))
        sfxge_lro_purge_idle(rxq, t);
}
#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif  /* SFXGE_LRO */
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
    struct sfxge_softc *sc = rxq->sc;
    int if_capenable = sc->ifnet->if_capenable;
    int lro_enabled = if_capenable & IFCAP_LRO;
    unsigned int index;
    struct sfxge_evq *evq;
    unsigned int completed;
    unsigned int level;
    struct mbuf *m;
    struct sfxge_rx_sw_desc *prev = NULL;

    index = rxq->index;
    evq = sc->evq[index];

    SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

    completed = rxq->completed;
    while (completed != rxq->pending) {
        unsigned int id;
        struct sfxge_rx_sw_desc *rx_desc;

        id = completed++ & rxq->ptr_mask;
        rx_desc = &rxq->queue[id];
        m = rx_desc->mbuf;

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
            goto discard;

        if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
            goto discard;

        /* Read the length from the pseudo header if required */
        if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
            uint16_t tmp_size;
            int rc;

            rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
                mtod(m, uint8_t *), &tmp_size);
            KASSERT(rc == 0, ("cannot get packet length: %d", rc));
            rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
        }

        prefetch_read_many(mtod(m, caddr_t));

        switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
        case EFX_PKT_IPV4:
            if (~if_capenable & IFCAP_RXCSUM)
                rx_desc->flags &=
                    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
            break;
        case EFX_PKT_IPV6:
            if (~if_capenable & IFCAP_RXCSUM_IPV6)
                rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
            break;
        case 0:
            /* Check for loopback packets */
            {
                struct ether_header *etherhp;

                /*LINTED*/
                etherhp = mtod(m, struct ether_header *);

                if (etherhp->ether_type ==
                    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                    EFSYS_PROBE(loopback);

                    rxq->loopback++;
                    goto discard;
                }
            }
            break;
        default:
            KASSERT(B_FALSE,
                ("Rx descriptor with both IPv4 and IPv6 flags"));
            goto discard;
        }

        /* Pass packet up the stack or into LRO (pipelined) */
        if (prev != NULL) {
            if (lro_enabled &&
                ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                sfxge_lro(rxq, prev);
            else
                sfxge_rx_deliver(sc, prev);
        }
        prev = rx_desc;
        continue;

discard:
        /* Return the packet to the pool */
        m_free(m);
        rx_desc->mbuf = NULL;
    }
    rxq->completed = completed;

    level = rxq->added - rxq->completed;

    /* Pass last packet up the stack or into LRO */
    if (prev != NULL) {
        if (lro_enabled &&
            ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
             (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
            sfxge_lro(rxq, prev);
        else
            sfxge_rx_deliver(sc, prev);
    }

    /*
     * If there are any pending flows and this is the end of the
     * poll then they must be completed.
     */
    if (eop)
        sfxge_lro_end_of_burst(rxq);

    /* Top up the queue if necessary */
    if (level < rxq->refill_threshold)
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
    struct sfxge_rxq *rxq;
    struct sfxge_evq *evq;
    unsigned int count;
    unsigned int retry = 3;

    SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

    rxq = sc->rxq[index];
    evq = sc->evq[index];

    SFXGE_EVQ_LOCK(evq);

    KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
        ("rxq not started"));

    rxq->init_state = SFXGE_RXQ_INITIALIZED;

    callout_stop(&rxq->refill_callout);

    while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
        rxq->flush_state = SFXGE_FLUSH_PENDING;

        SFXGE_EVQ_UNLOCK(evq);

        /* Flush the receive queue */
        if (efx_rx_qflush(rxq->common) != 0) {
            SFXGE_EVQ_LOCK(evq);
            rxq->flush_state = SFXGE_FLUSH_FAILED;
            break;
        }

        count = 0;
        do {
            /* Spin for 100 ms */
            DELAY(100000);

            if (rxq->flush_state != SFXGE_FLUSH_PENDING)
                break;

        } while (++count < 20);

        SFXGE_EVQ_LOCK(evq);

        if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
            /* Flush timeout - neither done nor failed */
            log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
                device_get_nameunit(sc->dev), index);
            rxq->flush_state = SFXGE_FLUSH_DONE;
        }
        retry--;
    }
    if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
        log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
            device_get_nameunit(sc->dev), index);
        rxq->flush_state = SFXGE_FLUSH_DONE;
    }

    rxq->pending = rxq->added;
    sfxge_rx_qcomplete(rxq, B_TRUE);

    KASSERT(rxq->completed == rxq->pending,
        ("rxq->completed != rxq->pending"));

    rxq->added = 0;
    rxq->pushed = 0;
    rxq->pending = 0;
    rxq->completed = 0;
    rxq->loopback = 0;

    /* Destroy the common code receive queue. */
    efx_rx_qdestroy(rxq->common);

    efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
        EFX_RXQ_NBUFS(sc->rxq_entries));

    SFXGE_EVQ_UNLOCK(evq);
}
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
    struct sfxge_rxq *rxq;
    efsys_mem_t *esmp;
    struct sfxge_evq *evq;
    int rc;

    SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

    rxq = sc->rxq[index];
    esmp = &rxq->mem;
    evq = sc->evq[index];

    KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
        ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
    KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
        ("evq->init_state != SFXGE_EVQ_STARTED"));

    /* Program the buffer table. */
    if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
        EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
        return (rc);

    /* Create the common code receive queue. */
    if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
        esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
        &rxq->common)) != 0)
        goto fail;

    SFXGE_EVQ_LOCK(evq);

    /* Enable the receive queue. */
    efx_rx_qenable(rxq->common);

    rxq->init_state = SFXGE_RXQ_STARTED;
    rxq->flush_state = SFXGE_FLUSH_REQUIRED;

    /* Try to fill the queue from the pool. */
    sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

    SFXGE_EVQ_UNLOCK(evq);

    return (0);

fail:
    efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
        EFX_RXQ_NBUFS(sc->rxq_entries));
    return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
    int index;

    efx_mac_filter_default_rxq_clear(sc->enp);

    /* Stop the receive queue(s) */
    index = sc->rxq_count;
    while (--index >= 0)
        sfxge_rx_qstop(sc, index);

    sc->rx_prefix_size = 0;
    sc->rx_buffer_size = 0;

    efx_rx_fini(sc->enp);
}
int
sfxge_rx_start(struct sfxge_softc *sc)
{
    struct sfxge_intr *intr;
    const efx_nic_cfg_t *encp;
    size_t hdrlen, align, reserved;
    int index;
    int rc;

    intr = &sc->intr;

    /* Initialize the common code receive module. */
    if ((rc = efx_rx_init(sc->enp)) != 0)
        return (rc);

    encp = efx_nic_cfg_get(sc->enp);
    sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

    /* Calculate the receive packet buffer size. */
    sc->rx_prefix_size = encp->enc_rx_prefix_size;

    /* Ensure IP headers are 32bit aligned */
    hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
    sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

    sc->rx_buffer_size += sc->rx_buffer_align;

    /* Align end of packet buffer for RX DMA end padding */
    align = MAX(1, encp->enc_rx_buf_align_end);
    EFSYS_ASSERT(ISP2(align));
    sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

    /*
     * Standard mbuf zones only guarantee pointer-size alignment;
     * we need extra space to align to the cache line
     */
    reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

    /* Select zone for packet buffers */
    if (reserved <= MCLBYTES)
        sc->rx_cluster_size = MCLBYTES;
    else if (reserved <= MJUMPAGESIZE)
        sc->rx_cluster_size = MJUMPAGESIZE;
    else if (reserved <= MJUM9BYTES)
        sc->rx_cluster_size = MJUM9BYTES;
    else
        sc->rx_cluster_size = MJUM16BYTES;
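
    /*
     * Worked example (illustrative): with a 1500-byte MTU and a typical
     * 16-byte hardware prefix, EFX_MAC_PDU() plus the alignment padding
     * and the CACHE_LINE_SIZE reserve still total well under MCLBYTES
     * (2 KB), so standard clusters are used; only jumbo MTUs spill over
     * into the page-sized or 9/16 KB jumbo zones.
     */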

    /*
     * Set up the scale table.  Enable all hash types and hash insertion.
     */
    for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
        sc->rx_indir_table[index] = index % sc->rxq_count;
    if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
        SFXGE_RX_SCALE_MAX)) != 0)
        goto fail;
    (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
        (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
        (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
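
    /*
     * Example: with four receive queues the indirection table filled
     * above repeats 0,1,2,3 across all SFXGE_RX_SCALE_MAX entries, so
     * the Toeplitz hash spreads flows evenly across queues while any
     * given 4-tuple stays pinned to one queue.
     */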

    if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
        sizeof(toep_key))) != 0)
        goto fail;

    /* Start the receive queue(s). */
    for (index = 0; index < sc->rxq_count; index++) {
        if ((rc = sfxge_rx_qstart(sc, index)) != 0)
            goto fail2;
    }

    rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
        sc->intr.n_alloc > 1);
    if (rc != 0)
        goto fail2;

    return (0);

fail2:
    while (--index >= 0)
        sfxge_rx_qstop(sc, index);
fail:
    efx_rx_fini(sc->enp);

    return (rc);
}
#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
    struct sfxge_lro_state *st = &rxq->lro;
    unsigned i;

    st->conns_mask = lro_table_size - 1;
    KASSERT(!((st->conns_mask + 1) & st->conns_mask),
        ("lro_table_size must be a power of 2"));
    st->sc = rxq->sc;
    st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
        M_SFXGE, M_WAITOK);
    st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
        M_SFXGE, M_WAITOK);
    for (i = 0; i <= st->conns_mask; ++i) {
        TAILQ_INIT(&st->conns[i]);
        st->conns_n[i] = 0;
    }
    LIST_INIT(&st->active_conns);
    TAILQ_INIT(&st->free_conns);
}
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
    struct sfxge_lro_state *st = &rxq->lro;
    struct sfxge_lro_conn *c;
    unsigned i;

    /* Return cleanly if sfxge_lro_init() has not been called. */
    if (st->conns == NULL)
        return;

    KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

    for (i = 0; i <= st->conns_mask; ++i) {
        while (!TAILQ_EMPTY(&st->conns[i])) {
            c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
            sfxge_lro_drop(rxq, c);
        }
    }

    while (!TAILQ_EMPTY(&st->free_conns)) {
        c = TAILQ_FIRST(&st->free_conns);
        TAILQ_REMOVE(&st->free_conns, c, link);
        KASSERT(!c->mbuf, ("found orphaned mbuf"));
        free(c, M_SFXGE);
    }

    free(st->conns_n, M_SFXGE);
    free(st->conns, M_SFXGE);
    st->conns = NULL;
}
#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif  /* SFXGE_LRO */
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
    struct sfxge_rxq *rxq;

    rxq = sc->rxq[index];

    KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
        ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

    /* Free the context array and the flow table. */
    free(rxq->queue, M_SFXGE);
    sfxge_lro_fini(rxq);

    /* Release DMA memory. */
    sfxge_dma_free(&rxq->mem);

    sc->rxq[index] = NULL;

    free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
    struct sfxge_rxq *rxq;
    struct sfxge_evq *evq;
    efsys_mem_t *esmp;
    int rc;

    KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

    rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
    rxq->sc = sc;
    rxq->index = index;
    rxq->entries = sc->rxq_entries;
    rxq->ptr_mask = rxq->entries - 1;
    rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

    sc->rxq[index] = rxq;
    esmp = &rxq->mem;

    evq = sc->evq[index];

    /* Allocate and zero DMA space. */
    if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
        return (rc);

    /* Allocate buffer table entries. */
    sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
        &rxq->buf_base_id);

    /* Allocate the context array and the flow table. */
    rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
        M_SFXGE, M_WAITOK | M_ZERO);
    sfxge_lro_init(rxq);

    callout_init(&rxq->refill_callout, 1);

    rxq->init_state = SFXGE_RXQ_INITIALIZED;

    return (0);
}
static const struct {
    const char *name;
    size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
    { #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
    SFXGE_RX_STAT(lro_merges, lro.n_merges),
    SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
    SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
    SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
    SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
    SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
    SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
    SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
    struct sfxge_softc *sc = arg1;
    unsigned int id = arg2;
    unsigned int sum, index;

    /* Sum across all RX queues */
    sum = 0;
    for (index = 0; index < sc->rxq_count; index++)
        sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
            sfxge_rx_stats[id].offset);

    return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
    struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
    struct sysctl_oid_list *stat_list;
    unsigned int id;

    stat_list = SYSCTL_CHILDREN(sc->stats_node);

    for (id = 0; id < nitems(sfxge_rx_stats); id++) {
        SYSCTL_ADD_PROC(ctx, stat_list,
            OID_AUTO, sfxge_rx_stats[id].name,
            CTLTYPE_UINT|CTLFLAG_RD,
            sc, id, sfxge_rx_stat_handler, "IU",
            "");
    }
}
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
    int index;

    index = sc->rxq_count;
    while (--index >= 0)
        sfxge_rx_qfini(sc, index);

    sc->rxq_count = 0;
}
int
sfxge_rx_init(struct sfxge_softc *sc)
{
    struct sfxge_intr *intr;
    int index;
    int rc;

#ifdef SFXGE_LRO
    if (!ISP2(lro_table_size)) {
        log(LOG_ERR, "%s=%u must be power of 2",
            SFXGE_LRO_PARAM(table_size), lro_table_size);
        rc = EINVAL;
        goto fail_lro_table_size;
    }

    if (lro_idle_ticks == 0)
        lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

    intr = &sc->intr;

    sc->rxq_count = intr->n_alloc;

    KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
        ("intr->state != SFXGE_INTR_INITIALIZED"));

    /* Initialize the receive queue(s) - one per interrupt. */
    for (index = 0; index < sc->rxq_count; index++) {
        if ((rc = sfxge_rx_qinit(sc, index)) != 0)
            goto fail;
    }

    sfxge_rx_stat_init(sc);

    return (0);

fail:
    /* Tear down the receive queue(s). */
    while (--index >= 0)
        sfxge_rx_qfini(sc, index);

    sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
    return (rc);
}