sfxge(4): get RSS key to be programmed into NIC from the kernel
1 /*-
2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_rss.h"
38
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
48
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57
58 #include <machine/in_cksum.h>
59
60 #ifdef RSS
61 #include <net/rss_config.h>
62 #endif
63
64 #include "common/efx.h"
65
66
67 #include "sfxge.h"
68 #include "sfxge_rx.h"
69
70 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
71
72 #ifdef SFXGE_LRO
73
74 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75             "Large receive offload (LRO) parameters");
76
77 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
78
79 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
80  * means we can accelerate a larger number of streams.
81  */
82 static unsigned lro_table_size = 128;
83 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85             &lro_table_size, 0,
86             "Size of the LRO hash table (must be a power of 2)");
87
88 /* Maximum length of a hash chain.  If chains get too long then the lookup
89  * time increases and may exceed the benefit of LRO.
90  */
91 static unsigned lro_chain_max = 20;
92 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94             &lro_chain_max, 0,
95             "The maximum length of a hash chain");
96
97 /* Maximum time (in ticks) that a connection can be idle before its LRO
98  * state is discarded.
99  */
100 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103             &lro_idle_ticks, 0,
104             "The maximum time (in ticks) that a connection can be idle "
105             "before its LRO state is discarded");
106
107 /* Number of packets with payload that must arrive in-order before a
108  * connection is eligible for LRO.  The idea is we should avoid coalescing
109  * segments when the sender is in slow-start because reducing the ACK rate
110  * can damage performance.
111  */
112 static int lro_slow_start_packets = 2000;
113 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115             &lro_slow_start_packets, 0,
116             "Number of packets with payload that must arrive in-order before "
117             "a connection is eligible for LRO");
118
119 /* Number of packets with payload that must arrive in-order following loss
120  * before a connection is eligible for LRO.  The idea is we should avoid
121  * coalescing segments when the sender is recovering from loss, because
122  * reducing the ACK rate can damage performance.
123  */
124 static int lro_loss_packets = 20;
125 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127             &lro_loss_packets, 0,
128             "Number of packets with payload that must arrive in-order "
129             "following loss before a connection is eligible for LRO");
130
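/* Example (a sketch): assuming SFXGE_PARAM() expands to the usual
 * "hw.sfxge." sysctl prefix defined in sfxge.h, the tunables above could
 * be set at boot time from loader.conf(5), e.g.
 *
 *   hw.sfxge.lro.table_size="256"
 *   hw.sfxge.lro.idle_ticks="200"
 *
 * They are CTLFLAG_RDTUN, so they cannot be changed at runtime.
 */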
131 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
132 #define SFXGE_LRO_L2_ID_VLAN 0x4000
133 #define SFXGE_LRO_L2_ID_IPV6 0x8000
134 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136
137 /* Compare IPv6 addresses, avoiding conditional branches */
138 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
139                                    const struct in6_addr *right)
140 {
141 #if LONG_BIT == 64
142         const uint64_t *left64 = (const uint64_t *)left;
143         const uint64_t *right64 = (const uint64_t *)right;
144         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
145 #else
146         return (left->s6_addr32[0] - right->s6_addr32[0]) |
147                (left->s6_addr32[1] - right->s6_addr32[1]) |
148                (left->s6_addr32[2] - right->s6_addr32[2]) |
149                (left->s6_addr32[3] - right->s6_addr32[3]);
150 #endif
151 }
152
153 #endif  /* SFXGE_LRO */
154
155 void
156 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157 {
158
159         rxq->flush_state = SFXGE_FLUSH_DONE;
160 }
161
162 void
163 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164 {
165
166         rxq->flush_state = SFXGE_FLUSH_FAILED;
167 }
168
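/* RSS hash key to program into the NIC.  With "options RSS" the key is
 * taken from the kernel via rss_getkey() in sfxge_rx_start(); otherwise
 * the static default below is used, which appears to be the well-known
 * 40-byte Toeplitz key from Microsoft's RSS verification suite.
 */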
169 #ifdef RSS
170 static uint8_t toep_key[RSS_KEYSIZE];
171 #else
172 static uint8_t toep_key[] = {
173         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
174         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
175         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
176         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
177         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
178 };
179 #endif
180
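/* Callout handler scheduled by sfxge_rx_schedule_refill(): post a software
 * refill event to the queue's event queue so that the refill itself runs
 * in event queue context.
 */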
181 static void
182 sfxge_rx_post_refill(void *arg)
183 {
184         struct sfxge_rxq *rxq = arg;
185         struct sfxge_softc *sc;
186         unsigned int index;
187         struct sfxge_evq *evq;
188         uint16_t magic;
189
190         sc = rxq->sc;
191         index = rxq->index;
192         evq = sc->evq[index];
193         magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194
195         /* This is guaranteed due to the start/stop order of rx and ev */
196         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197             ("evq not started"));
198         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199             ("rxq not started"));
200         efx_ev_qpost(evq->common, magic);
201 }
202
203 static void
204 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205 {
206         /* Initially retry after 100 ms, but back off in case of
207          * repeated failures as we probably have to wait for the
208          * administrator to raise the pool limit. */
209         if (retrying)
210                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211         else
212                 rxq->refill_delay = hz / 10;
213
214         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215                              sfxge_rx_post_refill, rxq);
216 }
217
218 #define SFXGE_REFILL_BATCH  64
219
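/* Fill the receive ring with up to "target" new mbuf cluster buffers,
 * posting descriptors to the hardware in batches of SFXGE_REFILL_BATCH.
 * If the mbuf pool is exhausted, a delayed retry is scheduled instead.
 */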
220 static void
221 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222 {
223         struct sfxge_softc *sc;
224         unsigned int index;
225         struct sfxge_evq *evq;
226         unsigned int batch;
227         unsigned int rxfill;
228         unsigned int mblksize;
229         int ntodo;
230         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231
232         sc = rxq->sc;
233         index = rxq->index;
234         evq = sc->evq[index];
235
236         prefetch_read_many(sc->enp);
237         prefetch_read_many(rxq->common);
238
239         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240
241         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242                 return;
243
244         rxfill = rxq->added - rxq->completed;
245         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
250
251         if (ntodo == 0)
252                 return;
253
254         batch = 0;
255         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256         while (ntodo-- > 0) {
257                 unsigned int id;
258                 struct sfxge_rx_sw_desc *rx_desc;
259                 bus_dma_segment_t seg;
260                 struct mbuf *m;
261
262                 id = (rxq->added + batch) & rxq->ptr_mask;
263                 rx_desc = &rxq->queue[id];
264                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265
266                 rx_desc->flags = EFX_DISCARD;
267                 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268                     sc->rx_cluster_size);
269                 if (m == NULL)
270                         break;
271
272                 /* m_len specifies length of area to be mapped for DMA */
273                 m->m_len  = mblksize;
274                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
275                 m->m_data += sc->rx_buffer_align;
276
277                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278                 addr[batch++] = seg.ds_addr;
279
280                 if (batch == SFXGE_REFILL_BATCH) {
281                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
282                             rxq->completed, rxq->added);
283                         rxq->added += batch;
284                         batch = 0;
285                 }
286         }
287
288         if (ntodo != 0)
289                 sfxge_rx_schedule_refill(rxq, retrying);
290
291         if (batch != 0) {
292                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
293                     rxq->completed, rxq->added);
294                 rxq->added += batch;
295         }
296
297         /* Make the descriptors visible to the hardware */
298         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299                         BUS_DMASYNC_PREWRITE);
300
301         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302
303         /* The queue could still be empty if no descriptors were actually
304          * pushed, in which case there will be no event to cause the next
305          * refill, so we must schedule a refill ourselves.
306          */
307         if (rxq->pushed == rxq->completed) {
308                 sfxge_rx_schedule_refill(rxq, retrying);
309         }
310 }
311
312 void
313 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314 {
315
316         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317                 return;
318
319         /* Make sure the queue is full */
320         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321 }
322
323 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324 {
325         struct ifnet *ifp = sc->ifnet;
326
327         m->m_pkthdr.rcvif = ifp;
328         m->m_pkthdr.csum_data = 0xffff;
329         ifp->if_input(ifp, m);
330 }
331
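/* Deliver a single received packet to the stack, first translating the
 * hardware checksum and hash flags into mbuf metadata.
 */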
332 static void
333 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
334 {
335         struct mbuf *m = rx_desc->mbuf;
336         int flags = rx_desc->flags;
337         int csum_flags;
338
339         /* Convert checksum flags */
340         csum_flags = (flags & EFX_CKSUM_IPV4) ?
341                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
342         if (flags & EFX_CKSUM_TCPUDP)
343                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
344
345         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
346                 m->m_pkthdr.flowid =
347                         efx_psuedo_hdr_hash_get(sc->enp,
348                                                 EFX_RX_HASHALG_TOEPLITZ,
349                                                 mtod(m, uint8_t *));
350                 /* The hash covers a 4-tuple for TCP only */
351                 M_HASHTYPE_SET(m,
352                     (flags & EFX_PKT_IPV4) ?
353                         ((flags & EFX_PKT_TCP) ?
354                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
355                         ((flags & EFX_PKT_TCP) ?
356                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
357         }
358         m->m_data += sc->rx_prefix_size;
359         m->m_len = rx_desc->size - sc->rx_prefix_size;
360         m->m_pkthdr.len = m->m_len;
361         m->m_pkthdr.csum_flags = csum_flags;
362         __sfxge_rx_deliver(sc, rx_desc->mbuf);
363
364         rx_desc->flags = EFX_DISCARD;
365         rx_desc->mbuf = NULL;
366 }
367
368 #ifdef SFXGE_LRO
369
370 static void
371 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
372 {
373         struct sfxge_softc *sc = st->sc;
374         struct mbuf *m = c->mbuf;
375         struct tcphdr *c_th;
376         int csum_flags;
377
378         KASSERT(m, ("no mbuf to deliver"));
379
380         ++st->n_bursts;
381
382         /* Finish off packet munging and recalculate IP header checksum. */
383         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
384                 struct ip *iph = c->nh;
385                 iph->ip_len = htons(iph->ip_len);
386                 iph->ip_sum = 0;
387                 iph->ip_sum = in_cksum_hdr(iph);
388                 c_th = (struct tcphdr *)(iph + 1);
389                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
390                               CSUM_IP_CHECKED | CSUM_IP_VALID);
391         } else {
392                 struct ip6_hdr *iph = c->nh;
393                 iph->ip6_plen = htons(iph->ip6_plen);
394                 c_th = (struct tcphdr *)(iph + 1);
395                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
396         }
397
398         c_th->th_win = c->th_last->th_win;
399         c_th->th_ack = c->th_last->th_ack;
400         if (c_th->th_off == c->th_last->th_off) {
401                 /* Copy TCP options (take care to avoid going negative). */
402                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
403                 memcpy(c_th + 1, c->th_last + 1, optlen);
404         }
405
406         m->m_pkthdr.flowid = c->conn_hash;
407         M_HASHTYPE_SET(m,
408             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
409                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
410
411         m->m_pkthdr.csum_flags = csum_flags;
412         __sfxge_rx_deliver(sc, m);
413
414         c->mbuf = NULL;
415         c->delivered = 1;
416 }
417
418 /* Drop the given connection, and add it to the free list. */
419 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
420 {
421         unsigned bucket;
422
423         KASSERT(!c->mbuf, ("found orphaned mbuf"));
424
425         if (c->next_buf.mbuf != NULL) {
426                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
427                 LIST_REMOVE(c, active_link);
428         }
429
430         bucket = c->conn_hash & rxq->lro.conns_mask;
431         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
432         --rxq->lro.conns_n[bucket];
433         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
434         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
435 }
436
437 /* Stop tracking connections that have gone idle in order to keep hash
438  * chains short.
439  */
440 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
441 {
442         struct sfxge_lro_conn *c;
443         unsigned i;
444
445         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
446                 ("found active connections"));
447
448         rxq->lro.last_purge_ticks = now;
449         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
450                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
451                         continue;
452
453                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
454                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
455                         ++rxq->lro.n_drop_idle;
456                         sfxge_lro_drop(rxq, c);
457                 }
458         }
459 }
460
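/* Append an in-order TCP segment to an existing LRO chain, updating the
 * IP length, TCP flags and connection state, and delivering the chain if
 * it is close to the maximum IP datagram size.
 */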
461 static void
462 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
463                 struct mbuf *mbuf, struct tcphdr *th)
464 {
465         struct tcphdr *c_th;
466
467         /* Tack the new mbuf onto the chain. */
468         KASSERT(!mbuf->m_next, ("mbuf already chained"));
469         c->mbuf_tail->m_next = mbuf;
470         c->mbuf_tail = mbuf;
471
472         /* Increase length appropriately */
473         c->mbuf->m_pkthdr.len += mbuf->m_len;
474
475         /* Update the connection state flags */
476         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
477                 struct ip *iph = c->nh;
478                 iph->ip_len += mbuf->m_len;
479                 c_th = (struct tcphdr *)(iph + 1);
480         } else {
481                 struct ip6_hdr *iph = c->nh;
482                 iph->ip6_plen += mbuf->m_len;
483                 c_th = (struct tcphdr *)(iph + 1);
484         }
485         c_th->th_flags |= (th->th_flags & TH_PUSH);
486         c->th_last = th;
487         ++st->n_merges;
488
489         /* Pass packet up now if another segment could overflow the IP
490          * length.
491          */
492         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
493                 sfxge_lro_deliver(st, c);
494 }
495
496 static void
497 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
498                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
499 {
500         /* Start the chain */
501         c->mbuf = mbuf;
502         c->mbuf_tail = c->mbuf;
503         c->nh = nh;
504         c->th_last = th;
505
506         mbuf->m_pkthdr.len = mbuf->m_len;
507
508         /* Mangle header fields for later processing */
509         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
510                 struct ip *iph = nh;
511                 iph->ip_len = ntohs(iph->ip_len);
512         } else {
513                 struct ip6_hdr *iph = nh;
514                 iph->ip6_plen = ntohs(iph->ip6_plen);
515         }
516 }
517
518 /* Try to merge or otherwise hold or deliver (as appropriate) the
519  * packet buffered for this connection (c->next_buf).  Return a flag
520  * indicating whether the connection is still active for LRO purposes.
521  */
522 static int
523 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
524 {
525         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
526         char *eh = c->next_eh;
527         int data_length, hdr_length, dont_merge;
528         unsigned th_seq, pkt_length;
529         struct tcphdr *th;
530         unsigned now;
531
532         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
533                 struct ip *iph = c->next_nh;
534                 th = (struct tcphdr *)(iph + 1);
535                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
536         } else {
537                 struct ip6_hdr *iph = c->next_nh;
538                 th = (struct tcphdr *)(iph + 1);
539                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
540         }
541
542         hdr_length = (char *) th + th->th_off * 4 - eh;
543         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
544                        hdr_length);
545         th_seq = ntohl(th->th_seq);
546         dont_merge = ((data_length <= 0)
547                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
548
549         /* Check for options other than aligned timestamp. */
550         if (th->th_off != 5) {
551                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
552                 if (th->th_off == 8 &&
553                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
554                                         (TCPOPT_NOP << 16) |
555                                         (TCPOPT_TIMESTAMP << 8) |
556                                         TCPOLEN_TIMESTAMP)) {
557                         /* timestamp option -- okay */
558                 } else {
559                         dont_merge = 1;
560                 }
561         }
562
563         if (__predict_false(th_seq != c->next_seq)) {
564                 /* Out-of-order, so start counting again. */
565                 if (c->mbuf != NULL)
566                         sfxge_lro_deliver(&rxq->lro, c);
567                 c->n_in_order_pkts -= lro_loss_packets;
568                 c->next_seq = th_seq + data_length;
569                 ++rxq->lro.n_misorder;
570                 goto deliver_buf_out;
571         }
572         c->next_seq = th_seq + data_length;
573
574         now = ticks;
575         if (now - c->last_pkt_ticks > lro_idle_ticks) {
576                 ++rxq->lro.n_drop_idle;
577                 if (c->mbuf != NULL)
578                         sfxge_lro_deliver(&rxq->lro, c);
579                 sfxge_lro_drop(rxq, c);
580                 return (0);
581         }
582         c->last_pkt_ticks = ticks;
583
584         if (c->n_in_order_pkts < lro_slow_start_packets) {
585                 /* May be in slow-start, so don't merge. */
586                 ++rxq->lro.n_slow_start;
587                 ++c->n_in_order_pkts;
588                 goto deliver_buf_out;
589         }
590
591         if (__predict_false(dont_merge)) {
592                 if (c->mbuf != NULL)
593                         sfxge_lro_deliver(&rxq->lro, c);
594                 if (th->th_flags & (TH_FIN | TH_RST)) {
595                         ++rxq->lro.n_drop_closed;
596                         sfxge_lro_drop(rxq, c);
597                         return (0);
598                 }
599                 goto deliver_buf_out;
600         }
601
602         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
603
604         if (__predict_true(c->mbuf != NULL)) {
605                 /* Remove headers and any padding */
606                 rx_buf->mbuf->m_data += hdr_length;
607                 rx_buf->mbuf->m_len = data_length;
608
609                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
610         } else {
611                 /* Remove any padding */
612                 rx_buf->mbuf->m_len = pkt_length;
613
614                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
615         }
616
617         rx_buf->mbuf = NULL;
618         return (1);
619
620  deliver_buf_out:
621         sfxge_rx_deliver(rxq->sc, rx_buf);
622         return (1);
623 }
624
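/* Begin tracking a new connection for LRO, reusing a free connection
 * structure if one is available or allocating a new one otherwise.
 */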
625 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
626                                uint16_t l2_id, void *nh, struct tcphdr *th)
627 {
628         unsigned bucket = conn_hash & st->conns_mask;
629         struct sfxge_lro_conn *c;
630
631         if (st->conns_n[bucket] >= lro_chain_max) {
632                 ++st->n_too_many;
633                 return;
634         }
635
636         if (!TAILQ_EMPTY(&st->free_conns)) {
637                 c = TAILQ_FIRST(&st->free_conns);
638                 TAILQ_REMOVE(&st->free_conns, c, link);
639         } else {
640                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
641                 if (c == NULL)
642                         return;
643                 c->mbuf = NULL;
644                 c->next_buf.mbuf = NULL;
645         }
646
647         /* Create the connection tracking data */
648         ++st->conns_n[bucket];
649         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
650         c->l2_id = l2_id;
651         c->conn_hash = conn_hash;
652         c->source = th->th_sport;
653         c->dest = th->th_dport;
654         c->n_in_order_pkts = 0;
655         c->last_pkt_ticks = *(volatile int *)&ticks;
656         c->delivered = 0;
657         ++st->n_new_stream;
658         /* NB. We don't initialise c->next_seq, and it doesn't matter what
659          * value it has.  Most likely the next packet received for this
660          * connection will not match -- no harm done.
661          */
662 }
663
664 /* Process mbuf and decide whether to dispatch it to the stack now or
665  * later.
666  */
667 static void
668 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
669 {
670         struct sfxge_softc *sc = rxq->sc;
671         struct mbuf *m = rx_buf->mbuf;
672         struct ether_header *eh;
673         struct sfxge_lro_conn *c;
674         uint16_t l2_id;
675         uint16_t l3_proto;
676         void *nh;
677         struct tcphdr *th;
678         uint32_t conn_hash;
679         unsigned bucket;
680
681         /* Get the hardware hash */
682         conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
683                                             EFX_RX_HASHALG_TOEPLITZ,
684                                             mtod(m, uint8_t *));
685
686         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
687         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
688                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
689                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
690                         SFXGE_LRO_L2_ID_VLAN;
691                 l3_proto = veh->evl_proto;
692                 nh = veh + 1;
693         } else {
694                 l2_id = 0;
695                 l3_proto = eh->ether_type;
696                 nh = eh + 1;
697         }
698
699         /* Check whether this is a suitable packet (unfragmented
700          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
701          * length, and compute a hash if necessary.  If not, return.
702          */
703         if (l3_proto == htons(ETHERTYPE_IP)) {
704                 struct ip *iph = nh;
705
706                 KASSERT(iph->ip_p == IPPROTO_TCP,
707                     ("IPv4 protocol is not TCP, but packet marker is set"));
708                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
709                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
710                         goto deliver_now;
711                 th = (struct tcphdr *)(iph + 1);
712         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
713                 struct ip6_hdr *iph = nh;
714
715                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
716                     ("IPv6 next header is not TCP, but packet marker is set"));
717                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
718                 th = (struct tcphdr *)(iph + 1);
719         } else {
720                 goto deliver_now;
721         }
722
723         bucket = conn_hash & rxq->lro.conns_mask;
724
725         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
726                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
727                         continue;
728                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
729                         continue;
730                 if (c->mbuf != NULL) {
731                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
732                                 struct ip *c_iph, *iph = nh;
733                                 c_iph = c->nh;
734                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
735                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
736                                         continue;
737                         } else {
738                                 struct ip6_hdr *c_iph, *iph = nh;
739                                 c_iph = c->nh;
740                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
741                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
742                                         continue;
743                         }
744                 }
745
746                 /* Re-insert at head of list to reduce lookup time. */
747                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
748                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
749
750                 if (c->next_buf.mbuf != NULL) {
751                         if (!sfxge_lro_try_merge(rxq, c))
752                                 goto deliver_now;
753                 } else {
754                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
755                             active_link);
756                 }
757                 c->next_buf = *rx_buf;
758                 c->next_eh = eh;
759                 c->next_nh = nh;
760
761                 rx_buf->mbuf = NULL;
762                 rx_buf->flags = EFX_DISCARD;
763                 return;
764         }
765
766         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
767  deliver_now:
768         sfxge_rx_deliver(sc, rx_buf);
769 }
770
771 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
772 {
773         struct sfxge_lro_state *st = &rxq->lro;
774         struct sfxge_lro_conn *c;
775         unsigned t;
776
777         while (!LIST_EMPTY(&st->active_conns)) {
778                 c = LIST_FIRST(&st->active_conns);
779                 if (!c->delivered && c->mbuf != NULL)
780                         sfxge_lro_deliver(st, c);
781                 if (sfxge_lro_try_merge(rxq, c)) {
782                         if (c->mbuf != NULL)
783                                 sfxge_lro_deliver(st, c);
784                         LIST_REMOVE(c, active_link);
785                 }
786                 c->delivered = 0;
787         }
788
789         t = *(volatile int *)&ticks;
790         if (__predict_false(t != st->last_purge_ticks))
791                 sfxge_lro_purge_idle(rxq, t);
792 }
793
794 #else   /* !SFXGE_LRO */
795
796 static void
797 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
798 {
799 }
800
801 static void
802 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
803 {
804 }
805
806 #endif  /* SFXGE_LRO */
807
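/* Process completed receive descriptors: drop discards and loopback
 * packets, pass each remaining packet to LRO or directly to the stack,
 * and top up the ring if it has drained below the refill threshold.
 */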
808 void
809 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
810 {
811         struct sfxge_softc *sc = rxq->sc;
812         int if_capenable = sc->ifnet->if_capenable;
813         int lro_enabled = if_capenable & IFCAP_LRO;
814         unsigned int index;
815         struct sfxge_evq *evq;
816         unsigned int completed;
817         unsigned int level;
818         struct mbuf *m;
819         struct sfxge_rx_sw_desc *prev = NULL;
820
821         index = rxq->index;
822         evq = sc->evq[index];
823
824         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
825
826         completed = rxq->completed;
827         while (completed != rxq->pending) {
828                 unsigned int id;
829                 struct sfxge_rx_sw_desc *rx_desc;
830
831                 id = completed++ & rxq->ptr_mask;
832                 rx_desc = &rxq->queue[id];
833                 m = rx_desc->mbuf;
834
835                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
836                         goto discard;
837
838                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
839                         goto discard;
840
841                 /* Read the length from the pseudo header if required */
842                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
843                         uint16_t tmp_size;
844                         int rc;
845                         rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
846                                                            mtod(m, uint8_t *),
847                                                            &tmp_size);
848                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
849                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
850                 }
851
852                 prefetch_read_many(mtod(m, caddr_t));
853
854                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
855                 case EFX_PKT_IPV4:
856                         if (~if_capenable & IFCAP_RXCSUM)
857                                 rx_desc->flags &=
858                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
859                         break;
860                 case EFX_PKT_IPV6:
861                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
862                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
863                         break;
864                 case 0:
865                         /* Check for loopback packets */
866                         {
867                                 struct ether_header *etherhp;
868
869                                 /*LINTED*/
870                                 etherhp = mtod(m, struct ether_header *);
871
872                                 if (etherhp->ether_type ==
873                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
874                                         EFSYS_PROBE(loopback);
875
876                                         rxq->loopback++;
877                                         goto discard;
878                                 }
879                         }
880                         break;
881                 default:
882                         KASSERT(B_FALSE,
883                             ("Rx descriptor with both IPv4 and IPv6 flags"));
884                         goto discard;
885                 }
886
887                 /* Pass packet up the stack or into LRO (pipelined) */
888                 if (prev != NULL) {
889                         if (lro_enabled &&
890                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
891                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
892                                 sfxge_lro(rxq, prev);
893                         else
894                                 sfxge_rx_deliver(sc, prev);
895                 }
896                 prev = rx_desc;
897                 continue;
898
899 discard:
900                 /* Return the packet to the pool */
901                 m_free(m);
902                 rx_desc->mbuf = NULL;
903         }
904         rxq->completed = completed;
905
906         level = rxq->added - rxq->completed;
907
908         /* Pass last packet up the stack or into LRO */
909         if (prev != NULL) {
910                 if (lro_enabled &&
911                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
912                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
913                         sfxge_lro(rxq, prev);
914                 else
915                         sfxge_rx_deliver(sc, prev);
916         }
917
918         /*
919          * If there are any pending flows and this is the end of the
920          * poll then they must be completed.
921          */
922         if (eop)
923                 sfxge_lro_end_of_burst(rxq);
924
925         /* Top up the queue if necessary */
926         if (level < rxq->refill_threshold)
927                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
928 }
929
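/* Stop a receive queue: flush it (retrying a limited number of times on
 * timeout), drain outstanding completions and release the common code
 * queue and buffer table entries.
 */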
930 static void
931 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
932 {
933         struct sfxge_rxq *rxq;
934         struct sfxge_evq *evq;
935         unsigned int count;
936         unsigned int retry = 3;
937
938         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
939
940         rxq = sc->rxq[index];
941         evq = sc->evq[index];
942
943         SFXGE_EVQ_LOCK(evq);
944
945         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
946             ("rxq not started"));
947
948         rxq->init_state = SFXGE_RXQ_INITIALIZED;
949
950         callout_stop(&rxq->refill_callout);
951
952         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
953                 rxq->flush_state = SFXGE_FLUSH_PENDING;
954
955                 SFXGE_EVQ_UNLOCK(evq);
956
957                 /* Flush the receive queue */
958                 if (efx_rx_qflush(rxq->common) != 0) {
959                         SFXGE_EVQ_LOCK(evq);
960                         rxq->flush_state = SFXGE_FLUSH_FAILED;
961                         break;
962                 }
963
964                 count = 0;
965                 do {
966                         /* Spin for 100 ms */
967                         DELAY(100000);
968
969                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
970                                 break;
971
972                 } while (++count < 20);
973
974                 SFXGE_EVQ_LOCK(evq);
975
976                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
977                         /* Flush timeout - neither done nor failed */
978                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
979                             device_get_nameunit(sc->dev), index);
980                         rxq->flush_state = SFXGE_FLUSH_DONE;
981                 }
982                 retry--;
983         }
984         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
985                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
986                     device_get_nameunit(sc->dev), index);
987                 rxq->flush_state = SFXGE_FLUSH_DONE;
988         }
989
990         rxq->pending = rxq->added;
991         sfxge_rx_qcomplete(rxq, B_TRUE);
992
993         KASSERT(rxq->completed == rxq->pending,
994             ("rxq->completed != rxq->pending"));
995
996         rxq->added = 0;
997         rxq->pushed = 0;
998         rxq->pending = 0;
999         rxq->completed = 0;
1000         rxq->loopback = 0;
1001
1002         /* Destroy the common code receive queue. */
1003         efx_rx_qdestroy(rxq->common);
1004
1005         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1006             EFX_RXQ_NBUFS(sc->rxq_entries));
1007
1008         SFXGE_EVQ_UNLOCK(evq);
1009 }
1010
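/* Start a receive queue: program the buffer table, create and enable the
 * common code queue, and fill it with receive buffers.
 */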
1011 static int
1012 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1013 {
1014         struct sfxge_rxq *rxq;
1015         efsys_mem_t *esmp;
1016         struct sfxge_evq *evq;
1017         int rc;
1018
1019         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1020
1021         rxq = sc->rxq[index];
1022         esmp = &rxq->mem;
1023         evq = sc->evq[index];
1024
1025         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1026             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1027         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1028             ("evq->init_state != SFXGE_EVQ_STARTED"));
1029
1030         /* Program the buffer table. */
1031         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1032             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1033                 return (rc);
1034
1035         /* Create the common code receive queue. */
1036         if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1037             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1038             &rxq->common)) != 0)
1039                 goto fail;
1040
1041         SFXGE_EVQ_LOCK(evq);
1042
1043         /* Enable the receive queue. */
1044         efx_rx_qenable(rxq->common);
1045
1046         rxq->init_state = SFXGE_RXQ_STARTED;
1047         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1048
1049         /* Try to fill the queue from the pool. */
1050         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1051
1052         SFXGE_EVQ_UNLOCK(evq);
1053
1054         return (0);
1055
1056 fail:
1057         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1058             EFX_RXQ_NBUFS(sc->rxq_entries));
1059         return (rc);
1060 }
1061
1062 void
1063 sfxge_rx_stop(struct sfxge_softc *sc)
1064 {
1065         int index;
1066
1067         efx_mac_filter_default_rxq_clear(sc->enp);
1068
1069         /* Stop the receive queue(s) */
1070         index = sc->rxq_count;
1071         while (--index >= 0)
1072                 sfxge_rx_qstop(sc, index);
1073
1074         sc->rx_prefix_size = 0;
1075         sc->rx_buffer_size = 0;
1076
1077         efx_rx_fini(sc->enp);
1078 }
1079
1080 int
1081 sfxge_rx_start(struct sfxge_softc *sc)
1082 {
1083         struct sfxge_intr *intr;
1084         const efx_nic_cfg_t *encp;
1085         size_t hdrlen, align, reserved;
1086         int index;
1087         int rc;
1088
1089         intr = &sc->intr;
1090
1091         /* Initialize the common code receive module. */
1092         if ((rc = efx_rx_init(sc->enp)) != 0)
1093                 return (rc);
1094
1095         encp = efx_nic_cfg_get(sc->enp);
1096         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1097
1098         /* Calculate the receive packet buffer size. */
1099         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1100
1101         /* Ensure IP headers are 32-bit aligned */
1102         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1103         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1104
1105         sc->rx_buffer_size += sc->rx_buffer_align;
1106
1107         /* Align end of packet buffer for RX DMA end padding */
1108         align = MAX(1, encp->enc_rx_buf_align_end);
1109         EFSYS_ASSERT(ISP2(align));
1110         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1111
1112         /*
1113          * Standard mbuf zones only guarantee pointer-size alignment;
1114          * we need extra space to align to the cache line
1115          */
1116         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1117
1118         /* Select zone for packet buffers */
1119         if (reserved <= MCLBYTES)
1120                 sc->rx_cluster_size = MCLBYTES;
1121         else if (reserved <= MJUMPAGESIZE)
1122                 sc->rx_cluster_size = MJUMPAGESIZE;
1123         else if (reserved <= MJUM9BYTES)
1124                 sc->rx_cluster_size = MJUM9BYTES;
1125         else
1126                 sc->rx_cluster_size = MJUM16BYTES;
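        /* Worked example (a sketch, not tied to a particular board): with a
         * standard 1500-byte MTU the buffer size plus CACHE_LINE_SIZE of
         * headroom typically fits in an MCLBYTES (2 KB) cluster, while a
         * 9000-byte jumbo MTU typically lands in the MJUM9BYTES zone.  The
         * exact thresholds depend on enc_rx_prefix_size and
         * enc_rx_buf_align_end reported for the NIC.
         */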
1127
1128         /*
1129          * Set up the scale table.  Enable all hash types and hash insertion.
1130          */
1131         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1132                 sc->rx_indir_table[index] = index % sc->rxq_count;
1133         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1134                                        SFXGE_RX_SCALE_MAX)) != 0)
1135                 goto fail;
1136         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1137             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1138             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1139
1140 #ifdef RSS
1141         rss_getkey(toep_key);
1142 #endif
1143         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1144                                        sizeof(toep_key))) != 0)
1145                 goto fail;
1146
1147         /* Start the receive queue(s). */
1148         for (index = 0; index < sc->rxq_count; index++) {
1149                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1150                         goto fail2;
1151         }
1152
1153         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1154                                             sc->intr.n_alloc > 1);
1155         if (rc != 0)
1156                 goto fail3;
1157
1158         return (0);
1159
1160 fail3:
1161 fail2:
1162         while (--index >= 0)
1163                 sfxge_rx_qstop(sc, index);
1164
1165 fail:
1166         efx_rx_fini(sc->enp);
1167
1168         return (rc);
1169 }
1170
1171 #ifdef SFXGE_LRO
1172
1173 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1174 {
1175         struct sfxge_lro_state *st = &rxq->lro;
1176         unsigned i;
1177
1178         st->conns_mask = lro_table_size - 1;
1179         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1180                 ("lro_table_size must be a power of 2"));
1181         st->sc = rxq->sc;
1182         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1183                            M_SFXGE, M_WAITOK);
1184         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1185                              M_SFXGE, M_WAITOK);
1186         for (i = 0; i <= st->conns_mask; ++i) {
1187                 TAILQ_INIT(&st->conns[i]);
1188                 st->conns_n[i] = 0;
1189         }
1190         LIST_INIT(&st->active_conns);
1191         TAILQ_INIT(&st->free_conns);
1192 }
1193
1194 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1195 {
1196         struct sfxge_lro_state *st = &rxq->lro;
1197         struct sfxge_lro_conn *c;
1198         unsigned i;
1199
1200         /* Return cleanly if sfxge_lro_init() has not been called. */
1201         if (st->conns == NULL)
1202                 return;
1203
1204         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1205
1206         for (i = 0; i <= st->conns_mask; ++i) {
1207                 while (!TAILQ_EMPTY(&st->conns[i])) {
1208                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1209                         sfxge_lro_drop(rxq, c);
1210                 }
1211         }
1212
1213         while (!TAILQ_EMPTY(&st->free_conns)) {
1214                 c = TAILQ_FIRST(&st->free_conns);
1215                 TAILQ_REMOVE(&st->free_conns, c, link);
1216                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1217                 free(c, M_SFXGE);
1218         }
1219
1220         free(st->conns_n, M_SFXGE);
1221         free(st->conns, M_SFXGE);
1222         st->conns = NULL;
1223 }
1224
1225 #else
1226
1227 static void
1228 sfxge_lro_init(struct sfxge_rxq *rxq)
1229 {
1230 }
1231
1232 static void
1233 sfxge_lro_fini(struct sfxge_rxq *rxq)
1234 {
1235 }
1236
1237 #endif  /* SFXGE_LRO */
1238
1239 static void
1240 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1241 {
1242         struct sfxge_rxq *rxq;
1243
1244         rxq = sc->rxq[index];
1245
1246         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1247             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1248
1249         /* Free the context array and the flow table. */
1250         free(rxq->queue, M_SFXGE);
1251         sfxge_lro_fini(rxq);
1252
1253         /* Release DMA memory. */
1254         sfxge_dma_free(&rxq->mem);
1255
1256         sc->rxq[index] = NULL;
1257
1258         free(rxq, M_SFXGE);
1259 }
1260
1261 static int
1262 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1263 {
1264         struct sfxge_rxq *rxq;
1265         struct sfxge_evq *evq;
1266         efsys_mem_t *esmp;
1267         int rc;
1268
1269         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1270
1271         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1272         rxq->sc = sc;
1273         rxq->index = index;
1274         rxq->entries = sc->rxq_entries;
1275         rxq->ptr_mask = rxq->entries - 1;
1276         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1277
1278         sc->rxq[index] = rxq;
1279         esmp = &rxq->mem;
1280
1281         evq = sc->evq[index];
1282
1283         /* Allocate and zero DMA space. */
1284         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1285                 return (rc);
1286
1287         /* Allocate buffer table entries. */
1288         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1289                                  &rxq->buf_base_id);
1290
1291         /* Allocate the context array and the flow table. */
1292         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1293             M_SFXGE, M_WAITOK | M_ZERO);
1294         sfxge_lro_init(rxq);
1295
1296         callout_init(&rxq->refill_callout, 1);
1297
1298         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1299
1300         return (0);
1301 }
1302
1303 static const struct {
1304         const char *name;
1305         size_t offset;
1306 } sfxge_rx_stats[] = {
1307 #define SFXGE_RX_STAT(name, member) \
1308         { #name, offsetof(struct sfxge_rxq, member) }
1309 #ifdef SFXGE_LRO
1310         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1311         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1312         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1313         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1314         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1315         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1316         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1317         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1318 #endif
1319 };
1320
1321 static int
1322 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1323 {
1324         struct sfxge_softc *sc = arg1;
1325         unsigned int id = arg2;
1326         unsigned int sum, index;
1327
1328         /* Sum across all RX queues */
1329         sum = 0;
1330         for (index = 0; index < sc->rxq_count; index++)
1331                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1332                                          sfxge_rx_stats[id].offset);
1333
1334         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1335 }
1336
1337 static void
1338 sfxge_rx_stat_init(struct sfxge_softc *sc)
1339 {
1340         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1341         struct sysctl_oid_list *stat_list;
1342         unsigned int id;
1343
1344         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1345
1346         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1347                 SYSCTL_ADD_PROC(
1348                         ctx, stat_list,
1349                         OID_AUTO, sfxge_rx_stats[id].name,
1350                         CTLTYPE_UINT|CTLFLAG_RD,
1351                         sc, id, sfxge_rx_stat_handler, "IU",
1352                         "");
1353         }
1354 }
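/* Example (hypothetical node path, depending on where stats_node is
 * attached): the aggregated counters registered above would be read with
 * sysctl(8), e.g. "sysctl dev.sfxge.0.stats.lro_merges".
 */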
1355
1356 void
1357 sfxge_rx_fini(struct sfxge_softc *sc)
1358 {
1359         int index;
1360
1361         index = sc->rxq_count;
1362         while (--index >= 0)
1363                 sfxge_rx_qfini(sc, index);
1364
1365         sc->rxq_count = 0;
1366 }
1367
1368 int
1369 sfxge_rx_init(struct sfxge_softc *sc)
1370 {
1371         struct sfxge_intr *intr;
1372         int index;
1373         int rc;
1374
1375 #ifdef SFXGE_LRO
1376         if (!ISP2(lro_table_size)) {
1377                 log(LOG_ERR, "%s=%u must be a power of 2",
1378                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1379                 rc = EINVAL;
1380                 goto fail_lro_table_size;
1381         }
1382
1383         if (lro_idle_ticks == 0)
1384                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1385 #endif
1386
1387         intr = &sc->intr;
1388
1389         sc->rxq_count = intr->n_alloc;
1390
1391         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1392             ("intr->state != SFXGE_INTR_INITIALIZED"));
1393
1394         /* Initialize the receive queue(s) - one per interrupt. */
1395         for (index = 0; index < sc->rxq_count; index++) {
1396                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1397                         goto fail;
1398         }
1399
1400         sfxge_rx_stat_init(sc);
1401
1402         return (0);
1403
1404 fail:
1405         /* Tear down the receive queue(s). */
1406         while (--index >= 0)
1407                 sfxge_rx_qfini(sc, index);
1408
1409         sc->rxq_count = 0;
1410
1411 #ifdef SFXGE_LRO
1412 fail_lro_table_size:
1413 #endif
1414         return (rc);
1415 }