There is no need to use an array any more. No functional change.
[FreeBSD/FreeBSD.git] / sys / dev / sfxge / sfxge_rx.c
1 /*-
2  * Copyright (c) 2010-2015 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/param.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/smp.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
43 #include <sys/syslog.h>
44 #include <sys/limits.h>
46
47 #include <net/ethernet.h>
48 #include <net/if.h>
49 #include <net/if_vlan_var.h>
50
51 #include <netinet/in.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip6.h>
54 #include <netinet/tcp.h>
55
56 #include <machine/in_cksum.h>
57
58 #include "common/efx.h"
59
61 #include "sfxge.h"
62 #include "sfxge_rx.h"
63
64 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
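/*
 * Worked example (the exact value of EFX_RXQ_LIMIT() is an assumption
 * here): if EFX_RXQ_LIMIT(1024) reserved 16 descriptors and so evaluated
 * to 1008, then
 *
 *      RX_REFILL_THRESHOLD(1024) == 1008 * 9 / 10 == 907
 *
 * i.e. the ring is topped up once its fill level drops below roughly 90%
 * of the usable descriptors (see the level check in sfxge_rx_qcomplete()).
 */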
65
66 #ifdef SFXGE_LRO
67
68 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
69             "Large receive offload (LRO) parameters");
70
71 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
72
73 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
74  * means we can accelerate a larger number of streams.
75  */
76 static unsigned lro_table_size = 128;
77 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
78 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
79             &lro_table_size, 0,
80             "Size of the LRO hash table (must be a power of 2)");
81
82 /* Maximum length of a hash chain.  If chains get too long then the lookup
83  * time increases and may exceed the benefit of LRO.
84  */
85 static unsigned lro_chain_max = 20;
86 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
87 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
88             &lro_chain_max, 0,
89             "The maximum length of a hash chain");
90
91 /* Maximum time (in ticks) that a connection can be idle before its LRO
92  * state is discarded.
93  */
94 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
95 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
96 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
97             &lro_idle_ticks, 0,
98             "The maximum time (in ticks) that a connection can be idle "
99             "before its LRO state is discarded");
100
101 /* Number of packets with payload that must arrive in-order before a
102  * connection is eligible for LRO.  The idea is we should avoid coalescing
103  * segments when the sender is in slow-start because reducing the ACK rate
104  * can damage performance.
105  */
106 static int lro_slow_start_packets = 2000;
107 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
108 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
109             &lro_slow_start_packets, 0,
110             "Number of packets with payload that must arrive in-order before "
111             "a connection is eligible for LRO");
112
113 /* Number of packets with payload that must arrive in-order following loss
114  * before a connection is eligible for LRO.  The idea is we should avoid
115  * coalescing segments when the sender is recovering from loss, because
116  * reducing the ACK rate can damage performance.
117  */
118 static int lro_loss_packets = 20;
119 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
120 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
121             &lro_loss_packets, 0,
122             "Number of packets with payload that must arrive in-order "
123             "following loss before a connection is eligible for LRO");
124
125 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
126 #define SFXGE_LRO_L2_ID_VLAN 0x4000
127 #define SFXGE_LRO_L2_ID_IPV6 0x8000
128 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
129 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
130
131 /* Compare IPv6 addresses, avoiding conditional branches */
132 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
133                                    const struct in6_addr *right)
134 {
135 #if LONG_BIT == 64
136         const uint64_t *left64 = (const uint64_t *)left;
137         const uint64_t *right64 = (const uint64_t *)right;
138         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
139 #else
140         return (left->s6_addr32[0] - right->s6_addr32[0]) |
141                (left->s6_addr32[1] - right->s6_addr32[1]) |
142                (left->s6_addr32[2] - right->s6_addr32[2]) |
143                (left->s6_addr32[3] - right->s6_addr32[3]);
144 #endif
145 }
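/*
 * Note on the comparison above: rather than comparing each word and
 * branching, the word-wise differences are OR-ed together.  For unsigned
 * words a - b is zero exactly when a == b, so the result is non-zero if
 * and only if some word differs.  The connection lookup in sfxge_lro()
 * uses the same subtract-and-OR idiom.
 */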
146
147 #endif  /* SFXGE_LRO */
148
149 void
150 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
151 {
152
153         rxq->flush_state = SFXGE_FLUSH_DONE;
154 }
155
156 void
157 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
158 {
159
160         rxq->flush_state = SFXGE_FLUSH_FAILED;
161 }
162
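/*
 * 40-byte Toeplitz hash key used for receive-side scaling.  This appears
 * to be the widely published Microsoft sample RSS key; any well-mixed
 * 40-byte key would do, the point being a good spread of flows across
 * receive queues.
 */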
163 static uint8_t toep_key[] = {
164         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
165         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
166         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
167         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
168         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
169 };
170
171 static void
172 sfxge_rx_post_refill(void *arg)
173 {
174         struct sfxge_rxq *rxq = arg;
175         struct sfxge_softc *sc;
176         unsigned int index;
177         struct sfxge_evq *evq;
178         uint16_t magic;
179
180         sc = rxq->sc;
181         index = rxq->index;
182         evq = sc->evq[index];
183
184         magic = SFXGE_MAGIC_RX_QREFILL | index;
185
186         /* This is guaranteed due to the start/stop order of rx and ev */
187         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
188             ("evq not started"));
189         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
190             ("rxq not started"));
191         efx_ev_qpost(evq->common, magic);
192 }
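/*
 * Note (descriptive): rather than refilling directly from the callout
 * context, sfxge_rx_post_refill() posts a software "magic" event that
 * encodes the queue index, so the actual refill runs later in the normal
 * event-processing path while the event queue lock is held.
 */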
193
194 static void
195 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
196 {
197         /* Initially retry after 100 ms, but back off in case of
198          * repeated failures as we probably have to wait for the
199          * administrator to raise the pool limit. */
200         if (retrying)
201                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
202         else
203                 rxq->refill_delay = hz / 10;
204
205         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
206                              sfxge_rx_post_refill, rxq);
207 }
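/*
 * Backoff illustration (shown for hz == 1000): the first retry fires after
 * hz / 10 == 100 ticks (100 ms) and each subsequent failure doubles the
 * delay -- 200 ms, 400 ms, 800 ms, ... -- until it is clamped at 10 * hz
 * (10 seconds).
 */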
208
209 #define SFXGE_REFILL_BATCH  64
210
211 static void
212 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
213 {
214         struct sfxge_softc *sc;
215         unsigned int index;
216         struct sfxge_evq *evq;
217         unsigned int batch;
218         unsigned int rxfill;
219         unsigned int mblksize;
220         int ntodo;
221         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
222
223         sc = rxq->sc;
224         index = rxq->index;
225         evq = sc->evq[index];
226
227         prefetch_read_many(sc->enp);
228         prefetch_read_many(rxq->common);
229
230         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
231
232         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
233                 return;
234
235         rxfill = rxq->added - rxq->completed;
236         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
237             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
238         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
239         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
240             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
241
242         if (ntodo == 0)
243                 return;
244
245         batch = 0;
246         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
247         while (ntodo-- > 0) {
248                 unsigned int id;
249                 struct sfxge_rx_sw_desc *rx_desc;
250                 bus_dma_segment_t seg;
251                 struct mbuf *m;
252
253                 id = (rxq->added + batch) & rxq->ptr_mask;
254                 rx_desc = &rxq->queue[id];
255                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
256
257                 rx_desc->flags = EFX_DISCARD;
258                 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
259                     sc->rx_cluster_size);
260                 if (m == NULL)
261                         break;
262
263                 /* m_len specifies length of area to be mapped for DMA */
264                 m->m_len  = mblksize;
265                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
266                 m->m_data += sc->rx_buffer_align;
267
268                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
269                 addr[batch++] = seg.ds_addr;
270
271                 if (batch == SFXGE_REFILL_BATCH) {
272                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
273                             rxq->completed, rxq->added);
274                         rxq->added += batch;
275                         batch = 0;
276                 }
277         }
278
279         if (ntodo != 0)
280                 sfxge_rx_schedule_refill(rxq, retrying);
281
282         if (batch != 0) {
283                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
284                     rxq->completed, rxq->added);
285                 rxq->added += batch;
286         }
287
288         /* Make the descriptors visible to the hardware */
289         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
290                         BUS_DMASYNC_PREWRITE);
291
292         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
293
294         /* The queue could still be empty if no descriptors were actually
295          * pushed, in which case there will be no event to cause the next
296          * refill, so we must schedule a refill ourselves.
297          */
298         if (rxq->pushed == rxq->completed) {
299                 sfxge_rx_schedule_refill(rxq, retrying);
300         }
301 }
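/*
 * Descriptor indexing in sfxge_rx_qfill() (illustrative sketch): the ring
 * size is a power of two, so "& rxq->ptr_mask" implements the wrap-around.
 * For a hypothetical 512-entry ring (ptr_mask == 511),
 *
 *      id = (rxq->added + batch) & 511;
 *
 * maps added + batch values 510, 511, 512, 513 to ids 510, 511, 0, 1.
 * Buffers are handed to the hardware in batches of SFXGE_REFILL_BATCH to
 * amortise the cost of efx_rx_qpost().
 */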
302
303 void
304 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
305 {
306
307         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
308                 return;
309
310         /* Make sure the queue is full */
311         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
312 }
313
314 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
315 {
316         struct ifnet *ifp = sc->ifnet;
317
318         m->m_pkthdr.rcvif = ifp;
319         m->m_pkthdr.csum_data = 0xffff;
320         ifp->if_input(ifp, m);
321 }
322
323 static void
324 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
325 {
326         struct mbuf *m = rx_desc->mbuf;
327         int flags = rx_desc->flags;
328         int csum_flags;
329
330         /* Convert checksum flags */
331         csum_flags = (flags & EFX_CKSUM_IPV4) ?
332                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
333         if (flags & EFX_CKSUM_TCPUDP)
334                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
335
336         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
337                 m->m_pkthdr.flowid =
338                         efx_psuedo_hdr_hash_get(sc->enp,
339                                                 EFX_RX_HASHALG_TOEPLITZ,
340                                                 mtod(m, uint8_t *));
341                 /* The hash covers a 4-tuple for TCP only */
342                 M_HASHTYPE_SET(m,
343                     (flags & EFX_PKT_IPV4) ?
344                         ((flags & EFX_PKT_TCP) ?
345                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
346                         ((flags & EFX_PKT_TCP) ?
347                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
348         }
349         m->m_data += sc->rx_prefix_size;
350         m->m_len = rx_desc->size - sc->rx_prefix_size;
351         m->m_pkthdr.len = m->m_len;
352         m->m_pkthdr.csum_flags = csum_flags;
353         __sfxge_rx_deliver(sc, rx_desc->mbuf);
354
355         rx_desc->flags = EFX_DISCARD;
356         rx_desc->mbuf = NULL;
357 }
358
359 #ifdef SFXGE_LRO
360
361 static void
362 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
363 {
364         struct sfxge_softc *sc = st->sc;
365         struct mbuf *m = c->mbuf;
366         struct tcphdr *c_th;
367         int csum_flags;
368
369         KASSERT(m, ("no mbuf to deliver"));
370
371         ++st->n_bursts;
372
373         /* Finish off packet munging and recalculate IP header checksum. */
374         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
375                 struct ip *iph = c->nh;
376                 iph->ip_len = htons(iph->ip_len);
377                 iph->ip_sum = 0;
378                 iph->ip_sum = in_cksum_hdr(iph);
379                 c_th = (struct tcphdr *)(iph + 1);
380                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
381                               CSUM_IP_CHECKED | CSUM_IP_VALID);
382         } else {
383                 struct ip6_hdr *iph = c->nh;
384                 iph->ip6_plen = htons(iph->ip6_plen);
385                 c_th = (struct tcphdr *)(iph + 1);
386                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
387         }
388
389         c_th->th_win = c->th_last->th_win;
390         c_th->th_ack = c->th_last->th_ack;
391         if (c_th->th_off == c->th_last->th_off) {
392                 /* Copy TCP options (take care to avoid going negative). */
393                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
394                 memcpy(c_th + 1, c->th_last + 1, optlen);
395         }
396
397         m->m_pkthdr.flowid = c->conn_hash;
398         M_HASHTYPE_SET(m,
399             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
400                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
401
402         m->m_pkthdr.csum_flags = csum_flags;
403         __sfxge_rx_deliver(sc, m);
404
405         c->mbuf = NULL;
406         c->delivered = 1;
407 }
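/*
 * Worked example for the option copy above: a TCP header carrying only the
 * timestamp option has th_off == 8 (8 * 4 == 32 bytes of header), so
 * optlen == ((8 - 5) & 0xf) << 2 == 12, and the 12 option bytes from the
 * most recent segment are copied over the coalesced header so the peer
 * sees an up-to-date timestamp echo.
 */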
408
409 /* Drop the given connection, and add it to the free list. */
410 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
411 {
412         unsigned bucket;
413
414         KASSERT(!c->mbuf, ("found orphaned mbuf"));
415
416         if (c->next_buf.mbuf != NULL) {
417                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
418                 LIST_REMOVE(c, active_link);
419         }
420
421         bucket = c->conn_hash & rxq->lro.conns_mask;
422         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
423         --rxq->lro.conns_n[bucket];
424         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
425         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
426 }
427
428 /* Stop tracking connections that have gone idle in order to keep hash
429  * chains short.
430  */
431 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
432 {
433         struct sfxge_lro_conn *c;
434         unsigned i;
435
436         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
437                 ("found active connections"));
438
439         rxq->lro.last_purge_ticks = now;
440         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
441                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
442                         continue;
443
444                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
445                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
446                         ++rxq->lro.n_drop_idle;
447                         sfxge_lro_drop(rxq, c);
448                 }
449         }
450 }
451
452 static void
453 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
454                 struct mbuf *mbuf, struct tcphdr *th)
455 {
456         struct tcphdr *c_th;
457
458         /* Tack the new mbuf onto the chain. */
459         KASSERT(!mbuf->m_next, ("mbuf already chained"));
460         c->mbuf_tail->m_next = mbuf;
461         c->mbuf_tail = mbuf;
462
463         /* Increase length appropriately */
464         c->mbuf->m_pkthdr.len += mbuf->m_len;
465
466         /* Update the connection state flags */
467         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
468                 struct ip *iph = c->nh;
469                 iph->ip_len += mbuf->m_len;
470                 c_th = (struct tcphdr *)(iph + 1);
471         } else {
472                 struct ip6_hdr *iph = c->nh;
473                 iph->ip6_plen += mbuf->m_len;
474                 c_th = (struct tcphdr *)(iph + 1);
475         }
476         c_th->th_flags |= (th->th_flags & TH_PUSH);
477         c->th_last = th;
478         ++st->n_merges;
479
480         /* Pass packet up now if another segment could overflow the IP
481          * length.
482          */
483         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
484                 sfxge_lro_deliver(st, c);
485 }
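/*
 * Rationale for the 65536 - 9200 threshold above (descriptive): the IPv4
 * total-length and IPv6 payload-length fields are 16 bits wide, so a
 * coalesced packet must stay below 65536 bytes; 9200 serves as a
 * conservative bound on the size of one further merge candidate (roughly a
 * jumbo-frame segment), so the burst is delivered before the next merge
 * could overflow the length field.
 */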
486
487 static void
488 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
489                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
490 {
491         /* Start the chain */
492         c->mbuf = mbuf;
493         c->mbuf_tail = c->mbuf;
494         c->nh = nh;
495         c->th_last = th;
496
497         mbuf->m_pkthdr.len = mbuf->m_len;
498
499         /* Mangle header fields for later processing */
500         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
501                 struct ip *iph = nh;
502                 iph->ip_len = ntohs(iph->ip_len);
503         } else {
504                 struct ip6_hdr *iph = nh;
505                 iph->ip6_plen = ntohs(iph->ip6_plen);
506         }
507 }
508
509 /* Try to merge or otherwise hold or deliver (as appropriate) the
510  * packet buffered for this connection (c->next_buf).  Return a flag
511  * indicating whether the connection is still active for LRO purposes.
512  */
513 static int
514 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
515 {
516         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
517         char *eh = c->next_eh;
518         int data_length, hdr_length, dont_merge;
519         unsigned th_seq, pkt_length;
520         struct tcphdr *th;
521         unsigned now;
522
523         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
524                 struct ip *iph = c->next_nh;
525                 th = (struct tcphdr *)(iph + 1);
526                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
527         } else {
528                 struct ip6_hdr *iph = c->next_nh;
529                 th = (struct tcphdr *)(iph + 1);
530                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
531         }
532
533         hdr_length = (char *) th + th->th_off * 4 - eh;
534         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
535                        hdr_length);
536         th_seq = ntohl(th->th_seq);
537         dont_merge = ((data_length <= 0)
538                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
539
540         /* Check for options other than aligned timestamp. */
541         if (th->th_off != 5) {
542                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
543                 if (th->th_off == 8 &&
544                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
545                                         (TCPOPT_NOP << 16) |
546                                         (TCPOPT_TIMESTAMP << 8) |
547                                         TCPOLEN_TIMESTAMP)) {
548                         /* timestamp option -- okay */
549                 } else {
550                         dont_merge = 1;
551                 }
552         }
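        /*
         * Note (illustrative): the single-word compare above matches the
         * common "aligned timestamp" option layout -- NOP, NOP, TIMESTAMP
         * (kind 8), length 10 -- i.e. option bytes 01 01 08 0a, which is
         * what most stacks emit on an established connection.  Any other
         * option layout simply disables merging for this segment.
         */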
553
554         if (__predict_false(th_seq != c->next_seq)) {
555                 /* Out-of-order, so start counting again. */
556                 if (c->mbuf != NULL)
557                         sfxge_lro_deliver(&rxq->lro, c);
558                 c->n_in_order_pkts -= lro_loss_packets;
559                 c->next_seq = th_seq + data_length;
560                 ++rxq->lro.n_misorder;
561                 goto deliver_buf_out;
562         }
563         c->next_seq = th_seq + data_length;
564
565         now = ticks;
566         if (now - c->last_pkt_ticks > lro_idle_ticks) {
567                 ++rxq->lro.n_drop_idle;
568                 if (c->mbuf != NULL)
569                         sfxge_lro_deliver(&rxq->lro, c);
570                 sfxge_lro_drop(rxq, c);
571                 return (0);
572         }
573         c->last_pkt_ticks = ticks;
574
575         if (c->n_in_order_pkts < lro_slow_start_packets) {
576                 /* May be in slow-start, so don't merge. */
577                 ++rxq->lro.n_slow_start;
578                 ++c->n_in_order_pkts;
579                 goto deliver_buf_out;
580         }
581
582         if (__predict_false(dont_merge)) {
583                 if (c->mbuf != NULL)
584                         sfxge_lro_deliver(&rxq->lro, c);
585                 if (th->th_flags & (TH_FIN | TH_RST)) {
586                         ++rxq->lro.n_drop_closed;
587                         sfxge_lro_drop(rxq, c);
588                         return (0);
589                 }
590                 goto deliver_buf_out;
591         }
592
593         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
594
595         if (__predict_true(c->mbuf != NULL)) {
596                 /* Remove headers and any padding */
597                 rx_buf->mbuf->m_data += hdr_length;
598                 rx_buf->mbuf->m_len = data_length;
599
600                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
601         } else {
602                 /* Remove any padding */
603                 rx_buf->mbuf->m_len = pkt_length;
604
605                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
606         }
607
608         rx_buf->mbuf = NULL;
609         return (1);
610
611  deliver_buf_out:
612         sfxge_rx_deliver(rxq->sc, rx_buf);
613         return (1);
614 }
615
616 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
617                                uint16_t l2_id, void *nh, struct tcphdr *th)
618 {
619         unsigned bucket = conn_hash & st->conns_mask;
620         struct sfxge_lro_conn *c;
621
622         if (st->conns_n[bucket] >= lro_chain_max) {
623                 ++st->n_too_many;
624                 return;
625         }
626
627         if (!TAILQ_EMPTY(&st->free_conns)) {
628                 c = TAILQ_FIRST(&st->free_conns);
629                 TAILQ_REMOVE(&st->free_conns, c, link);
630         } else {
631                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
632                 if (c == NULL)
633                         return;
634                 c->mbuf = NULL;
635                 c->next_buf.mbuf = NULL;
636         }
637
638         /* Create the connection tracking data */
639         ++st->conns_n[bucket];
640         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
641         c->l2_id = l2_id;
642         c->conn_hash = conn_hash;
643         c->source = th->th_sport;
644         c->dest = th->th_dport;
645         c->n_in_order_pkts = 0;
646         c->last_pkt_ticks = *(volatile int *)&ticks;
647         c->delivered = 0;
648         ++st->n_new_stream;
649         /* NB. We don't initialise c->next_seq, and it doesn't matter what
650          * value it has.  Most likely the next packet received for this
651          * connection will not match -- no harm done.
652          */
653 }
654
655 /* Process mbuf and decide whether to dispatch it to the stack now or
656  * later.
657  */
658 static void
659 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
660 {
661         struct sfxge_softc *sc = rxq->sc;
662         struct mbuf *m = rx_buf->mbuf;
663         struct ether_header *eh;
664         struct sfxge_lro_conn *c;
665         uint16_t l2_id;
666         uint16_t l3_proto;
667         void *nh;
668         struct tcphdr *th;
669         uint32_t conn_hash;
670         unsigned bucket;
671
672         /* Get the hardware hash */
673         conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
674                                             EFX_RX_HASHALG_TOEPLITZ,
675                                             mtod(m, uint8_t *));
676
677         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
678         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
679                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
680                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
681                         SFXGE_LRO_L2_ID_VLAN;
682                 l3_proto = veh->evl_proto;
683                 nh = veh + 1;
684         } else {
685                 l2_id = 0;
686                 l3_proto = eh->ether_type;
687                 nh = eh + 1;
688         }
689
690         /* Check whether this is a suitable packet (unfragmented
691          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
692          * length, and compute a hash if necessary.  If not, return.
693          */
694         if (l3_proto == htons(ETHERTYPE_IP)) {
695                 struct ip *iph = nh;
696
697                 KASSERT(iph->ip_p == IPPROTO_TCP,
698                     ("IPv4 protocol is not TCP, but packet marker is set"));
699                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
700                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
701                         goto deliver_now;
702                 th = (struct tcphdr *)(iph + 1);
703         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
704                 struct ip6_hdr *iph = nh;
705
706                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
707                     ("IPv6 next header is not TCP, but packet marker is set"));
708                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
709                 th = (struct tcphdr *)(iph + 1);
710         } else {
711                 goto deliver_now;
712         }
713
714         bucket = conn_hash & rxq->lro.conns_mask;
715
716         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
717                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
718                         continue;
719                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
720                         continue;
721                 if (c->mbuf != NULL) {
722                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
723                                 struct ip *c_iph, *iph = nh;
724                                 c_iph = c->nh;
725                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
726                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
727                                         continue;
728                         } else {
729                                 struct ip6_hdr *c_iph, *iph = nh;
730                                 c_iph = c->nh;
731                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
732                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
733                                         continue;
734                         }
735                 }
736
737                 /* Re-insert at head of list to reduce lookup time. */
738                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
739                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
740
741                 if (c->next_buf.mbuf != NULL) {
742                         if (!sfxge_lro_try_merge(rxq, c))
743                                 goto deliver_now;
744                 } else {
745                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
746                             active_link);
747                 }
748                 c->next_buf = *rx_buf;
749                 c->next_eh = eh;
750                 c->next_nh = nh;
751
752                 rx_buf->mbuf = NULL;
753                 rx_buf->flags = EFX_DISCARD;
754                 return;
755         }
756
757         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
758  deliver_now:
759         sfxge_rx_deliver(sc, rx_buf);
760 }
761
762 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
763 {
764         struct sfxge_lro_state *st = &rxq->lro;
765         struct sfxge_lro_conn *c;
766         unsigned t;
767
768         while (!LIST_EMPTY(&st->active_conns)) {
769                 c = LIST_FIRST(&st->active_conns);
770                 if (!c->delivered && c->mbuf != NULL)
771                         sfxge_lro_deliver(st, c);
772                 if (sfxge_lro_try_merge(rxq, c)) {
773                         if (c->mbuf != NULL)
774                                 sfxge_lro_deliver(st, c);
775                         LIST_REMOVE(c, active_link);
776                 }
777                 c->delivered = 0;
778         }
779
780         t = *(volatile int *)&ticks;
781         if (__predict_false(t != st->last_purge_ticks))
782                 sfxge_lro_purge_idle(rxq, t);
783 }
784
785 #else   /* !SFXGE_LRO */
786
787 static void
788 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
789 {
790 }
791
792 static void
793 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
794 {
795 }
796
797 #endif  /* SFXGE_LRO */
798
799 void
800 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
801 {
802         struct sfxge_softc *sc = rxq->sc;
803         int if_capenable = sc->ifnet->if_capenable;
804         int lro_enabled = if_capenable & IFCAP_LRO;
805         unsigned int index;
806         struct sfxge_evq *evq;
807         unsigned int completed;
808         unsigned int level;
809         struct mbuf *m;
810         struct sfxge_rx_sw_desc *prev = NULL;
811
812         index = rxq->index;
813         evq = sc->evq[index];
814
815         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
816
817         completed = rxq->completed;
818         while (completed != rxq->pending) {
819                 unsigned int id;
820                 struct sfxge_rx_sw_desc *rx_desc;
821
822                 id = completed++ & rxq->ptr_mask;
823                 rx_desc = &rxq->queue[id];
824                 m = rx_desc->mbuf;
825
826                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
827                         goto discard;
828
829                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
830                         goto discard;
831
832                 /* Read the length from the pseudo header if required */
833                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
834                         uint16_t tmp_size;
835                         int rc;
836                         rc = efx_psuedo_hdr_pkt_length_get(sc->enp, 
837                                                            mtod(m, uint8_t *),
838                                                            &tmp_size);
839                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
840                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
841                 }
842
843                 prefetch_read_many(mtod(m, caddr_t));
844
845                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
846                 case EFX_PKT_IPV4:
847                         if (~if_capenable & IFCAP_RXCSUM)
848                                 rx_desc->flags &=
849                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
850                         break;
851                 case EFX_PKT_IPV6:
852                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
853                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
854                         break;
855                 case 0:
856                         /* Check for loopback packets */
857                         {
858                                 struct ether_header *etherhp;
859
860                                 /*LINTED*/
861                                 etherhp = mtod(m, struct ether_header *);
862
863                                 if (etherhp->ether_type ==
864                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
865                                         EFSYS_PROBE(loopback);
866
867                                         rxq->loopback++;
868                                         goto discard;
869                                 }
870                         }
871                         break;
872                 default:
873                         KASSERT(B_FALSE,
874                             ("Rx descriptor with both IPv4 and IPv6 flags"));
875                         goto discard;
876                 }
877
878                 /* Pass packet up the stack or into LRO (pipelined) */
879                 if (prev != NULL) {
880                         if (lro_enabled &&
881                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
882                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
883                                 sfxge_lro(rxq, prev);
884                         else
885                                 sfxge_rx_deliver(sc, prev);
886                 }
887                 prev = rx_desc;
888                 continue;
889
890 discard:
891                 /* Return the packet to the pool */
892                 m_free(m);
893                 rx_desc->mbuf = NULL;
894         }
895         rxq->completed = completed;
896
897         level = rxq->added - rxq->completed;
898
899         /* Pass last packet up the stack or into LRO */
900         if (prev != NULL) {
901                 if (lro_enabled &&
902                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
903                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
904                         sfxge_lro(rxq, prev);
905                 else
906                         sfxge_rx_deliver(sc, prev);
907         }
908
909         /*
910          * If there are any pending flows and this is the end of the
911          * poll then they must be completed.
912          */
913         if (eop)
914                 sfxge_lro_end_of_burst(rxq);
915
916         /* Top up the queue if necessary */
917         if (level < rxq->refill_threshold)
918                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
919 }
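/*
 * Note on the "prev" pipelining above (descriptive only): delivery of each
 * completed buffer is deferred by one loop iteration, so the prefetch of
 * the next packet's data has a chance to complete before the previous
 * packet is handed to LRO or to the stack.
 */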
920
921 static void
922 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
923 {
924         struct sfxge_rxq *rxq;
925         struct sfxge_evq *evq;
926         unsigned int count;
927         unsigned int retry = 3;
928
929         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
930
931         rxq = sc->rxq[index];
932         evq = sc->evq[index];
933
934         SFXGE_EVQ_LOCK(evq);
935
936         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
937             ("rxq not started"));
938
939         rxq->init_state = SFXGE_RXQ_INITIALIZED;
940
941         callout_stop(&rxq->refill_callout);
942
943         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
944                 rxq->flush_state = SFXGE_FLUSH_PENDING;
945
946                 SFXGE_EVQ_UNLOCK(evq);
947
948                 /* Flush the receive queue */
949                 if (efx_rx_qflush(rxq->common) != 0) {
950                         SFXGE_EVQ_LOCK(evq);
951                         rxq->flush_state = SFXGE_FLUSH_FAILED;
952                         break;
953                 }
954
955                 count = 0;
956                 do {
957                         /* Spin for 100 ms */
958                         DELAY(100000);
959
960                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
961                                 break;
962
963                 } while (++count < 20);
964
965                 SFXGE_EVQ_LOCK(evq);
966
967                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
968                         /* Flush timeout - neither done nor failed */
969                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
970                             device_get_nameunit(sc->dev), index);
971                         rxq->flush_state = SFXGE_FLUSH_DONE;
972                 }
973                 retry--;
974         }
975         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
976                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
977                     device_get_nameunit(sc->dev), index);
978                 rxq->flush_state = SFXGE_FLUSH_DONE;
979         }
980
981         rxq->pending = rxq->added;
982         sfxge_rx_qcomplete(rxq, B_TRUE);
983
984         KASSERT(rxq->completed == rxq->pending,
985             ("rxq->completed != rxq->pending"));
986
987         rxq->added = 0;
988         rxq->pushed = 0;
989         rxq->pending = 0;
990         rxq->completed = 0;
991         rxq->loopback = 0;
992
993         /* Destroy the common code receive queue. */
994         efx_rx_qdestroy(rxq->common);
995
996         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
997             EFX_RXQ_NBUFS(sc->rxq_entries));
998
999         SFXGE_EVQ_UNLOCK(evq);
1000 }
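/*
 * Flush timing above (derived from the constants in sfxge_rx_qstop()):
 * each attempt polls the flush state every 100 ms for up to 20 iterations,
 * i.e. about 2 seconds, and at most 3 attempts are made before the queue
 * is treated as flushed anyway, bounding a wedged flush at roughly
 * 6 seconds.
 */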
1001
1002 static int
1003 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1004 {
1005         struct sfxge_rxq *rxq;
1006         efsys_mem_t *esmp;
1007         struct sfxge_evq *evq;
1008         int rc;
1009
1010         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1011
1012         rxq = sc->rxq[index];
1013         esmp = &rxq->mem;
1014         evq = sc->evq[index];
1015
1016         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1017             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1018         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1019             ("evq->init_state != SFXGE_EVQ_STARTED"));
1020
1021         /* Program the buffer table. */
1022         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1023             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1024                 return (rc);
1025
1026         /* Create the common code receive queue. */
1027         if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1028             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1029             &rxq->common)) != 0)
1030                 goto fail;
1031
1032         SFXGE_EVQ_LOCK(evq);
1033
1034         /* Enable the receive queue. */
1035         efx_rx_qenable(rxq->common);
1036
1037         rxq->init_state = SFXGE_RXQ_STARTED;
1038         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1039
1040         /* Try to fill the queue from the pool. */
1041         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1042
1043         SFXGE_EVQ_UNLOCK(evq);
1044
1045         return (0);
1046
1047 fail:
1048         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1049             EFX_RXQ_NBUFS(sc->rxq_entries));
1050         return (rc);
1051 }
1052
1053 void
1054 sfxge_rx_stop(struct sfxge_softc *sc)
1055 {
1056         int index;
1057
1058         efx_mac_filter_default_rxq_clear(sc->enp);
1059
1060         /* Stop the receive queue(s) */
1061         index = sc->rxq_count;
1062         while (--index >= 0)
1063                 sfxge_rx_qstop(sc, index);
1064
1065         sc->rx_prefix_size = 0;
1066         sc->rx_buffer_size = 0;
1067
1068         efx_rx_fini(sc->enp);
1069 }
1070
1071 int
1072 sfxge_rx_start(struct sfxge_softc *sc)
1073 {
1074         struct sfxge_intr *intr;
1075         const efx_nic_cfg_t *encp;
1076         size_t hdrlen, align, reserved;
1077         int index;
1078         int rc;
1079
1080         intr = &sc->intr;
1081
1082         /* Initialize the common code receive module. */
1083         if ((rc = efx_rx_init(sc->enp)) != 0)
1084                 return (rc);
1085
1086         encp = efx_nic_cfg_get(sc->enp);
1087         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1088
1089         /* Calculate the receive packet buffer size. */ 
1090         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1091
1092         /* Ensure IP headers are 32-bit aligned */
1093         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1094         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1095
1096         sc->rx_buffer_size += sc->rx_buffer_align;
1097
1098         /* Align end of packet buffer for RX DMA end padding */
1099         align = MAX(1, encp->enc_rx_buf_align_end);
1100         EFSYS_ASSERT(ISP2(align));
1101         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1102
1103         /* 
1104          * Standard mbuf zones only guarantee pointer-size alignment;
1105          * we need extra space to align to the cache line
1106          */
1107         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1108
1109         /* Select zone for packet buffers */
1110         if (reserved <= MCLBYTES)
1111                 sc->rx_cluster_size = MCLBYTES;
1112         else if (reserved <= MJUMPAGESIZE)
1113                 sc->rx_cluster_size = MJUMPAGESIZE;
1114         else if (reserved <= MJUM9BYTES)
1115                 sc->rx_cluster_size = MJUM9BYTES;
1116         else
1117                 sc->rx_cluster_size = MJUM16BYTES;
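        /*
         * Worked example (sizes are illustrative assumptions): with a
         * standard 1500-byte MTU the padded, aligned buffer plus the
         * CACHE_LINE_SIZE slack typically stays well under 2048 bytes, so
         * MCLBYTES is selected; a 9000-byte jumbo MTU would instead land
         * in the MJUM9BYTES zone.
         */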
1118
1119         /*
1120          * Set up the scale table.  Enable all hash types and hash insertion.
1121          */
1122         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1123                 sc->rx_indir_table[index] = index % sc->rxq_count;
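        /*
         * Example (illustrative): with four receive queues the table is
         * filled as 0, 1, 2, 3, 0, 1, 2, 3, ... for all SFXGE_RX_SCALE_MAX
         * entries, spreading RSS hash values evenly across the queues.
         */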
1124         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1125                                        SFXGE_RX_SCALE_MAX)) != 0)
1126                 goto fail;
1127         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1128             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1129             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1130
1131         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1132                                        sizeof(toep_key))) != 0)
1133                 goto fail;
1134
1135         /* Start the receive queue(s). */
1136         for (index = 0; index < sc->rxq_count; index++) {
1137                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1138                         goto fail2;
1139         }
1140
1141         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1142                                             sc->intr.n_alloc > 1);
1143         if (rc != 0)
1144                 goto fail3;
1145
1146         return (0);
1147
1148 fail3:
1149 fail2:
1150         while (--index >= 0)
1151                 sfxge_rx_qstop(sc, index);
1152
1153 fail:
1154         efx_rx_fini(sc->enp);
1155
1156         return (rc);
1157 }
1158
1159 #ifdef SFXGE_LRO
1160
1161 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1162 {
1163         struct sfxge_lro_state *st = &rxq->lro;
1164         unsigned i;
1165
1166         st->conns_mask = lro_table_size - 1;
1167         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1168                 ("lro_table_size must be a power of 2"));
1169         st->sc = rxq->sc;
1170         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1171                            M_SFXGE, M_WAITOK);
1172         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1173                              M_SFXGE, M_WAITOK);
1174         for (i = 0; i <= st->conns_mask; ++i) {
1175                 TAILQ_INIT(&st->conns[i]);
1176                 st->conns_n[i] = 0;
1177         }
1178         LIST_INIT(&st->active_conns);
1179         TAILQ_INIT(&st->free_conns);
1180 }
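/*
 * Bucket selection for the table built above (illustrative): with the
 * default lro_table_size of 128, conns_mask is 0x7f and a connection hash
 * h maps to bucket (h & 0x7f); this is why the table size must be a power
 * of two.
 */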
1181
1182 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1183 {
1184         struct sfxge_lro_state *st = &rxq->lro;
1185         struct sfxge_lro_conn *c;
1186         unsigned i;
1187
1188         /* Return cleanly if sfxge_lro_init() has not been called. */
1189         if (st->conns == NULL)
1190                 return;
1191
1192         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1193
1194         for (i = 0; i <= st->conns_mask; ++i) {
1195                 while (!TAILQ_EMPTY(&st->conns[i])) {
1196                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1197                         sfxge_lro_drop(rxq, c);
1198                 }
1199         }
1200
1201         while (!TAILQ_EMPTY(&st->free_conns)) {
1202                 c = TAILQ_FIRST(&st->free_conns);
1203                 TAILQ_REMOVE(&st->free_conns, c, link);
1204                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1205                 free(c, M_SFXGE);
1206         }
1207
1208         free(st->conns_n, M_SFXGE);
1209         free(st->conns, M_SFXGE);
1210         st->conns = NULL;
1211 }
1212
1213 #else
1214
1215 static void
1216 sfxge_lro_init(struct sfxge_rxq *rxq)
1217 {
1218 }
1219
1220 static void
1221 sfxge_lro_fini(struct sfxge_rxq *rxq)
1222 {
1223 }
1224
1225 #endif  /* SFXGE_LRO */
1226
1227 static void
1228 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1229 {
1230         struct sfxge_rxq *rxq;
1231
1232         rxq = sc->rxq[index];
1233
1234         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1235             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1236
1237         /* Free the context array and the flow table. */
1238         free(rxq->queue, M_SFXGE);
1239         sfxge_lro_fini(rxq);
1240
1241         /* Release DMA memory. */
1242         sfxge_dma_free(&rxq->mem);
1243
1244         sc->rxq[index] = NULL;
1245
1246         free(rxq, M_SFXGE);
1247 }
1248
1249 static int
1250 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1251 {
1252         struct sfxge_rxq *rxq;
1253         struct sfxge_evq *evq;
1254         efsys_mem_t *esmp;
1255         int rc;
1256
1257         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1258
1259         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1260         rxq->sc = sc;
1261         rxq->index = index;
1262         rxq->entries = sc->rxq_entries;
1263         rxq->ptr_mask = rxq->entries - 1;
1264         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1265
1266         sc->rxq[index] = rxq;
1267         esmp = &rxq->mem;
1268
1269         evq = sc->evq[index];
1270
1271         /* Allocate and zero DMA space. */
1272         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1273                 return (rc);
1274
1275         /* Allocate buffer table entries. */
1276         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1277                                  &rxq->buf_base_id);
1278
1279         /* Allocate the context array and the flow table. */
1280         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1281             M_SFXGE, M_WAITOK | M_ZERO);
1282         sfxge_lro_init(rxq);
1283
1284         callout_init(&rxq->refill_callout, 1);
1285
1286         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1287
1288         return (0);
1289 }
1290
1291 static const struct {
1292         const char *name;
1293         size_t offset;
1294 } sfxge_rx_stats[] = {
1295 #define SFXGE_RX_STAT(name, member) \
1296         { #name, offsetof(struct sfxge_rxq, member) }
1297 #ifdef SFXGE_LRO
1298         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1299         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1300         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1301         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1302         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1303         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1304         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1305         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1306 #endif
1307 };
1308
1309 static int
1310 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1311 {
1312         struct sfxge_softc *sc = arg1;
1313         unsigned int id = arg2;
1314         unsigned int sum, index;
1315
1316         /* Sum across all RX queues */
1317         sum = 0;
1318         for (index = 0; index < sc->rxq_count; index++)
1319                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1320                                          sfxge_rx_stats[id].offset);
1321
1322         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1323 }
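/*
 * The handler above works because each sfxge_rx_stats[] entry records the
 * byte offset of an unsigned int counter inside struct sfxge_rxq, so a
 * single generic handler can sum any per-queue LRO counter.  Minimal
 * sketch of the same idiom (illustrative only):
 *
 *      sum += *(unsigned int *)((caddr_t)rxq +
 *          offsetof(struct sfxge_rxq, lro.n_merges));
 */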
1324
1325 static void
1326 sfxge_rx_stat_init(struct sfxge_softc *sc)
1327 {
1328         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1329         struct sysctl_oid_list *stat_list;
1330         unsigned int id;
1331
1332         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1333
1334         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1335                 SYSCTL_ADD_PROC(
1336                         ctx, stat_list,
1337                         OID_AUTO, sfxge_rx_stats[id].name,
1338                         CTLTYPE_UINT|CTLFLAG_RD,
1339                         sc, id, sfxge_rx_stat_handler, "IU",
1340                         "");
1341         }
1342 }
1343
1344 void
1345 sfxge_rx_fini(struct sfxge_softc *sc)
1346 {
1347         int index;
1348
1349         index = sc->rxq_count;
1350         while (--index >= 0)
1351                 sfxge_rx_qfini(sc, index);
1352
1353         sc->rxq_count = 0;
1354 }
1355
1356 int
1357 sfxge_rx_init(struct sfxge_softc *sc)
1358 {
1359         struct sfxge_intr *intr;
1360         int index;
1361         int rc;
1362
1363 #ifdef SFXGE_LRO
1364         if (!ISP2(lro_table_size)) {
1365                 log(LOG_ERR, "%s=%u must be power of 2",
1366                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1367                 rc = EINVAL;
1368                 goto fail_lro_table_size;
1369         }
1370
1371         if (lro_idle_ticks == 0)
1372                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1373 #endif
1374
1375         intr = &sc->intr;
1376
1377         sc->rxq_count = intr->n_alloc;
1378
1379         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1380             ("intr->state != SFXGE_INTR_INITIALIZED"));
1381
1382         /* Initialize the receive queue(s) - one per interrupt. */
1383         for (index = 0; index < sc->rxq_count; index++) {
1384                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1385                         goto fail;
1386         }
1387
1388         sfxge_rx_stat_init(sc);
1389
1390         return (0);
1391
1392 fail:
1393         /* Tear down the receive queue(s). */
1394         while (--index >= 0)
1395                 sfxge_rx_qfini(sc, index);
1396
1397         sc->rxq_count = 0;
1398
1399 #ifdef SFXGE_LRO
1400 fail_lro_table_size:
1401 #endif
1402         return (rc);
1403 }