1 /*-
2  * Copyright (c) 2010-2015 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/types.h>
38 #include <sys/mbuf.h>
39 #include <sys/smp.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/syslog.h>
43 #include <sys/limits.h>
45
46 #include <net/ethernet.h>
47 #include <net/if.h>
48 #include <net/if_vlan_var.h>
49
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/tcp.h>
54
55 #include <machine/in_cksum.h>
56
57 #include "common/efx.h"
58
59
60 #include "sfxge.h"
61 #include "sfxge_rx.h"
62
63 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
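/*
 * Descriptive note (added for clarity): the refill threshold is 90% of the
 * usable ring size.  sfxge_rx_qcomplete() compares the current fill level
 * (added - completed) against this value and tops the queue up again once
 * the level drops below it.
 */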
64
65 #ifdef SFXGE_LRO
66
67 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
68             "Large receive offload (LRO) parameters");
69
70 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
71
72 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
73  * means we can accelerate a larger number of streams.
74  */
75 static unsigned lro_table_size = 128;
76 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
77 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
78             &lro_table_size, 0,
79             "Size of the LRO hash table (must be a power of 2)");
80
81 /* Maximum length of a hash chain.  If chains get too long then the lookup
82  * time increases and may exceed the benefit of LRO.
83  */
84 static unsigned lro_chain_max = 20;
85 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
87             &lro_chain_max, 0,
88             "The maximum length of a hash chain");
89
90 /* Maximum time (in ticks) that a connection can be idle before its LRO
91  * state is discarded.
92  */
93 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
94 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
96             &lro_idle_ticks, 0,
97             "The maximum time (in ticks) that a connection can be idle "
98             "before it's LRO state is discarded");
99
100 /* Number of packets with payload that must arrive in-order before a
101  * connection is eligible for LRO.  The idea is we should avoid coalescing
102  * segments when the sender is in slow-start because reducing the ACK rate
103  * can damage performance.
104  */
105 static int lro_slow_start_packets = 2000;
106 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
107 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
108             &lro_slow_start_packets, 0,
109             "Number of packets with payload that must arrive in-order before "
110             "a connection is eligible for LRO");
111
112 /* Number of packets with payload that must arrive in-order following loss
113  * before a connection is eligible for LRO.  The idea is we should avoid
114  * coalescing segments when the sender is recovering from loss, because
115  * reducing the ACK rate can damage performance.
116  */
117 static int lro_loss_packets = 20;
118 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
119 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
120             &lro_loss_packets, 0,
121             "Number of packets with payload that must arrive in-order "
122             "following loss before a connection is eligible for LRO");
123
124 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
125 #define SFXGE_LRO_L2_ID_VLAN 0x4000
126 #define SFXGE_LRO_L2_ID_IPV6 0x8000
127 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
128 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
129
130 /* Compare IPv6 addresses, avoiding conditional branches */
131 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
132                                    const struct in6_addr *right)
133 {
134 #if LONG_BIT == 64
135         const uint64_t *left64 = (const uint64_t *)left;
136         const uint64_t *right64 = (const uint64_t *)right;
137         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
138 #else
139         return (left->s6_addr32[0] - right->s6_addr32[0]) |
140                (left->s6_addr32[1] - right->s6_addr32[1]) |
141                (left->s6_addr32[2] - right->s6_addr32[2]) |
142                (left->s6_addr32[3] - right->s6_addr32[3]);
143 #endif
144 }
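/*
 * Illustrative note (not part of the driver): ipv6_addr_cmp() only
 * distinguishes "equal" from "not equal"; the OR of the per-word
 * differences is zero exactly when every word matches, so callers treat it
 * like memcmp() without the ordering guarantee, e.g.
 *
 *     if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) == 0)
 *             ... source addresses match ...
 *
 * Computing the result this way avoids data-dependent branches in the LRO
 * connection-lookup fast path.
 */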
145
146 #endif  /* SFXGE_LRO */
147
148 void
149 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
150 {
151
152         rxq->flush_state = SFXGE_FLUSH_DONE;
153 }
154
155 void
156 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
157 {
158
159         rxq->flush_state = SFXGE_FLUSH_FAILED;
160 }
161
162 static uint8_t toep_key[] = {
163         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
168 };
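/*
 * Descriptive note (added for clarity): this 40-byte value is installed
 * below as the RSS Toeplitz hash key via efx_rx_scale_key_set(); it
 * appears to match the well-known default key published in Microsoft's
 * RSS documentation, so hosts using that key compute the same hash for
 * the same flow.
 */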
169
170 static void
171 sfxge_rx_post_refill(void *arg)
172 {
173         struct sfxge_rxq *rxq = arg;
174         struct sfxge_softc *sc;
175         unsigned int index;
176         struct sfxge_evq *evq;
177         uint16_t magic;
178
179         sc = rxq->sc;
180         index = rxq->index;
181         evq = sc->evq[index];
182
183         magic = SFXGE_MAGIC_RX_QREFILL | index;
184
185         /* This is guaranteed due to the start/stop order of rx and ev */
186         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
187             ("evq not started"));
188         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
189             ("rxq not started"));
190         efx_ev_qpost(evq->common, magic);
191 }
192
193 static void
194 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
195 {
196         /* Initially retry after 100 ms, but back off in case of
197          * repeated failures as we probably have to wait for the
198          * administrator to raise the pool limit. */
199         if (retrying)
200                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
201         else
202                 rxq->refill_delay = hz / 10;
203
204         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
205                              sfxge_rx_post_refill, rxq);
206 }
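/*
 * Worked example (illustrative): with hz = 1000, the first refill retry is
 * scheduled after hz / 10 = 100 ticks (100 ms); repeated failures then
 * double the delay to 200 ms, 400 ms, ... until it is clamped at
 * 10 * hz (10 seconds).
 */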
207
208 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
209 {
210         struct mb_args args;
211         struct mbuf *m;
212
213         /* Allocate mbuf structure */
214         args.flags = M_PKTHDR;
215         args.type = MT_DATA;
216         m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
217
218         /* Allocate (and attach) packet buffer */
219         if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
220                 uma_zfree(zone_mbuf, m);
221                 m = NULL;
222         }
223
224         return (m);
225 }
226
227 #define SFXGE_REFILL_BATCH  64
228
229 static void
230 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
231 {
232         struct sfxge_softc *sc;
233         unsigned int index;
234         struct sfxge_evq *evq;
235         unsigned int batch;
236         unsigned int rxfill;
237         unsigned int mblksize;
238         int ntodo;
239         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
240
241         sc = rxq->sc;
242         index = rxq->index;
243         evq = sc->evq[index];
244
245         prefetch_read_many(sc->enp);
246         prefetch_read_many(rxq->common);
247
248         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
249
250         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
251                 return;
252
253         rxfill = rxq->added - rxq->completed;
254         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
255             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
256         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
257         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
258             ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
259
260         if (ntodo == 0)
261                 return;
262
263         batch = 0;
264         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
265         while (ntodo-- > 0) {
266                 unsigned int id;
267                 struct sfxge_rx_sw_desc *rx_desc;
268                 bus_dma_segment_t seg;
269                 struct mbuf *m;
270
271                 id = (rxq->added + batch) & rxq->ptr_mask;
272                 rx_desc = &rxq->queue[id];
273                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
274
275                 rx_desc->flags = EFX_DISCARD;
276                 m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
277                 if (m == NULL)
278                         break;
279
280                 /* m_len specifies length of area to be mapped for DMA */
281                 m->m_len  = mblksize;
282                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
283                 m->m_data += sc->rx_buffer_align;
284
285                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
286                 addr[batch++] = seg.ds_addr;
287
288                 if (batch == SFXGE_REFILL_BATCH) {
289                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
290                             rxq->completed, rxq->added);
291                         rxq->added += batch;
292                         batch = 0;
293                 }
294         }
295
296         if (ntodo != 0)
297                 sfxge_rx_schedule_refill(rxq, retrying);
298
299         if (batch != 0) {
300                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
301                     rxq->completed, rxq->added);
302                 rxq->added += batch;
303         }
304
305         /* Make the descriptors visible to the hardware */
306         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
307                         BUS_DMASYNC_PREWRITE);
308
309         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
310
311         /* The queue could still be empty if no descriptors were actually
312          * pushed, in which case there will be no event to cause the next
313          * refill, so we must schedule a refill ourselves.
314          */
315         if (rxq->pushed == rxq->completed) {
316                 sfxge_rx_schedule_refill(rxq, retrying);
317         }
318 }
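/*
 * Worked example (illustrative): the refill loop above posts descriptors
 * in batches of SFXGE_REFILL_BATCH.  If 150 buffers are needed and all
 * allocations succeed, two full batches of 64 are posted from inside the
 * loop and the remaining 22 are posted by the final "if (batch != 0)"
 * block; efx_rx_qpush() then tells the controller about the new fill
 * level.
 */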
319
320 void
321 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
322 {
323
324         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
325                 return;
326
327         /* Make sure the queue is full */
328         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
329 }
330
331 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
332 {
333         struct ifnet *ifp = sc->ifnet;
334
335         m->m_pkthdr.rcvif = ifp;
336         m->m_pkthdr.csum_data = 0xffff;
337         ifp->if_input(ifp, m);
338 }
339
340 static void
341 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
342 {
343         struct mbuf *m = rx_desc->mbuf;
344         int flags = rx_desc->flags;
345         int csum_flags;
346
347         /* Convert checksum flags */
348         csum_flags = (flags & EFX_CKSUM_IPV4) ?
349                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
350         if (flags & EFX_CKSUM_TCPUDP)
351                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
352
353         /* The hash covers a 4-tuple for TCP only */
354         if (flags & EFX_PKT_TCP) {
355                 m->m_pkthdr.flowid =
356                         efx_psuedo_hdr_hash_get(sc->enp,
357                                                 EFX_RX_HASHALG_TOEPLITZ,
358                                                 mtod(m, uint8_t *));
359                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
360         }
361         m->m_data += sc->rx_prefix_size;
362         m->m_len = rx_desc->size - sc->rx_prefix_size;
363         m->m_pkthdr.len = m->m_len;
364         m->m_pkthdr.csum_flags = csum_flags;
365         __sfxge_rx_deliver(sc, rx_desc->mbuf);
366
367         rx_desc->flags = EFX_DISCARD;
368         rx_desc->mbuf = NULL;
369 }
370
371 #ifdef SFXGE_LRO
372
373 static void
374 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
375 {
376         struct sfxge_softc *sc = st->sc;
377         struct mbuf *m = c->mbuf;
378         struct tcphdr *c_th;
379         int csum_flags;
380
381         KASSERT(m, ("no mbuf to deliver"));
382
383         ++st->n_bursts;
384
385         /* Finish off packet munging and recalculate IP header checksum. */
386         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
387                 struct ip *iph = c->nh;
388                 iph->ip_len = htons(iph->ip_len);
389                 iph->ip_sum = 0;
390                 iph->ip_sum = in_cksum_hdr(iph);
391                 c_th = (struct tcphdr *)(iph + 1);
392                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
393                               CSUM_IP_CHECKED | CSUM_IP_VALID);
394         } else {
395                 struct ip6_hdr *iph = c->nh;
396                 iph->ip6_plen = htons(iph->ip6_plen);
397                 c_th = (struct tcphdr *)(iph + 1);
398                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
399         }
400
401         c_th->th_win = c->th_last->th_win;
402         c_th->th_ack = c->th_last->th_ack;
403         if (c_th->th_off == c->th_last->th_off) {
404                 /* Copy TCP options (take care to avoid going negative). */
405                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
406                 memcpy(c_th + 1, c->th_last + 1, optlen);
407         }
408
409         m->m_pkthdr.flowid = c->conn_hash;
410         M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
411
412         m->m_pkthdr.csum_flags = csum_flags;
413         __sfxge_rx_deliver(sc, m);
414
415         c->mbuf = NULL;
416         c->delivered = 1;
417 }
418
419 /* Drop the given connection, and add it to the free list. */
420 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
421 {
422         unsigned bucket;
423
424         KASSERT(!c->mbuf, ("found orphaned mbuf"));
425
426         if (c->next_buf.mbuf != NULL) {
427                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
428                 LIST_REMOVE(c, active_link);
429         }
430
431         bucket = c->conn_hash & rxq->lro.conns_mask;
432         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
433         --rxq->lro.conns_n[bucket];
434         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
435         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
436 }
437
438 /* Stop tracking connections that have gone idle in order to keep hash
439  * chains short.
440  */
441 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
442 {
443         struct sfxge_lro_conn *c;
444         unsigned i;
445
446         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
447                 ("found active connections"));
448
449         rxq->lro.last_purge_ticks = now;
450         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
451                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
452                         continue;
453
454                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
455                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
456                         ++rxq->lro.n_drop_idle;
457                         sfxge_lro_drop(rxq, c);
458                 }
459         }
460 }
461
462 static void
463 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
464                 struct mbuf *mbuf, struct tcphdr *th)
465 {
466         struct tcphdr *c_th;
467
468         /* Tack the new mbuf onto the chain. */
469         KASSERT(!mbuf->m_next, ("mbuf already chained"));
470         c->mbuf_tail->m_next = mbuf;
471         c->mbuf_tail = mbuf;
472
473         /* Increase length appropriately */
474         c->mbuf->m_pkthdr.len += mbuf->m_len;
475
476         /* Update the connection state flags */
477         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
478                 struct ip *iph = c->nh;
479                 iph->ip_len += mbuf->m_len;
480                 c_th = (struct tcphdr *)(iph + 1);
481         } else {
482                 struct ip6_hdr *iph = c->nh;
483                 iph->ip6_plen += mbuf->m_len;
484                 c_th = (struct tcphdr *)(iph + 1);
485         }
486         c_th->th_flags |= (th->th_flags & TH_PUSH);
487         c->th_last = th;
488         ++st->n_merges;
489
490         /* Pass packet up now if another segment could overflow the IP
491          * length.
492          */
493         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
494                 sfxge_lro_deliver(st, c);
495 }
496
497 static void
498 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
499                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
500 {
501         /* Start the chain */
502         c->mbuf = mbuf;
503         c->mbuf_tail = c->mbuf;
504         c->nh = nh;
505         c->th_last = th;
506
507         mbuf->m_pkthdr.len = mbuf->m_len;
508
509         /* Mangle header fields for later processing */
510         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
511                 struct ip *iph = nh;
512                 iph->ip_len = ntohs(iph->ip_len);
513         } else {
514                 struct ip6_hdr *iph = nh;
515                 iph->ip6_plen = ntohs(iph->ip6_plen);
516         }
517 }
518
519 /* Try to merge or otherwise hold or deliver (as appropriate) the
520  * packet buffered for this connection (c->next_buf).  Return a flag
521  * indicating whether the connection is still active for LRO purposes.
522  */
523 static int
524 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
525 {
526         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
527         char *eh = c->next_eh;
528         int data_length, hdr_length, dont_merge;
529         unsigned th_seq, pkt_length;
530         struct tcphdr *th;
531         unsigned now;
532
533         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
534                 struct ip *iph = c->next_nh;
535                 th = (struct tcphdr *)(iph + 1);
536                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
537         } else {
538                 struct ip6_hdr *iph = c->next_nh;
539                 th = (struct tcphdr *)(iph + 1);
540                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
541         }
542
543         hdr_length = (char *) th + th->th_off * 4 - eh;
544         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
545                        hdr_length);
546         th_seq = ntohl(th->th_seq);
547         dont_merge = ((data_length <= 0)
548                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
549
550         /* Check for options other than aligned timestamp. */
551         if (th->th_off != 5) {
552                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
553                 if (th->th_off == 8 &&
554                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
555                                         (TCPOPT_NOP << 16) |
556                                         (TCPOPT_TIMESTAMP << 8) |
557                                         TCPOLEN_TIMESTAMP)) {
558                         /* timestamp option -- okay */
559                 } else {
560                         dont_merge = 1;
561                 }
562         }
563
564         if (__predict_false(th_seq != c->next_seq)) {
565                 /* Out-of-order, so start counting again. */
566                 if (c->mbuf != NULL)
567                         sfxge_lro_deliver(&rxq->lro, c);
568                 c->n_in_order_pkts -= lro_loss_packets;
569                 c->next_seq = th_seq + data_length;
570                 ++rxq->lro.n_misorder;
571                 goto deliver_buf_out;
572         }
573         c->next_seq = th_seq + data_length;
574
575         now = ticks;
576         if (now - c->last_pkt_ticks > lro_idle_ticks) {
577                 ++rxq->lro.n_drop_idle;
578                 if (c->mbuf != NULL)
579                         sfxge_lro_deliver(&rxq->lro, c);
580                 sfxge_lro_drop(rxq, c);
581                 return (0);
582         }
583         c->last_pkt_ticks = ticks;
584
585         if (c->n_in_order_pkts < lro_slow_start_packets) {
586                 /* May be in slow-start, so don't merge. */
587                 ++rxq->lro.n_slow_start;
588                 ++c->n_in_order_pkts;
589                 goto deliver_buf_out;
590         }
591
592         if (__predict_false(dont_merge)) {
593                 if (c->mbuf != NULL)
594                         sfxge_lro_deliver(&rxq->lro, c);
595                 if (th->th_flags & (TH_FIN | TH_RST)) {
596                         ++rxq->lro.n_drop_closed;
597                         sfxge_lro_drop(rxq, c);
598                         return (0);
599                 }
600                 goto deliver_buf_out;
601         }
602
603         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
604
605         if (__predict_true(c->mbuf != NULL)) {
606                 /* Remove headers and any padding */
607                 rx_buf->mbuf->m_data += hdr_length;
608                 rx_buf->mbuf->m_len = data_length;
609
610                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
611         } else {
612                 /* Remove any padding */
613                 rx_buf->mbuf->m_len = pkt_length;
614
615                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
616         }
617
618         rx_buf->mbuf = NULL;
619         return (1);
620
621  deliver_buf_out:
622         sfxge_rx_deliver(rxq->sc, rx_buf);
623         return (1);
624 }
625
626 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
627                                uint16_t l2_id, void *nh, struct tcphdr *th)
628 {
629         unsigned bucket = conn_hash & st->conns_mask;
630         struct sfxge_lro_conn *c;
631
632         if (st->conns_n[bucket] >= lro_chain_max) {
633                 ++st->n_too_many;
634                 return;
635         }
636
637         if (!TAILQ_EMPTY(&st->free_conns)) {
638                 c = TAILQ_FIRST(&st->free_conns);
639                 TAILQ_REMOVE(&st->free_conns, c, link);
640         } else {
641                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
642                 if (c == NULL)
643                         return;
644                 c->mbuf = NULL;
645                 c->next_buf.mbuf = NULL;
646         }
647
648         /* Create the connection tracking data */
649         ++st->conns_n[bucket];
650         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
651         c->l2_id = l2_id;
652         c->conn_hash = conn_hash;
653         c->source = th->th_sport;
654         c->dest = th->th_dport;
655         c->n_in_order_pkts = 0;
656         c->last_pkt_ticks = *(volatile int *)&ticks;
657         c->delivered = 0;
658         ++st->n_new_stream;
659         /* NB. We don't initialise c->next_seq, and it doesn't matter what
660          * value it has.  Most likely the next packet received for this
661          * connection will not match -- no harm done.
662          */
663 }
664
665 /* Process mbuf and decide whether to dispatch it to the stack now or
666  * later.
667  */
668 static void
669 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
670 {
671         struct sfxge_softc *sc = rxq->sc;
672         struct mbuf *m = rx_buf->mbuf;
673         struct ether_header *eh;
674         struct sfxge_lro_conn *c;
675         uint16_t l2_id;
676         uint16_t l3_proto;
677         void *nh;
678         struct tcphdr *th;
679         uint32_t conn_hash;
680         unsigned bucket;
681
682         /* Get the hardware hash */
683         conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
684                                             EFX_RX_HASHALG_TOEPLITZ,
685                                             mtod(m, uint8_t *));
686
687         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
688         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
689                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
690                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
691                         SFXGE_LRO_L2_ID_VLAN;
692                 l3_proto = veh->evl_proto;
693                 nh = veh + 1;
694         } else {
695                 l2_id = 0;
696                 l3_proto = eh->ether_type;
697                 nh = eh + 1;
698         }
699
700         /* Check whether this is a suitable packet (unfragmented
701          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
702          * length, and compute a hash if necessary.  If not, return.
703          */
704         if (l3_proto == htons(ETHERTYPE_IP)) {
705                 struct ip *iph = nh;
706
707                 KASSERT(iph->ip_p == IPPROTO_TCP,
708                     ("IPv4 protocol is not TCP, but packet marker is set"));
709                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
710                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
711                         goto deliver_now;
712                 th = (struct tcphdr *)(iph + 1);
713         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
714                 struct ip6_hdr *iph = nh;
715
716                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
717                     ("IPv6 next header is not TCP, but packet marker is set"));
718                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
719                 th = (struct tcphdr *)(iph + 1);
720         } else {
721                 goto deliver_now;
722         }
723
724         bucket = conn_hash & rxq->lro.conns_mask;
725
726         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
727                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
728                         continue;
729                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
730                         continue;
731                 if (c->mbuf != NULL) {
732                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
733                                 struct ip *c_iph, *iph = nh;
734                                 c_iph = c->nh;
735                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
736                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
737                                         continue;
738                         } else {
739                                 struct ip6_hdr *c_iph, *iph = nh;
740                                 c_iph = c->nh;
741                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
742                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
743                                         continue;
744                         }
745                 }
746
747                 /* Re-insert at head of list to reduce lookup time. */
748                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
749                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
750
751                 if (c->next_buf.mbuf != NULL) {
752                         if (!sfxge_lro_try_merge(rxq, c))
753                                 goto deliver_now;
754                 } else {
755                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
756                             active_link);
757                 }
758                 c->next_buf = *rx_buf;
759                 c->next_eh = eh;
760                 c->next_nh = nh;
761
762                 rx_buf->mbuf = NULL;
763                 rx_buf->flags = EFX_DISCARD;
764                 return;
765         }
766
767         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
768  deliver_now:
769         sfxge_rx_deliver(sc, rx_buf);
770 }
771
772 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
773 {
774         struct sfxge_lro_state *st = &rxq->lro;
775         struct sfxge_lro_conn *c;
776         unsigned t;
777
778         while (!LIST_EMPTY(&st->active_conns)) {
779                 c = LIST_FIRST(&st->active_conns);
780                 if (!c->delivered && c->mbuf != NULL)
781                         sfxge_lro_deliver(st, c);
782                 if (sfxge_lro_try_merge(rxq, c)) {
783                         if (c->mbuf != NULL)
784                                 sfxge_lro_deliver(st, c);
785                         LIST_REMOVE(c, active_link);
786                 }
787                 c->delivered = 0;
788         }
789
790         t = *(volatile int *)&ticks;
791         if (__predict_false(t != st->last_purge_ticks))
792                 sfxge_lro_purge_idle(rxq, t);
793 }
794
795 #else   /* !SFXGE_LRO */
796
797 static void
798 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
799 {
800 }
801
802 static void
803 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
804 {
805 }
806
807 #endif  /* SFXGE_LRO */
808
809 void
810 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
811 {
812         struct sfxge_softc *sc = rxq->sc;
813         int if_capenable = sc->ifnet->if_capenable;
814         int lro_enabled = if_capenable & IFCAP_LRO;
815         unsigned int index;
816         struct sfxge_evq *evq;
817         unsigned int completed;
818         unsigned int level;
819         struct mbuf *m;
820         struct sfxge_rx_sw_desc *prev = NULL;
821
822         index = rxq->index;
823         evq = sc->evq[index];
824
825         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
826
827         completed = rxq->completed;
828         while (completed != rxq->pending) {
829                 unsigned int id;
830                 struct sfxge_rx_sw_desc *rx_desc;
831
832                 id = completed++ & rxq->ptr_mask;
833                 rx_desc = &rxq->queue[id];
834                 m = rx_desc->mbuf;
835
836                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
837                         goto discard;
838
839                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
840                         goto discard;
841
842                 /* Read the length from the pseudo header if required */
843                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
844                         uint16_t tmp_size;
845                         int rc;
846                         rc = efx_psuedo_hdr_pkt_length_get(sc->enp, 
847                                                            mtod(m, uint8_t *),
848                                                            &tmp_size);
849                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
850                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
851                 }
852
853                 prefetch_read_many(mtod(m, caddr_t));
854
855                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
856                 case EFX_PKT_IPV4:
857                         if (~if_capenable & IFCAP_RXCSUM)
858                                 rx_desc->flags &=
859                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
860                         break;
861                 case EFX_PKT_IPV6:
862                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
863                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
864                         break;
865                 case 0:
866                         /* Check for loopback packets */
867                         {
868                                 struct ether_header *etherhp;
869
870                                 /*LINTED*/
871                                 etherhp = mtod(m, struct ether_header *);
872
873                                 if (etherhp->ether_type ==
874                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
875                                         EFSYS_PROBE(loopback);
876
877                                         rxq->loopback++;
878                                         goto discard;
879                                 }
880                         }
881                         break;
882                 default:
883                         KASSERT(B_FALSE,
884                             ("Rx descriptor with both IPv4 and IPv6 flags"));
885                         goto discard;
886                 }
887
888                 /* Pass packet up the stack or into LRO (pipelined) */
889                 if (prev != NULL) {
890                         if (lro_enabled &&
891                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
892                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
893                                 sfxge_lro(rxq, prev);
894                         else
895                                 sfxge_rx_deliver(sc, prev);
896                 }
897                 prev = rx_desc;
898                 continue;
899
900 discard:
901                 /* Return the packet to the pool */
902                 m_free(m);
903                 rx_desc->mbuf = NULL;
904         }
905         rxq->completed = completed;
906
907         level = rxq->added - rxq->completed;
908
909         /* Pass last packet up the stack or into LRO */
910         if (prev != NULL) {
911                 if (lro_enabled &&
912                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
913                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
914                         sfxge_lro(rxq, prev);
915                 else
916                         sfxge_rx_deliver(sc, prev);
917         }
918
919         /*
920          * If there are any pending flows and this is the end of the
921          * poll then they must be completed.
922          */
923         if (eop)
924                 sfxge_lro_end_of_burst(rxq);
925
926         /* Top up the queue if necessary */
927         if (level < rxq->refill_threshold)
928                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
929 }
930
931 static void
932 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
933 {
934         struct sfxge_rxq *rxq;
935         struct sfxge_evq *evq;
936         unsigned int count;
937         unsigned int retry = 3;
938
939         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
940
941         rxq = sc->rxq[index];
942         evq = sc->evq[index];
943
944         SFXGE_EVQ_LOCK(evq);
945
946         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
947             ("rxq not started"));
948
949         rxq->init_state = SFXGE_RXQ_INITIALIZED;
950
951         callout_stop(&rxq->refill_callout);
952
953         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
954                 rxq->flush_state = SFXGE_FLUSH_PENDING;
955
956                 SFXGE_EVQ_UNLOCK(evq);
957
958                 /* Flush the receive queue */
959                 if (efx_rx_qflush(rxq->common) != 0) {
960                         SFXGE_EVQ_LOCK(evq);
961                         rxq->flush_state = SFXGE_FLUSH_FAILED;
962                         break;
963                 }
964
965                 count = 0;
966                 do {
967                         /* Spin for 100 ms */
968                         DELAY(100000);
969
970                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
971                                 break;
972
973                 } while (++count < 20);
974
975                 SFXGE_EVQ_LOCK(evq);
976
977                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
978                         /* Flush timeout - neither done nor failed */
979                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
980                             device_get_nameunit(sc->dev), index);
981                         rxq->flush_state = SFXGE_FLUSH_DONE;
982                 }
983                 retry--;
984         }
985         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
986                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
987                     device_get_nameunit(sc->dev), index);
988                 rxq->flush_state = SFXGE_FLUSH_DONE;
989         }
990
991         rxq->pending = rxq->added;
992         sfxge_rx_qcomplete(rxq, B_TRUE);
993
994         KASSERT(rxq->completed == rxq->pending,
995             ("rxq->completed != rxq->pending"));
996
997         rxq->added = 0;
998         rxq->pushed = 0;
999         rxq->pending = 0;
1000         rxq->completed = 0;
1001         rxq->loopback = 0;
1002
1003         /* Destroy the common code receive queue. */
1004         efx_rx_qdestroy(rxq->common);
1005
1006         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1007             EFX_RXQ_NBUFS(sc->rxq_entries));
1008
1009         SFXGE_EVQ_UNLOCK(evq);
1010 }
1011
1012 static int
1013 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1014 {
1015         struct sfxge_rxq *rxq;
1016         efsys_mem_t *esmp;
1017         struct sfxge_evq *evq;
1018         int rc;
1019
1020         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1021
1022         rxq = sc->rxq[index];
1023         esmp = &rxq->mem;
1024         evq = sc->evq[index];
1025
1026         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1027             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1028         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1029             ("evq->init_state != SFXGE_EVQ_STARTED"));
1030
1031         /* Program the buffer table. */
1032         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1033             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1034                 return (rc);
1035
1036         /* Create the common code receive queue. */
1037         if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1038             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1039             &rxq->common)) != 0)
1040                 goto fail;
1041
1042         SFXGE_EVQ_LOCK(evq);
1043
1044         /* Enable the receive queue. */
1045         efx_rx_qenable(rxq->common);
1046
1047         rxq->init_state = SFXGE_RXQ_STARTED;
1048         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1049
1050         /* Try to fill the queue from the pool. */
1051         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1052
1053         SFXGE_EVQ_UNLOCK(evq);
1054
1055         return (0);
1056
1057 fail:
1058         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1059             EFX_RXQ_NBUFS(sc->rxq_entries));
1060         return (rc);
1061 }
1062
1063 void
1064 sfxge_rx_stop(struct sfxge_softc *sc)
1065 {
1066         int index;
1067
1068         efx_mac_filter_default_rxq_clear(sc->enp);
1069
1070         /* Stop the receive queue(s) */
1071         index = sc->rxq_count;
1072         while (--index >= 0)
1073                 sfxge_rx_qstop(sc, index);
1074
1075         sc->rx_prefix_size = 0;
1076         sc->rx_buffer_size = 0;
1077
1078         efx_rx_fini(sc->enp);
1079 }
1080
1081 int
1082 sfxge_rx_start(struct sfxge_softc *sc)
1083 {
1084         struct sfxge_intr *intr;
1085         const efx_nic_cfg_t *encp;
1086         size_t hdrlen, align, reserved;
1087         int index;
1088         int rc;
1089
1090         intr = &sc->intr;
1091
1092         /* Initialize the common code receive module. */
1093         if ((rc = efx_rx_init(sc->enp)) != 0)
1094                 return (rc);
1095
1096         encp = efx_nic_cfg_get(sc->enp);
1097         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1098
1099         /* Calculate the receive packet buffer size. */ 
1100         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1101
1102         /* Ensure IP headers are 32bit aligned */
1103         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1104         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1105
1106         sc->rx_buffer_size += sc->rx_buffer_align;
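        /*
         * Worked example (illustrative): if the hardware prefix were 16
         * bytes, hdrlen would be 16 + sizeof(struct ether_header) = 30 and
         * rx_buffer_align = P2ROUNDUP(30, 4) - 30 = 2, so once the driver
         * skips the prefix and Ethernet header the IP header starts on a
         * 32-bit boundary.
         */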
1107
1108         /* Align end of packet buffer for RX DMA end padding */
1109         align = MAX(1, encp->enc_rx_buf_align_end);
1110         EFSYS_ASSERT(ISP2(align));
1111         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1112
1113         /* 
1114          * Standard mbuf zones only guarantee pointer-size alignment;
1115          * we need extra space to align to the cache line
1116          */
1117         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1118
1119         /* Select zone for packet buffers */
1120         if (reserved <= MCLBYTES)
1121                 sc->rx_buffer_zone = zone_clust;
1122         else if (reserved <= MJUMPAGESIZE)
1123                 sc->rx_buffer_zone = zone_jumbop;
1124         else if (reserved <= MJUM9BYTES)
1125                 sc->rx_buffer_zone = zone_jumbo9;
1126         else
1127                 sc->rx_buffer_zone = zone_jumbo16;
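        /*
         * Illustrative note: MCLBYTES is 2 KB, MJUMPAGESIZE is one page
         * and MJUM9BYTES/MJUM16BYTES are 9 KB/16 KB, so with a standard
         * 1500-byte MTU the reserved size normally fits a regular 2 KB
         * cluster (zone_clust), while jumbo MTUs fall through to one of
         * the jumbo zones.
         */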
1128
1129         /*
1130          * Set up the scale table.  Enable all hash types and hash insertion.
1131          */
1132         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1133                 sc->rx_indir_table[index] = index % sc->rxq_count;
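        /*
         * Illustrative note: the loop above spreads the receive queues
         * evenly over the RSS indirection table.  For example, with 4 RX
         * queues the table entries become 0, 1, 2, 3, 0, 1, 2, 3, ... so
         * every hash bucket maps back to one of the 4 queues.
         */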
1134         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1135                                        SFXGE_RX_SCALE_MAX)) != 0)
1136                 goto fail;
1137         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1138             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1139             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1140
1141         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1142                                        sizeof(toep_key))) != 0)
1143                 goto fail;
1144
1145         /* Start the receive queue(s). */
1146         for (index = 0; index < sc->rxq_count; index++) {
1147                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1148                         goto fail2;
1149         }
1150
1151         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1152                                             sc->intr.n_alloc > 1);
1153         if (rc != 0)
1154                 goto fail3;
1155
1156         return (0);
1157
1158 fail3:
1159 fail2:
1160         while (--index >= 0)
1161                 sfxge_rx_qstop(sc, index);
1162
1163 fail:
1164         efx_rx_fini(sc->enp);
1165
1166         return (rc);
1167 }
1168
1169 #ifdef SFXGE_LRO
1170
1171 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1172 {
1173         struct sfxge_lro_state *st = &rxq->lro;
1174         unsigned i;
1175
1176         st->conns_mask = lro_table_size - 1;
1177         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1178                 ("lro_table_size must be a power of 2"));
1179         st->sc = rxq->sc;
1180         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1181                            M_SFXGE, M_WAITOK);
1182         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1183                              M_SFXGE, M_WAITOK);
1184         for (i = 0; i <= st->conns_mask; ++i) {
1185                 TAILQ_INIT(&st->conns[i]);
1186                 st->conns_n[i] = 0;
1187         }
1188         LIST_INIT(&st->active_conns);
1189         TAILQ_INIT(&st->free_conns);
1190 }
1191
1192 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1193 {
1194         struct sfxge_lro_state *st = &rxq->lro;
1195         struct sfxge_lro_conn *c;
1196         unsigned i;
1197
1198         /* Return cleanly if sfxge_lro_init() has not been called. */
1199         if (st->conns == NULL)
1200                 return;
1201
1202         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1203
1204         for (i = 0; i <= st->conns_mask; ++i) {
1205                 while (!TAILQ_EMPTY(&st->conns[i])) {
1206                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1207                         sfxge_lro_drop(rxq, c);
1208                 }
1209         }
1210
1211         while (!TAILQ_EMPTY(&st->free_conns)) {
1212                 c = TAILQ_FIRST(&st->free_conns);
1213                 TAILQ_REMOVE(&st->free_conns, c, link);
1214                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1215                 free(c, M_SFXGE);
1216         }
1217
1218         free(st->conns_n, M_SFXGE);
1219         free(st->conns, M_SFXGE);
1220         st->conns = NULL;
1221 }
1222
1223 #else
1224
1225 static void
1226 sfxge_lro_init(struct sfxge_rxq *rxq)
1227 {
1228 }
1229
1230 static void
1231 sfxge_lro_fini(struct sfxge_rxq *rxq)
1232 {
1233 }
1234
1235 #endif  /* SFXGE_LRO */
1236
1237 static void
1238 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1239 {
1240         struct sfxge_rxq *rxq;
1241
1242         rxq = sc->rxq[index];
1243
1244         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1245             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1246
1247         /* Free the context array and the flow table. */
1248         free(rxq->queue, M_SFXGE);
1249         sfxge_lro_fini(rxq);
1250
1251         /* Release DMA memory. */
1252         sfxge_dma_free(&rxq->mem);
1253
1254         sc->rxq[index] = NULL;
1255
1256         free(rxq, M_SFXGE);
1257 }
1258
1259 static int
1260 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1261 {
1262         struct sfxge_rxq *rxq;
1263         struct sfxge_evq *evq;
1264         efsys_mem_t *esmp;
1265         int rc;
1266
1267         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1268
1269         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1270         rxq->sc = sc;
1271         rxq->index = index;
1272         rxq->entries = sc->rxq_entries;
1273         rxq->ptr_mask = rxq->entries - 1;
1274         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1275
1276         sc->rxq[index] = rxq;
1277         esmp = &rxq->mem;
1278
1279         evq = sc->evq[index];
1280
1281         /* Allocate and zero DMA space. */
1282         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1283                 return (rc);
1284
1285         /* Allocate buffer table entries. */
1286         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1287                                  &rxq->buf_base_id);
1288
1289         /* Allocate the context array and the flow table. */
1290         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1291             M_SFXGE, M_WAITOK | M_ZERO);
1292         sfxge_lro_init(rxq);
1293
1294         callout_init(&rxq->refill_callout, B_TRUE);
1295
1296         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1297
1298         return (0);
1299 }
1300
1301 static const struct {
1302         const char *name;
1303         size_t offset;
1304 } sfxge_rx_stats[] = {
1305 #define SFXGE_RX_STAT(name, member) \
1306         { #name, offsetof(struct sfxge_rxq, member) }
1307 #ifdef SFXGE_LRO
1308         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1309         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1310         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1311         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1312         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1313         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1314         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1315         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1316 #endif
1317 };
1318
1319 static int
1320 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1321 {
1322         struct sfxge_softc *sc = arg1;
1323         unsigned int id = arg2;
1324         unsigned int sum, index;
1325
1326         /* Sum across all RX queues */
1327         sum = 0;
1328         for (index = 0; index < sc->rxq_count; index++)
1329                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1330                                          sfxge_rx_stats[id].offset);
1331
1332         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1333 }
1334
1335 static void
1336 sfxge_rx_stat_init(struct sfxge_softc *sc)
1337 {
1338         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1339         struct sysctl_oid_list *stat_list;
1340         unsigned int id;
1341
1342         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1343
1344         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1345                 SYSCTL_ADD_PROC(
1346                         ctx, stat_list,
1347                         OID_AUTO, sfxge_rx_stats[id].name,
1348                         CTLTYPE_UINT|CTLFLAG_RD,
1349                         sc, id, sfxge_rx_stat_handler, "IU",
1350                         "");
1351         }
1352 }
1353
1354 void
1355 sfxge_rx_fini(struct sfxge_softc *sc)
1356 {
1357         int index;
1358
1359         index = sc->rxq_count;
1360         while (--index >= 0)
1361                 sfxge_rx_qfini(sc, index);
1362
1363         sc->rxq_count = 0;
1364 }
1365
1366 int
1367 sfxge_rx_init(struct sfxge_softc *sc)
1368 {
1369         struct sfxge_intr *intr;
1370         int index;
1371         int rc;
1372
1373 #ifdef SFXGE_LRO
1374         if (!ISP2(lro_table_size)) {
1375                 log(LOG_ERR, "%s=%u must be power of 2",
1376                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1377                 rc = EINVAL;
1378                 goto fail_lro_table_size;
1379         }
1380
1381         if (lro_idle_ticks == 0)
1382                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1383 #endif
1384
1385         intr = &sc->intr;
1386
1387         sc->rxq_count = intr->n_alloc;
1388
1389         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1390             ("intr->state != SFXGE_INTR_INITIALIZED"));
1391
1392         /* Initialize the receive queue(s) - one per interrupt. */
1393         for (index = 0; index < sc->rxq_count; index++) {
1394                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1395                         goto fail;
1396         }
1397
1398         sfxge_rx_stat_init(sc);
1399
1400         return (0);
1401
1402 fail:
1403         /* Tear down the receive queue(s). */
1404         while (--index >= 0)
1405                 sfxge_rx_qfini(sc, index);
1406
1407         sc->rxq_count = 0;
1408
1409 #ifdef SFXGE_LRO
1410 fail_lro_table_size:
1411 #endif
1412         return (rc);
1413 }