1 /*-
2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_rss.h"
38
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
48
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57
58 #include <machine/in_cksum.h>
59
60 #ifdef RSS
61 #include <net/rss_config.h>
62 #endif
63
64 #include "common/efx.h"
65
66
67 #include "sfxge.h"
68 #include "sfxge_rx.h"
69
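/* Refill level: the RX ring is topped up again once the number of
 * outstanding descriptors (added - completed) falls below 90% of
 * EFX_RXQ_LIMIT(_entries); see sfxge_rx_qcomplete().
 */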
70 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
71
72 #ifdef SFXGE_LRO
73
74 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75             "Large receive offload (LRO) parameters");
76
77 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
78
79 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
80  * means we can accelerate a larger number of streams.
81  */
82 static unsigned lro_table_size = 128;
83 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85             &lro_table_size, 0,
86             "Size of the LRO hash table (must be a power of 2)");
87
88 /* Maximum length of a hash chain.  If chains get too long then the lookup
89  * time increases and may exceed the benefit of LRO.
90  */
91 static unsigned lro_chain_max = 20;
92 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94             &lro_chain_max, 0,
95             "The maximum length of a hash chain");
96
97 /* Maximum time (in ticks) that a connection can be idle before its LRO
98  * state is discarded.
99  */
100 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103             &lro_idle_ticks, 0,
104             "The maximum time (in ticks) that a connection can be idle "
105             "before it's LRO state is discarded");
106
107 /* Number of packets with payload that must arrive in-order before a
108  * connection is eligible for LRO.  The idea is we should avoid coalescing
109  * segments when the sender is in slow-start because reducing the ACK rate
110  * can damage performance.
111  */
112 static int lro_slow_start_packets = 2000;
113 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115             &lro_slow_start_packets, 0,
116             "Number of packets with payload that must arrive in-order before "
117             "a connection is eligible for LRO");
118
119 /* Number of packets with payload that must arrive in-order following loss
120  * before a connection is eligible for LRO.  The idea is we should avoid
121  * coalescing segments when the sender is recovering from loss, because
122  * reducing the ACK rate can damage performance.
123  */
124 static int lro_loss_packets = 20;
125 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127             &lro_loss_packets, 0,
128             "Number of packets with payload that must arrive in-order "
129             "following loss before a connection is eligible for LRO");
130
131 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
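/* The low 12 bits of l2_id carry the VLAN ID (EVL_VLANOFTAG), so the flag
 * bits below are placed above EVL_VLID_MASK (0x0fff) and cannot alias a
 * valid VLAN ID.
 */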
132 #define SFXGE_LRO_L2_ID_VLAN 0x4000
133 #define SFXGE_LRO_L2_ID_IPV6 0x8000
134 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136
137 /* Compare IPv6 addresses, avoiding conditional branches */
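/* Only equality matters here: the OR of the per-word differences is zero
 * iff the two addresses are identical; the result is not an ordering.
 */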
138 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
139                                    const struct in6_addr *right)
140 {
141 #if LONG_BIT == 64
142         const uint64_t *left64 = (const uint64_t *)left;
143         const uint64_t *right64 = (const uint64_t *)right;
144         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
145 #else
146         return (left->s6_addr32[0] - right->s6_addr32[0]) |
147                (left->s6_addr32[1] - right->s6_addr32[1]) |
148                (left->s6_addr32[2] - right->s6_addr32[2]) |
149                (left->s6_addr32[3] - right->s6_addr32[3]);
150 #endif
151 }
152
153 #endif  /* SFXGE_LRO */
154
155 void
156 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157 {
158
159         rxq->flush_state = SFXGE_FLUSH_DONE;
160 }
161
162 void
163 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164 {
165
166         rxq->flush_state = SFXGE_FLUSH_FAILED;
167 }
168
169 #ifdef RSS
170 static uint8_t toep_key[RSS_KEYSIZE];
171 #else
172 static uint8_t toep_key[] = {
173         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
174         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
175         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
176         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
177         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
178 };
179 #endif
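/* Without the kernel RSS option the 40-byte Toeplitz key above (the
 * well-known Microsoft sample RSS key) is programmed into the NIC; with
 * RSS enabled the kernel's own key is fetched via rss_getkey() in
 * sfxge_rx_start() instead.
 */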
180
181 static void
182 sfxge_rx_post_refill(void *arg)
183 {
184         struct sfxge_rxq *rxq = arg;
185         struct sfxge_softc *sc;
186         unsigned int index;
187         struct sfxge_evq *evq;
188         uint16_t magic;
189
190         sc = rxq->sc;
191         index = rxq->index;
192         evq = sc->evq[index];
193         magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194
195         /* This is guaranteed due to the start/stop order of rx and ev */
196         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197             ("evq not started"));
198         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199             ("rxq not started"));
200         efx_ev_qpost(evq->common, magic);
201 }
202
203 static void
204 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205 {
206         /* Initially retry after 100 ms, but back off in case of
207          * repeated failures as we probably have to wait for the
208          * administrator to raise the pool limit. */
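        /* Concretely: hz / 10 is 100 ms, and the doubling below is capped
         * at 10 * hz, i.e. ten seconds between attempts.
         */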
209         if (retrying)
210                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211         else
212                 rxq->refill_delay = hz / 10;
213
214         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215                              sfxge_rx_post_refill, rxq);
216 }
217
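/* Buffers are handed to the hardware in batches of up to this many
 * addresses per efx_rx_qpost() call.
 */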
218 #define SFXGE_REFILL_BATCH  64
219
220 static void
221 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222 {
223         struct sfxge_softc *sc;
224         unsigned int index;
225         struct sfxge_evq *evq;
226         unsigned int batch;
227         unsigned int rxfill;
228         unsigned int mblksize;
229         int ntodo;
230         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231
232         sc = rxq->sc;
233         index = rxq->index;
234         evq = sc->evq[index];
235
236         prefetch_read_many(sc->enp);
237         prefetch_read_many(rxq->common);
238
239         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240
241         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242                 return;
243
244         rxfill = rxq->added - rxq->completed;
245         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249             ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
250
251         if (ntodo == 0)
252                 return;
253
254         batch = 0;
255         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256         while (ntodo-- > 0) {
257                 unsigned int id;
258                 struct sfxge_rx_sw_desc *rx_desc;
259                 bus_dma_segment_t seg;
260                 struct mbuf *m;
261
262                 id = (rxq->added + batch) & rxq->ptr_mask;
263                 rx_desc = &rxq->queue[id];
264                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265
266                 rx_desc->flags = EFX_DISCARD;
267                 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268                     sc->rx_cluster_size);
269                 if (m == NULL)
270                         break;
271
272                 /* m_len specifies length of area to be mapped for DMA */
273                 m->m_len  = mblksize;
274                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
275                 m->m_data += sc->rx_buffer_align;
276
277                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278                 addr[batch++] = seg.ds_addr;
279
280                 if (batch == SFXGE_REFILL_BATCH) {
281                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
282                             rxq->completed, rxq->added);
283                         rxq->added += batch;
284                         batch = 0;
285                 }
286         }
287
288         if (ntodo != 0)
289                 sfxge_rx_schedule_refill(rxq, retrying);
290
291         if (batch != 0) {
292                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
293                     rxq->completed, rxq->added);
294                 rxq->added += batch;
295         }
296
297         /* Make the descriptors visible to the hardware */
298         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299                         BUS_DMASYNC_PREWRITE);
300
301         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302
303         /* The queue could still be empty if no descriptors were actually
304          * pushed, in which case there will be no event to cause the next
305          * refill, so we must schedule a refill ourselves.
306          */
307         if (rxq->pushed == rxq->completed) {
308                 sfxge_rx_schedule_refill(rxq, retrying);
309         }
310 }
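/* The free-running counters rxq->added, rxq->pushed and rxq->completed are
 * mapped onto ring slots with "& rxq->ptr_mask" above; since ptr_mask is
 * rxq->entries - 1, this relies on the ring size being a power of two.
 */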
311
312 void
313 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314 {
315
316         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317                 return;
318
319         /* Make sure the queue is full */
320         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321 }
322
323 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324 {
325         struct ifnet *ifp = sc->ifnet;
326
327         m->m_pkthdr.rcvif = ifp;
328         m->m_pkthdr.csum_data = 0xffff;
329         ifp->if_input(ifp, m);
330 }
331
332 static void
333 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
334 {
335         struct mbuf *m = rx_desc->mbuf;
336         int flags = rx_desc->flags;
337         int csum_flags;
338
339         /* Convert checksum flags */
340         csum_flags = (flags & EFX_CKSUM_IPV4) ?
341                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
342         if (flags & EFX_CKSUM_TCPUDP)
343                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
344
345         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
346                 m->m_pkthdr.flowid =
347                         efx_psuedo_hdr_hash_get(sc->enp,
348                                                 EFX_RX_HASHALG_TOEPLITZ,
349                                                 mtod(m, uint8_t *));
350                 /* The hash covers a 4-tuple for TCP only */
351                 M_HASHTYPE_SET(m,
352                     (flags & EFX_PKT_IPV4) ?
353                         ((flags & EFX_PKT_TCP) ?
354                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
355                         ((flags & EFX_PKT_TCP) ?
356                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
357         }
358         m->m_data += sc->rx_prefix_size;
359         m->m_len = rx_desc->size - sc->rx_prefix_size;
360         m->m_pkthdr.len = m->m_len;
361         m->m_pkthdr.csum_flags = csum_flags;
362         __sfxge_rx_deliver(sc, rx_desc->mbuf);
363
364         rx_desc->flags = EFX_DISCARD;
365         rx_desc->mbuf = NULL;
366 }
367
368 #ifdef SFXGE_LRO
369
370 static void
371 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
372 {
373         struct sfxge_softc *sc = st->sc;
374         struct mbuf *m = c->mbuf;
375         struct tcphdr *c_th;
376         int csum_flags;
377
378         KASSERT(m, ("no mbuf to deliver"));
379
380         ++st->n_bursts;
381
382         /* Finish off packet munging and recalculate IP header checksum. */
383         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
384                 struct ip *iph = c->nh;
385                 iph->ip_len = htons(iph->ip_len);
386                 iph->ip_sum = 0;
387                 iph->ip_sum = in_cksum_hdr(iph);
388                 c_th = (struct tcphdr *)(iph + 1);
389                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
390                               CSUM_IP_CHECKED | CSUM_IP_VALID);
391         } else {
392                 struct ip6_hdr *iph = c->nh;
393                 iph->ip6_plen = htons(iph->ip6_plen);
394                 c_th = (struct tcphdr *)(iph + 1);
395                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
396         }
397
398         c_th->th_win = c->th_last->th_win;
399         c_th->th_ack = c->th_last->th_ack;
400         if (c_th->th_off == c->th_last->th_off) {
401                 /* Copy TCP options (take care to avoid going negative). */
402                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
403                 memcpy(c_th + 1, c->th_last + 1, optlen);
404         }
405
406         m->m_pkthdr.flowid = c->conn_hash;
407         M_HASHTYPE_SET(m,
408             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
409                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
410
411         m->m_pkthdr.csum_flags = csum_flags;
412         __sfxge_rx_deliver(sc, m);
413
414         c->mbuf = NULL;
415         c->delivered = 1;
416 }
417
418 /* Drop the given connection, and add it to the free list. */
419 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
420 {
421         unsigned bucket;
422
423         KASSERT(!c->mbuf, ("found orphaned mbuf"));
424
425         if (c->next_buf.mbuf != NULL) {
426                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
427                 LIST_REMOVE(c, active_link);
428         }
429
430         bucket = c->conn_hash & rxq->lro.conns_mask;
431         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
432         --rxq->lro.conns_n[bucket];
433         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
434         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
435 }
436
437 /* Stop tracking connections that have gone idle in order to keep hash
438  * chains short.
439  */
440 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
441 {
442         struct sfxge_lro_conn *c;
443         unsigned i;
444
445         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
446                 ("found active connections"));
447
448         rxq->lro.last_purge_ticks = now;
449         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
450                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
451                         continue;
452
453                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
454                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
455                         ++rxq->lro.n_drop_idle;
456                         sfxge_lro_drop(rxq, c);
457                 }
458         }
459 }
460
461 static void
462 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
463                 struct mbuf *mbuf, struct tcphdr *th)
464 {
465         struct tcphdr *c_th;
466
467         /* Tack the new mbuf onto the chain. */
468         KASSERT(!mbuf->m_next, ("mbuf already chained"));
469         c->mbuf_tail->m_next = mbuf;
470         c->mbuf_tail = mbuf;
471
472         /* Increase length appropriately */
473         c->mbuf->m_pkthdr.len += mbuf->m_len;
474
475         /* Update the connection state flags */
476         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
477                 struct ip *iph = c->nh;
478                 iph->ip_len += mbuf->m_len;
479                 c_th = (struct tcphdr *)(iph + 1);
480         } else {
481                 struct ip6_hdr *iph = c->nh;
482                 iph->ip6_plen += mbuf->m_len;
483                 c_th = (struct tcphdr *)(iph + 1);
484         }
485         c_th->th_flags |= (th->th_flags & TH_PUSH);
486         c->th_last = th;
487         ++st->n_merges;
488
489         /* Pass packet up now if another segment could overflow the IP
490          * length.
491          */
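        /* (65535 is the largest value the 16-bit length field can hold;
         * 9200 bytes leaves headroom for roughly one more jumbo-sized
         * segment.)
         */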
492         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
493                 sfxge_lro_deliver(st, c);
494 }
495
496 static void
497 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
498                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
499 {
500         /* Start the chain */
501         c->mbuf = mbuf;
502         c->mbuf_tail = c->mbuf;
503         c->nh = nh;
504         c->th_last = th;
505
506         mbuf->m_pkthdr.len = mbuf->m_len;
507
508         /* Mangle header fields for later processing */
509         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
510                 struct ip *iph = nh;
511                 iph->ip_len = ntohs(iph->ip_len);
512         } else {
513                 struct ip6_hdr *iph = nh;
514                 iph->ip6_plen = ntohs(iph->ip6_plen);
515         }
516 }
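/* While a burst is being accumulated, ip_len/ip6_plen are kept in host
 * byte order: sfxge_lro_start() converts them, sfxge_lro_merge() updates
 * them, and sfxge_lro_deliver() converts them back to network order before
 * the packet is handed to the stack.
 */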
517
518 /* Try to merge or otherwise hold or deliver (as appropriate) the
519  * packet buffered for this connection (c->next_buf).  Return a flag
520  * indicating whether the connection is still active for LRO purposes.
521  */
522 static int
523 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
524 {
525         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
526         char *eh = c->next_eh;
527         int data_length, hdr_length, dont_merge;
528         unsigned th_seq, pkt_length;
529         struct tcphdr *th;
530         unsigned now;
531
532         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
533                 struct ip *iph = c->next_nh;
534                 th = (struct tcphdr *)(iph + 1);
535                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
536         } else {
537                 struct ip6_hdr *iph = c->next_nh;
538                 th = (struct tcphdr *)(iph + 1);
539                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
540         }
541
542         hdr_length = (char *) th + th->th_off * 4 - eh;
543         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
544                        hdr_length);
545         th_seq = ntohl(th->th_seq);
546         dont_merge = ((data_length <= 0)
547                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
548
549         /* Check for options other than aligned timestamp. */
550         if (th->th_off != 5) {
551                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
552                 if (th->th_off == 8 &&
553                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
554                                         (TCPOPT_NOP << 16) |
555                                         (TCPOPT_TIMESTAMP << 8) |
556                                         TCPOLEN_TIMESTAMP)) {
557                         /* timestamp option -- okay */
558                 } else {
559                         dont_merge = 1;
560                 }
561         }
562
563         if (__predict_false(th_seq != c->next_seq)) {
564                 /* Out-of-order, so start counting again. */
565                 if (c->mbuf != NULL)
566                         sfxge_lro_deliver(&rxq->lro, c);
567                 c->n_in_order_pkts -= lro_loss_packets;
568                 c->next_seq = th_seq + data_length;
569                 ++rxq->lro.n_misorder;
570                 goto deliver_buf_out;
571         }
572         c->next_seq = th_seq + data_length;
573
574         now = ticks;
575         if (now - c->last_pkt_ticks > lro_idle_ticks) {
576                 ++rxq->lro.n_drop_idle;
577                 if (c->mbuf != NULL)
578                         sfxge_lro_deliver(&rxq->lro, c);
579                 sfxge_lro_drop(rxq, c);
580                 return (0);
581         }
582         c->last_pkt_ticks = ticks;
583
584         if (c->n_in_order_pkts < lro_slow_start_packets) {
585                 /* May be in slow-start, so don't merge. */
586                 ++rxq->lro.n_slow_start;
587                 ++c->n_in_order_pkts;
588                 goto deliver_buf_out;
589         }
590
591         if (__predict_false(dont_merge)) {
592                 if (c->mbuf != NULL)
593                         sfxge_lro_deliver(&rxq->lro, c);
594                 if (th->th_flags & (TH_FIN | TH_RST)) {
595                         ++rxq->lro.n_drop_closed;
596                         sfxge_lro_drop(rxq, c);
597                         return (0);
598                 }
599                 goto deliver_buf_out;
600         }
601
602         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
603
604         if (__predict_true(c->mbuf != NULL)) {
605                 /* Remove headers and any padding */
606                 rx_buf->mbuf->m_data += hdr_length;
607                 rx_buf->mbuf->m_len = data_length;
608
609                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
610         } else {
611                 /* Remove any padding */
612                 rx_buf->mbuf->m_len = pkt_length;
613
614                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
615         }
616
617         rx_buf->mbuf = NULL;
618         return (1);
619
620  deliver_buf_out:
621         sfxge_rx_deliver(rxq->sc, rx_buf);
622         return (1);
623 }
624
625 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
626                                uint16_t l2_id, void *nh, struct tcphdr *th)
627 {
628         unsigned bucket = conn_hash & st->conns_mask;
629         struct sfxge_lro_conn *c;
630
631         if (st->conns_n[bucket] >= lro_chain_max) {
632                 ++st->n_too_many;
633                 return;
634         }
635
636         if (!TAILQ_EMPTY(&st->free_conns)) {
637                 c = TAILQ_FIRST(&st->free_conns);
638                 TAILQ_REMOVE(&st->free_conns, c, link);
639         } else {
640                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
641                 if (c == NULL)
642                         return;
643                 c->mbuf = NULL;
644                 c->next_buf.mbuf = NULL;
645         }
646
647         /* Create the connection tracking data */
648         ++st->conns_n[bucket];
649         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
650         c->l2_id = l2_id;
651         c->conn_hash = conn_hash;
652         c->source = th->th_sport;
653         c->dest = th->th_dport;
654         c->n_in_order_pkts = 0;
655         c->last_pkt_ticks = *(volatile int *)&ticks;
656         c->delivered = 0;
657         ++st->n_new_stream;
658         /* NB. We don't initialise c->next_seq, and it doesn't matter what
659          * value it has.  Most likely the next packet received for this
660          * connection will not match -- no harm done.
661          */
662 }
663
664 /* Process mbuf and decide whether to dispatch it to the stack now or
665  * later.
666  */
667 static void
668 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
669 {
670         struct sfxge_softc *sc = rxq->sc;
671         struct mbuf *m = rx_buf->mbuf;
672         struct ether_header *eh;
673         struct sfxge_lro_conn *c;
674         uint16_t l2_id;
675         uint16_t l3_proto;
676         void *nh;
677         struct tcphdr *th;
678         uint32_t conn_hash;
679         unsigned bucket;
680
681         /* Get the hardware hash */
682         conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
683                                             EFX_RX_HASHALG_TOEPLITZ,
684                                             mtod(m, uint8_t *));
685
686         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
687         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
688                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
689                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
690                         SFXGE_LRO_L2_ID_VLAN;
691                 l3_proto = veh->evl_proto;
692                 nh = veh + 1;
693         } else {
694                 l2_id = 0;
695                 l3_proto = eh->ether_type;
696                 nh = eh + 1;
697         }
698
699         /* Check whether this is a suitable packet (unfragmented
700          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
701          * length, and compute a hash if necessary.  If not, return.
702          */
703         if (l3_proto == htons(ETHERTYPE_IP)) {
704                 struct ip *iph = nh;
705
706                 KASSERT(iph->ip_p == IPPROTO_TCP,
707                     ("IPv4 protocol is not TCP, but packet marker is set"));
708                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
709                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
710                         goto deliver_now;
711                 th = (struct tcphdr *)(iph + 1);
712         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
713                 struct ip6_hdr *iph = nh;
714
715                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
716                     ("IPv6 next header is not TCP, but packet marker is set"));
717                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
718                 th = (struct tcphdr *)(iph + 1);
719         } else {
720                 goto deliver_now;
721         }
722
723         bucket = conn_hash & rxq->lro.conns_mask;
724
725         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
726                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
727                         continue;
728                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
729                         continue;
730                 if (c->mbuf != NULL) {
731                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
732                                 struct ip *c_iph, *iph = nh;
733                                 c_iph = c->nh;
734                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
735                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
736                                         continue;
737                         } else {
738                                 struct ip6_hdr *c_iph, *iph = nh;
739                                 c_iph = c->nh;
740                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
741                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
742                                         continue;
743                         }
744                 }
745
746                 /* Re-insert at head of list to reduce lookup time. */
747                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
748                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
749
750                 if (c->next_buf.mbuf != NULL) {
751                         if (!sfxge_lro_try_merge(rxq, c))
752                                 goto deliver_now;
753                 } else {
754                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
755                             active_link);
756                 }
757                 c->next_buf = *rx_buf;
758                 c->next_eh = eh;
759                 c->next_nh = nh;
760
761                 rx_buf->mbuf = NULL;
762                 rx_buf->flags = EFX_DISCARD;
763                 return;
764         }
765
766         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
767  deliver_now:
768         sfxge_rx_deliver(sc, rx_buf);
769 }
770
771 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
772 {
773         struct sfxge_lro_state *st = &rxq->lro;
774         struct sfxge_lro_conn *c;
775         unsigned t;
776
777         while (!LIST_EMPTY(&st->active_conns)) {
778                 c = LIST_FIRST(&st->active_conns);
779                 if (!c->delivered && c->mbuf != NULL)
780                         sfxge_lro_deliver(st, c);
781                 if (sfxge_lro_try_merge(rxq, c)) {
782                         if (c->mbuf != NULL)
783                                 sfxge_lro_deliver(st, c);
784                         LIST_REMOVE(c, active_link);
785                 }
786                 c->delivered = 0;
787         }
788
789         t = *(volatile int *)&ticks;
790         if (__predict_false(t != st->last_purge_ticks))
791                 sfxge_lro_purge_idle(rxq, t);
792 }
793
794 #else   /* !SFXGE_LRO */
795
796 static void
797 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
798 {
799 }
800
801 static void
802 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
803 {
804 }
805
806 #endif  /* SFXGE_LRO */
807
808 void
809 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
810 {
811         struct sfxge_softc *sc = rxq->sc;
812         int if_capenable = sc->ifnet->if_capenable;
813         int lro_enabled = if_capenable & IFCAP_LRO;
814         unsigned int index;
815         struct sfxge_evq *evq;
816         unsigned int completed;
817         unsigned int level;
818         struct mbuf *m;
819         struct sfxge_rx_sw_desc *prev = NULL;
820
821         index = rxq->index;
822         evq = sc->evq[index];
823
824         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
825
826         completed = rxq->completed;
827         while (completed != rxq->pending) {
828                 unsigned int id;
829                 struct sfxge_rx_sw_desc *rx_desc;
830
831                 id = completed++ & rxq->ptr_mask;
832                 rx_desc = &rxq->queue[id];
833                 m = rx_desc->mbuf;
834
835                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
836                         goto discard;
837
838                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
839                         goto discard;
840
841                 /* Read the length from the pseudo header if required */
842                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
843                         uint16_t tmp_size;
844                         int rc;
845                         rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
846                                                            mtod(m, uint8_t *),
847                                                            &tmp_size);
848                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
849                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
850                 }
851
852                 prefetch_read_many(mtod(m, caddr_t));
853
854                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
855                 case EFX_PKT_IPV4:
856                         if (~if_capenable & IFCAP_RXCSUM)
857                                 rx_desc->flags &=
858                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
859                         break;
860                 case EFX_PKT_IPV6:
861                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
862                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
863                         break;
864                 case 0:
865                         /* Check for loopback packets */
866                         {
867                                 struct ether_header *etherhp;
868
869                                 /*LINTED*/
870                                 etherhp = mtod(m, struct ether_header *);
871
872                                 if (etherhp->ether_type ==
873                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
874                                         EFSYS_PROBE(loopback);
875
876                                         rxq->loopback++;
877                                         goto discard;
878                                 }
879                         }
880                         break;
881                 default:
882                         KASSERT(B_FALSE,
883                             ("Rx descriptor with both IPv4 and IPv6 flags"));
884                         goto discard;
885                 }
886
887                 /* Pass packet up the stack or into LRO (pipelined) */
888                 if (prev != NULL) {
889                         if (lro_enabled &&
890                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
891                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
892                                 sfxge_lro(rxq, prev);
893                         else
894                                 sfxge_rx_deliver(sc, prev);
895                 }
896                 prev = rx_desc;
897                 continue;
898
899 discard:
900                 /* Return the packet to the pool */
901                 m_free(m);
902                 rx_desc->mbuf = NULL;
903         }
904         rxq->completed = completed;
905
906         level = rxq->added - rxq->completed;
907
908         /* Pass last packet up the stack or into LRO */
909         if (prev != NULL) {
910                 if (lro_enabled &&
911                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
912                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
913                         sfxge_lro(rxq, prev);
914                 else
915                         sfxge_rx_deliver(sc, prev);
916         }
917
918         /*
919          * If there are any pending flows and this is the end of the
920          * poll then they must be completed.
921          */
922         if (eop)
923                 sfxge_lro_end_of_burst(rxq);
924
925         /* Top up the queue if necessary */
926         if (level < rxq->refill_threshold)
927                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
928 }
929
930 static void
931 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
932 {
933         struct sfxge_rxq *rxq;
934         struct sfxge_evq *evq;
935         unsigned int count;
936         unsigned int retry = 3;
937
938         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
939
940         rxq = sc->rxq[index];
941         evq = sc->evq[index];
942
943         SFXGE_EVQ_LOCK(evq);
944
945         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
946             ("rxq not started"));
947
948         rxq->init_state = SFXGE_RXQ_INITIALIZED;
949
950         callout_stop(&rxq->refill_callout);
951
952         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
953                 rxq->flush_state = SFXGE_FLUSH_PENDING;
954
955                 SFXGE_EVQ_UNLOCK(evq);
956
957                 /* Flush the receive queue */
958                 if (efx_rx_qflush(rxq->common) != 0) {
959                         SFXGE_EVQ_LOCK(evq);
960                         rxq->flush_state = SFXGE_FLUSH_FAILED;
961                         break;
962                 }
963
964                 count = 0;
965                 do {
966                         /* Spin for 100 ms */
967                         DELAY(100000);
968
969                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
970                                 break;
971
972                 } while (++count < 20);
973
974                 SFXGE_EVQ_LOCK(evq);
975
976                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
977                         /* Flush timeout - neither done nor failed */
978                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
979                             device_get_nameunit(sc->dev), index);
980                         rxq->flush_state = SFXGE_FLUSH_DONE;
981                 }
982                 retry--;
983         }
984         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
985                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
986                     device_get_nameunit(sc->dev), index);
987                 rxq->flush_state = SFXGE_FLUSH_DONE;
988         }
989
990         rxq->pending = rxq->added;
991         sfxge_rx_qcomplete(rxq, B_TRUE);
992
993         KASSERT(rxq->completed == rxq->pending,
994             ("rxq->completed != rxq->pending"));
995
996         rxq->added = 0;
997         rxq->pushed = 0;
998         rxq->pending = 0;
999         rxq->completed = 0;
1000         rxq->loopback = 0;
1001
1002         /* Destroy the common code receive queue. */
1003         efx_rx_qdestroy(rxq->common);
1004
1005         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1006             EFX_RXQ_NBUFS(sc->rxq_entries));
1007
1008         SFXGE_EVQ_UNLOCK(evq);
1009 }
1010
1011 static int
1012 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1013 {
1014         struct sfxge_rxq *rxq;
1015         efsys_mem_t *esmp;
1016         struct sfxge_evq *evq;
1017         int rc;
1018
1019         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1020
1021         rxq = sc->rxq[index];
1022         esmp = &rxq->mem;
1023         evq = sc->evq[index];
1024
1025         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1026             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1027         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1028             ("evq->init_state != SFXGE_EVQ_STARTED"));
1029
1030         /* Program the buffer table. */
1031         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1032             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1033                 return (rc);
1034
1035         /* Create the common code receive queue. */
1036         if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1037             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1038             &rxq->common)) != 0)
1039                 goto fail;
1040
1041         SFXGE_EVQ_LOCK(evq);
1042
1043         /* Enable the receive queue. */
1044         efx_rx_qenable(rxq->common);
1045
1046         rxq->init_state = SFXGE_RXQ_STARTED;
1047         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1048
1049         /* Try to fill the queue from the pool. */
1050         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1051
1052         SFXGE_EVQ_UNLOCK(evq);
1053
1054         return (0);
1055
1056 fail:
1057         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1058             EFX_RXQ_NBUFS(sc->rxq_entries));
1059         return (rc);
1060 }
1061
1062 void
1063 sfxge_rx_stop(struct sfxge_softc *sc)
1064 {
1065         int index;
1066
1067         efx_mac_filter_default_rxq_clear(sc->enp);
1068
1069         /* Stop the receive queue(s) */
1070         index = sc->rxq_count;
1071         while (--index >= 0)
1072                 sfxge_rx_qstop(sc, index);
1073
1074         sc->rx_prefix_size = 0;
1075         sc->rx_buffer_size = 0;
1076
1077         efx_rx_fini(sc->enp);
1078 }
1079
1080 int
1081 sfxge_rx_start(struct sfxge_softc *sc)
1082 {
1083         struct sfxge_intr *intr;
1084         const efx_nic_cfg_t *encp;
1085         size_t hdrlen, align, reserved;
1086         int index;
1087         int rc;
1088
1089         intr = &sc->intr;
1090
1091         /* Initialize the common code receive module. */
1092         if ((rc = efx_rx_init(sc->enp)) != 0)
1093                 return (rc);
1094
1095         encp = efx_nic_cfg_get(sc->enp);
1096         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1097
1098         /* Calculate the receive packet buffer size. */ 
1099         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1100
1101         /* Ensure IP headers are 32bit aligned */
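        /* For example, with the 14-byte Ethernet header and a hypothetical
         * 16-byte RX prefix, hdrlen is 30, so two bytes of padding are
         * reserved and the IP header lands on a 4-byte boundary.
         */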
1102         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1103         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1104
1105         sc->rx_buffer_size += sc->rx_buffer_align;
1106
1107         /* Align end of packet buffer for RX DMA end padding */
1108         align = MAX(1, encp->enc_rx_buf_align_end);
1109         EFSYS_ASSERT(ISP2(align));
1110         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1111
1112         /*
1113          * Standard mbuf zones only guarantee pointer-size alignment;
1114          * we need extra space to align to the cache line
1115          */
1116         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1117
1118         /* Select zone for packet buffers */
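        /* With the default 1500-byte MTU the reserved size normally fits
         * in the 2 KB MCLBYTES zone; the page-sized and jumbo zones are
         * only needed for larger MTUs.
         */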
1119         if (reserved <= MCLBYTES)
1120                 sc->rx_cluster_size = MCLBYTES;
1121         else if (reserved <= MJUMPAGESIZE)
1122                 sc->rx_cluster_size = MJUMPAGESIZE;
1123         else if (reserved <= MJUM9BYTES)
1124                 sc->rx_cluster_size = MJUM9BYTES;
1125         else
1126                 sc->rx_cluster_size = MJUM16BYTES;
1127
1128         /*
1129          * Set up the scale table.  Enable all hash types and hash insertion.
1130          */
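        /* Without the kernel RSS option the table is filled round-robin
         * across the RX queues; with RSS, each entry follows
         * rss_get_indirection_to_bucket(), reduced modulo the queue count.
         */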
1131         for (index = 0; index < nitems(sc->rx_indir_table); index++)
1132 #ifdef RSS
1133                 sc->rx_indir_table[index] =
1134                         rss_get_indirection_to_bucket(index) % sc->rxq_count;
1135 #else
1136                 sc->rx_indir_table[index] = index % sc->rxq_count;
1137 #endif
1138         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1139                                        nitems(sc->rx_indir_table))) != 0)
1140                 goto fail;
1141         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1142             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1143             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1144
1145 #ifdef RSS
1146         rss_getkey(toep_key);
1147 #endif
1148         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1149                                        sizeof(toep_key))) != 0)
1150                 goto fail;
1151
1152         /* Start the receive queue(s). */
1153         for (index = 0; index < sc->rxq_count; index++) {
1154                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1155                         goto fail2;
1156         }
1157
1158         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1159                                             sc->intr.n_alloc > 1);
1160         if (rc != 0)
1161                 goto fail3;
1162
1163         return (0);
1164
1165 fail3:
1166 fail2:
1167         while (--index >= 0)
1168                 sfxge_rx_qstop(sc, index);
1169
1170 fail:
1171         efx_rx_fini(sc->enp);
1172
1173         return (rc);
1174 }
1175
1176 #ifdef SFXGE_LRO
1177
1178 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1179 {
1180         struct sfxge_lro_state *st = &rxq->lro;
1181         unsigned i;
1182
1183         st->conns_mask = lro_table_size - 1;
1184         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1185                 ("lro_table_size must be a power of 2"));
1186         st->sc = rxq->sc;
1187         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1188                            M_SFXGE, M_WAITOK);
1189         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1190                              M_SFXGE, M_WAITOK);
1191         for (i = 0; i <= st->conns_mask; ++i) {
1192                 TAILQ_INIT(&st->conns[i]);
1193                 st->conns_n[i] = 0;
1194         }
1195         LIST_INIT(&st->active_conns);
1196         TAILQ_INIT(&st->free_conns);
1197 }
1198
1199 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1200 {
1201         struct sfxge_lro_state *st = &rxq->lro;
1202         struct sfxge_lro_conn *c;
1203         unsigned i;
1204
1205         /* Return cleanly if sfxge_lro_init() has not been called. */
1206         if (st->conns == NULL)
1207                 return;
1208
1209         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1210
1211         for (i = 0; i <= st->conns_mask; ++i) {
1212                 while (!TAILQ_EMPTY(&st->conns[i])) {
1213                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1214                         sfxge_lro_drop(rxq, c);
1215                 }
1216         }
1217
1218         while (!TAILQ_EMPTY(&st->free_conns)) {
1219                 c = TAILQ_FIRST(&st->free_conns);
1220                 TAILQ_REMOVE(&st->free_conns, c, link);
1221                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1222                 free(c, M_SFXGE);
1223         }
1224
1225         free(st->conns_n, M_SFXGE);
1226         free(st->conns, M_SFXGE);
1227         st->conns = NULL;
1228 }
1229
1230 #else
1231
1232 static void
1233 sfxge_lro_init(struct sfxge_rxq *rxq)
1234 {
1235 }
1236
1237 static void
1238 sfxge_lro_fini(struct sfxge_rxq *rxq)
1239 {
1240 }
1241
1242 #endif  /* SFXGE_LRO */
1243
1244 static void
1245 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1246 {
1247         struct sfxge_rxq *rxq;
1248
1249         rxq = sc->rxq[index];
1250
1251         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1252             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1253
1254         /* Free the context array and the flow table. */
1255         free(rxq->queue, M_SFXGE);
1256         sfxge_lro_fini(rxq);
1257
1258         /* Release DMA memory. */
1259         sfxge_dma_free(&rxq->mem);
1260
1261         sc->rxq[index] = NULL;
1262
1263         free(rxq, M_SFXGE);
1264 }
1265
1266 static int
1267 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1268 {
1269         struct sfxge_rxq *rxq;
1270         struct sfxge_evq *evq;
1271         efsys_mem_t *esmp;
1272         int rc;
1273
1274         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1275
1276         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1277         rxq->sc = sc;
1278         rxq->index = index;
1279         rxq->entries = sc->rxq_entries;
1280         rxq->ptr_mask = rxq->entries - 1;
1281         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1282
1283         sc->rxq[index] = rxq;
1284         esmp = &rxq->mem;
1285
1286         evq = sc->evq[index];
1287
1288         /* Allocate and zero DMA space. */
1289         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1290                 return (rc);
1291
1292         /* Allocate buffer table entries. */
1293         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1294                                  &rxq->buf_base_id);
1295
1296         /* Allocate the context array and the flow table. */
1297         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1298             M_SFXGE, M_WAITOK | M_ZERO);
1299         sfxge_lro_init(rxq);
1300
1301         callout_init(&rxq->refill_callout, 1);
1302
1303         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1304
1305         return (0);
1306 }
1307
1308 static const struct {
1309         const char *name;
1310         size_t offset;
1311 } sfxge_rx_stats[] = {
1312 #define SFXGE_RX_STAT(name, member) \
1313         { #name, offsetof(struct sfxge_rxq, member) }
1314 #ifdef SFXGE_LRO
1315         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1316         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1317         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1318         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1319         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1320         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1321         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1322         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1323 #endif
1324 };
1325
1326 static int
1327 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1328 {
1329         struct sfxge_softc *sc = arg1;
1330         unsigned int id = arg2;
1331         unsigned int sum, index;
1332
1333         /* Sum across all RX queues */
1334         sum = 0;
1335         for (index = 0; index < sc->rxq_count; index++)
1336                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1337                                          sfxge_rx_stats[id].offset);
1338
1339         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1340 }
1341
1342 static void
1343 sfxge_rx_stat_init(struct sfxge_softc *sc)
1344 {
1345         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346         struct sysctl_oid_list *stat_list;
1347         unsigned int id;
1348
1349         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1350
1351         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1352                 SYSCTL_ADD_PROC(
1353                         ctx, stat_list,
1354                         OID_AUTO, sfxge_rx_stats[id].name,
1355                         CTLTYPE_UINT|CTLFLAG_RD,
1356                         sc, id, sfxge_rx_stat_handler, "IU",
1357                         "");
1358         }
1359 }
1360
1361 void
1362 sfxge_rx_fini(struct sfxge_softc *sc)
1363 {
1364         int index;
1365
1366         index = sc->rxq_count;
1367         while (--index >= 0)
1368                 sfxge_rx_qfini(sc, index);
1369
1370         sc->rxq_count = 0;
1371 }
1372
1373 int
1374 sfxge_rx_init(struct sfxge_softc *sc)
1375 {
1376         struct sfxge_intr *intr;
1377         int index;
1378         int rc;
1379
1380 #ifdef SFXGE_LRO
1381         if (!ISP2(lro_table_size)) {
1382                 log(LOG_ERR, "%s=%u must be a power of 2",
1383                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1384                 rc = EINVAL;
1385                 goto fail_lro_table_size;
1386         }
1387
1388         if (lro_idle_ticks == 0)
1389                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1390 #endif
1391
1392         intr = &sc->intr;
1393
1394         sc->rxq_count = intr->n_alloc;
1395
1396         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1397             ("intr->state != SFXGE_INTR_INITIALIZED"));
1398
1399         /* Initialize the receive queue(s) - one per interrupt. */
1400         for (index = 0; index < sc->rxq_count; index++) {
1401                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1402                         goto fail;
1403         }
1404
1405         sfxge_rx_stat_init(sc);
1406
1407         return (0);
1408
1409 fail:
1410         /* Tear down the receive queue(s). */
1411         while (--index >= 0)
1412                 sfxge_rx_qfini(sc, index);
1413
1414         sc->rxq_count = 0;
1415
1416 #ifdef SFXGE_LRO
1417 fail_lro_table_size:
1418 #endif
1419         return (rc);
1420 }