1 /*-
2  * Copyright (c) 2010-2015 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/types.h>
38 #include <sys/mbuf.h>
39 #include <sys/smp.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/syslog.h>
43 #include <sys/limits.h>
45
46 #include <net/ethernet.h>
47 #include <net/if.h>
48 #include <net/if_vlan_var.h>
49
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/tcp.h>
54
55 #include <machine/in_cksum.h>
56
57 #include "common/efx.h"
58
60 #include "sfxge.h"
61 #include "sfxge_rx.h"
62
63 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
64
65 #ifdef SFXGE_LRO
66
67 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
68             "Large receive offload (LRO) parameters");
69
70 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
71
72 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
73  * means we can accelerate a larger number of streams.
74  */
75 static unsigned lro_table_size = 128;
76 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
77 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
78             &lro_table_size, 0,
79             "Size of the LRO hash table (must be a power of 2)");
80
81 /* Maximum length of a hash chain.  If chains get too long then the lookup
82  * time increases and may exceed the benefit of LRO.
83  */
84 static unsigned lro_chain_max = 20;
85 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
87             &lro_chain_max, 0,
88             "The maximum length of a hash chain");
89
90 /* Maximum time (in ticks) that a connection can be idle before its LRO
91  * state is discarded.
92  */
93 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
94 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
96             &lro_idle_ticks, 0,
97             "The maximum time (in ticks) that a connection can be idle "
98             "before its LRO state is discarded");
99
100 /* Number of packets with payload that must arrive in-order before a
101  * connection is eligible for LRO.  The idea is we should avoid coalescing
102  * segments when the sender is in slow-start because reducing the ACK rate
103  * can damage performance.
104  */
105 static int lro_slow_start_packets = 2000;
106 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
107 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
108             &lro_slow_start_packets, 0,
109             "Number of packets with payload that must arrive in-order before "
110             "a connection is eligible for LRO");
111
112 /* Number of packets with payload that must arrive in-order following loss
113  * before a connection is eligible for LRO.  The idea is we should avoid
114  * coalescing segments when the sender is recovering from loss, because
115  * reducing the ACK rate can damage performance.
116  */
117 static int lro_loss_packets = 20;
118 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
119 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
120             &lro_loss_packets, 0,
121             "Number of packets with payload that must arrive in-order "
122             "following loss before a connection is eligible for LRO");
123
124 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
125 #define SFXGE_LRO_L2_ID_VLAN 0x4000
126 #define SFXGE_LRO_L2_ID_IPV6 0x8000
127 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
128 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
129
130 /* Compare IPv6 addresses, avoiding conditional branches */
131 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
132                                    const struct in6_addr *right)
133 {
134 #if LONG_BIT == 64
135         const uint64_t *left64 = (const uint64_t *)left;
136         const uint64_t *right64 = (const uint64_t *)right;
137         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
138 #else
139         return (left->s6_addr32[0] - right->s6_addr32[0]) |
140                (left->s6_addr32[1] - right->s6_addr32[1]) |
141                (left->s6_addr32[2] - right->s6_addr32[2]) |
142                (left->s6_addr32[3] - right->s6_addr32[3]);
143 #endif
144 }
145
146 #endif  /* SFXGE_LRO */
147
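/* Note that a flush of this Rx queue has completed. */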
148 void
149 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
150 {
151
152         rxq->flush_state = SFXGE_FLUSH_DONE;
153 }
154
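/* Note that a flush of this Rx queue has failed. */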
155 void
156 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
157 {
158
159         rxq->flush_state = SFXGE_FLUSH_FAILED;
160 }
161
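/* Key used for Toeplitz receive-side scaling (RSS) hashing. */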
162 static uint8_t toep_key[] = {
163         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
168 };
169
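/* Callout handler: post a software event to the queue's event queue so
 * that the refill is retried in the event-processing context.
 */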
170 static void
171 sfxge_rx_post_refill(void *arg)
172 {
173         struct sfxge_rxq *rxq = arg;
174         struct sfxge_softc *sc;
175         unsigned int index;
176         struct sfxge_evq *evq;
177         uint16_t magic;
178
179         sc = rxq->sc;
180         index = rxq->index;
181         evq = sc->evq[index];
182
183         magic = SFXGE_MAGIC_RX_QREFILL | index;
184
185         /* This is guaranteed due to the start/stop order of rx and ev */
186         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
187             ("evq not started"));
188         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
189             ("rxq not started"));
190         efx_ev_qpost(evq->common, magic);
191 }
192
193 static void
194 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
195 {
196         /* Initially retry after 100 ms, but back off in case of
197          * repeated failures as we probably have to wait for the
198          * administrator to raise the pool limit. */
199         if (retrying)
200                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
201         else
202                 rxq->refill_delay = hz / 10;
203
204         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
205                              sfxge_rx_post_refill, rxq);
206 }
207
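/* Allocate an mbuf and attach a packet buffer from the driver's receive
 * buffer zone.  Returns NULL if either allocation fails.
 */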
208 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
209 {
210         struct mb_args args;
211         struct mbuf *m;
212
213         /* Allocate mbuf structure */
214         args.flags = M_PKTHDR;
215         args.type = MT_DATA;
216         m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
217
218         /* Allocate (and attach) packet buffer */
219         if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
220                 uma_zfree(zone_mbuf, m);
221                 m = NULL;
222         }
223
224         return (m);
225 }
226
227 #define SFXGE_REFILL_BATCH  64
228
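/* Fill the Rx queue with buffers, up to the given target, posting
 * descriptors to the hardware in batches of SFXGE_REFILL_BATCH.  If
 * buffer allocation fails, a refill is rescheduled via the callout.
 */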
229 static void
230 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
231 {
232         struct sfxge_softc *sc;
233         unsigned int index;
234         struct sfxge_evq *evq;
235         unsigned int batch;
236         unsigned int rxfill;
237         unsigned int mblksize;
238         int ntodo;
239         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
240
241         sc = rxq->sc;
242         index = rxq->index;
243         evq = sc->evq[index];
244
245         prefetch_read_many(sc->enp);
246         prefetch_read_many(rxq->common);
247
248         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
249
250         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
251                 return;
252
253         rxfill = rxq->added - rxq->completed;
254         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
255             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
256         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
257         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
258             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
259
260         if (ntodo == 0)
261                 return;
262
263         batch = 0;
264         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
265         while (ntodo-- > 0) {
266                 unsigned int id;
267                 struct sfxge_rx_sw_desc *rx_desc;
268                 bus_dma_segment_t seg;
269                 struct mbuf *m;
270
271                 id = (rxq->added + batch) & rxq->ptr_mask;
272                 rx_desc = &rxq->queue[id];
273                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
274
275                 rx_desc->flags = EFX_DISCARD;
276                 m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
277                 if (m == NULL)
278                         break;
279
280                 /* m_len specifies length of area to be mapped for DMA */
281                 m->m_len  = mblksize;
282                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
283                 m->m_data += sc->rx_buffer_align;
284
285                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
286                 addr[batch++] = seg.ds_addr;
287
288                 if (batch == SFXGE_REFILL_BATCH) {
289                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
290                             rxq->completed, rxq->added);
291                         rxq->added += batch;
292                         batch = 0;
293                 }
294         }
295
296         if (ntodo != 0)
297                 sfxge_rx_schedule_refill(rxq, retrying);
298
299         if (batch != 0) {
300                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
301                     rxq->completed, rxq->added);
302                 rxq->added += batch;
303         }
304
305         /* Make the descriptors visible to the hardware */
306         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
307                         BUS_DMASYNC_PREWRITE);
308
309         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
310
311         /* The queue could still be empty if no descriptors were actually
312          * pushed, in which case there will be no event to cause the next
313          * refill, so we must schedule a refill ourselves.
314          */
315         if (rxq->pushed == rxq->completed) {
316                 sfxge_rx_schedule_refill(rxq, retrying);
317         }
318 }
319
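/* Refill a started Rx queue to its limit. */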
320 void
321 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
322 {
323
324         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
325                 return;
326
327         /* Make sure the queue is full */
328         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
329 }
330
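/* Hand a completed mbuf to the network stack. */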
331 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
332 {
333         struct ifnet *ifp = sc->ifnet;
334
335         m->m_pkthdr.rcvif = ifp;
336         m->m_pkthdr.csum_data = 0xffff;
337         ifp->if_input(ifp, m);
338 }
339
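/* Convert the checksum and RSS hash information carried with the
 * descriptor into mbuf metadata and deliver the packet to the stack.
 */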
340 static void
341 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
342 {
343         struct mbuf *m = rx_desc->mbuf;
344         int flags = rx_desc->flags;
345         int csum_flags;
346
347         /* Convert checksum flags */
348         csum_flags = (flags & EFX_CKSUM_IPV4) ?
349                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
350         if (flags & EFX_CKSUM_TCPUDP)
351                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
352
353         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
354                 m->m_pkthdr.flowid =
355                         efx_psuedo_hdr_hash_get(sc->enp,
356                                                 EFX_RX_HASHALG_TOEPLITZ,
357                                                 mtod(m, uint8_t *));
358                 /* The hash covers a 4-tuple for TCP only */
359                 M_HASHTYPE_SET(m,
360                     (flags & EFX_PKT_IPV4) ?
361                         ((flags & EFX_PKT_TCP) ?
362                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
363                         ((flags & EFX_PKT_TCP) ?
364                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
365         }
366         m->m_data += sc->rx_prefix_size;
367         m->m_len = rx_desc->size - sc->rx_prefix_size;
368         m->m_pkthdr.len = m->m_len;
369         m->m_pkthdr.csum_flags = csum_flags;
370         __sfxge_rx_deliver(sc, rx_desc->mbuf);
371
372         rx_desc->flags = EFX_DISCARD;
373         rx_desc->mbuf = NULL;
374 }
375
376 #ifdef SFXGE_LRO
377
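/* Deliver the coalesced packet held for a connection: restore the IP
 * length fields, copy the latest TCP window/ACK and options, set the
 * checksum and RSS metadata, and pass the mbuf chain to the stack.
 */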
378 static void
379 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
380 {
381         struct sfxge_softc *sc = st->sc;
382         struct mbuf *m = c->mbuf;
383         struct tcphdr *c_th;
384         int csum_flags;
385
386         KASSERT(m, ("no mbuf to deliver"));
387
388         ++st->n_bursts;
389
390         /* Finish off packet munging and recalculate IP header checksum. */
391         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
392                 struct ip *iph = c->nh;
393                 iph->ip_len = htons(iph->ip_len);
394                 iph->ip_sum = 0;
395                 iph->ip_sum = in_cksum_hdr(iph);
396                 c_th = (struct tcphdr *)(iph + 1);
397                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
398                               CSUM_IP_CHECKED | CSUM_IP_VALID);
399         } else {
400                 struct ip6_hdr *iph = c->nh;
401                 iph->ip6_plen = htons(iph->ip6_plen);
402                 c_th = (struct tcphdr *)(iph + 1);
403                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
404         }
405
406         c_th->th_win = c->th_last->th_win;
407         c_th->th_ack = c->th_last->th_ack;
408         if (c_th->th_off == c->th_last->th_off) {
409                 /* Copy TCP options (take care to avoid going negative). */
410                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
411                 memcpy(c_th + 1, c->th_last + 1, optlen);
412         }
413
414         m->m_pkthdr.flowid = c->conn_hash;
415         M_HASHTYPE_SET(m,
416             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
417                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
418
419         m->m_pkthdr.csum_flags = csum_flags;
420         __sfxge_rx_deliver(sc, m);
421
422         c->mbuf = NULL;
423         c->delivered = 1;
424 }
425
426 /* Drop the given connection, and add it to the free list. */
427 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
428 {
429         unsigned bucket;
430
431         KASSERT(!c->mbuf, ("found orphaned mbuf"));
432
433         if (c->next_buf.mbuf != NULL) {
434                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
435                 LIST_REMOVE(c, active_link);
436         }
437
438         bucket = c->conn_hash & rxq->lro.conns_mask;
439         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
440         --rxq->lro.conns_n[bucket];
441         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
442         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
443 }
444
445 /* Stop tracking connections that have gone idle in order to keep hash
446  * chains short.
447  */
448 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
449 {
450         struct sfxge_lro_conn *c;
451         unsigned i;
452
453         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
454                 ("found active connections"));
455
456         rxq->lro.last_purge_ticks = now;
457         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
458                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
459                         continue;
460
461                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
462                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
463                         ++rxq->lro.n_drop_idle;
464                         sfxge_lro_drop(rxq, c);
465                 }
466         }
467 }
468
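/* Append a new segment to a connection's coalesced packet, updating the
 * IP length and TCP state, and deliver early if another segment could
 * overflow the IP length field.
 */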
469 static void
470 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
471                 struct mbuf *mbuf, struct tcphdr *th)
472 {
473         struct tcphdr *c_th;
474
475         /* Tack the new mbuf onto the chain. */
476         KASSERT(!mbuf->m_next, ("mbuf already chained"));
477         c->mbuf_tail->m_next = mbuf;
478         c->mbuf_tail = mbuf;
479
480         /* Increase length appropriately */
481         c->mbuf->m_pkthdr.len += mbuf->m_len;
482
483         /* Update the connection state flags */
484         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
485                 struct ip *iph = c->nh;
486                 iph->ip_len += mbuf->m_len;
487                 c_th = (struct tcphdr *)(iph + 1);
488         } else {
489                 struct ip6_hdr *iph = c->nh;
490                 iph->ip6_plen += mbuf->m_len;
491                 c_th = (struct tcphdr *)(iph + 1);
492         }
493         c_th->th_flags |= (th->th_flags & TH_PUSH);
494         c->th_last = th;
495         ++st->n_merges;
496
497         /* Pass packet up now if another segment could overflow the IP
498          * length.
499          */
500         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
501                 sfxge_lro_deliver(st, c);
502 }
503
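/* Start a new coalesced packet for a connection from this segment. */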
504 static void
505 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
506                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
507 {
508         /* Start the chain */
509         c->mbuf = mbuf;
510         c->mbuf_tail = c->mbuf;
511         c->nh = nh;
512         c->th_last = th;
513
514         mbuf->m_pkthdr.len = mbuf->m_len;
515
516         /* Mangle header fields for later processing */
517         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
518                 struct ip *iph = nh;
519                 iph->ip_len = ntohs(iph->ip_len);
520         } else {
521                 struct ip6_hdr *iph = nh;
522                 iph->ip6_plen = ntohs(iph->ip6_plen);
523         }
524 }
525
526 /* Try to merge or otherwise hold or deliver (as appropriate) the
527  * packet buffered for this connection (c->next_buf).  Return a flag
528  * indicating whether the connection is still active for LRO purposes.
529  */
530 static int
531 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
532 {
533         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
534         char *eh = c->next_eh;
535         int data_length, hdr_length, dont_merge;
536         unsigned th_seq, pkt_length;
537         struct tcphdr *th;
538         unsigned now;
539
540         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
541                 struct ip *iph = c->next_nh;
542                 th = (struct tcphdr *)(iph + 1);
543                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
544         } else {
545                 struct ip6_hdr *iph = c->next_nh;
546                 th = (struct tcphdr *)(iph + 1);
547                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
548         }
549
550         hdr_length = (char *) th + th->th_off * 4 - eh;
551         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
552                        hdr_length);
553         th_seq = ntohl(th->th_seq);
554         dont_merge = ((data_length <= 0)
555                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
556
557         /* Check for options other than aligned timestamp. */
558         if (th->th_off != 5) {
559                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
560                 if (th->th_off == 8 &&
561                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
562                                         (TCPOPT_NOP << 16) |
563                                         (TCPOPT_TIMESTAMP << 8) |
564                                         TCPOLEN_TIMESTAMP)) {
565                         /* timestamp option -- okay */
566                 } else {
567                         dont_merge = 1;
568                 }
569         }
570
571         if (__predict_false(th_seq != c->next_seq)) {
572                 /* Out-of-order, so start counting again. */
573                 if (c->mbuf != NULL)
574                         sfxge_lro_deliver(&rxq->lro, c);
575                 c->n_in_order_pkts -= lro_loss_packets;
576                 c->next_seq = th_seq + data_length;
577                 ++rxq->lro.n_misorder;
578                 goto deliver_buf_out;
579         }
580         c->next_seq = th_seq + data_length;
581
582         now = ticks;
583         if (now - c->last_pkt_ticks > lro_idle_ticks) {
584                 ++rxq->lro.n_drop_idle;
585                 if (c->mbuf != NULL)
586                         sfxge_lro_deliver(&rxq->lro, c);
587                 sfxge_lro_drop(rxq, c);
588                 return (0);
589         }
590         c->last_pkt_ticks = ticks;
591
592         if (c->n_in_order_pkts < lro_slow_start_packets) {
593                 /* May be in slow-start, so don't merge. */
594                 ++rxq->lro.n_slow_start;
595                 ++c->n_in_order_pkts;
596                 goto deliver_buf_out;
597         }
598
599         if (__predict_false(dont_merge)) {
600                 if (c->mbuf != NULL)
601                         sfxge_lro_deliver(&rxq->lro, c);
602                 if (th->th_flags & (TH_FIN | TH_RST)) {
603                         ++rxq->lro.n_drop_closed;
604                         sfxge_lro_drop(rxq, c);
605                         return (0);
606                 }
607                 goto deliver_buf_out;
608         }
609
610         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
611
612         if (__predict_true(c->mbuf != NULL)) {
613                 /* Remove headers and any padding */
614                 rx_buf->mbuf->m_data += hdr_length;
615                 rx_buf->mbuf->m_len = data_length;
616
617                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
618         } else {
619                 /* Remove any padding */
620                 rx_buf->mbuf->m_len = pkt_length;
621
622                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
623         }
624
625         rx_buf->mbuf = NULL;
626         return (1);
627
628  deliver_buf_out:
629         sfxge_rx_deliver(rxq->sc, rx_buf);
630         return (1);
631 }
632
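/* Start tracking a new connection for LRO, reusing an entry from the
 * free list if one is available.
 */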
633 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
634                                uint16_t l2_id, void *nh, struct tcphdr *th)
635 {
636         unsigned bucket = conn_hash & st->conns_mask;
637         struct sfxge_lro_conn *c;
638
639         if (st->conns_n[bucket] >= lro_chain_max) {
640                 ++st->n_too_many;
641                 return;
642         }
643
644         if (!TAILQ_EMPTY(&st->free_conns)) {
645                 c = TAILQ_FIRST(&st->free_conns);
646                 TAILQ_REMOVE(&st->free_conns, c, link);
647         } else {
648                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
649                 if (c == NULL)
650                         return;
651                 c->mbuf = NULL;
652                 c->next_buf.mbuf = NULL;
653         }
654
655         /* Create the connection tracking data */
656         ++st->conns_n[bucket];
657         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
658         c->l2_id = l2_id;
659         c->conn_hash = conn_hash;
660         c->source = th->th_sport;
661         c->dest = th->th_dport;
662         c->n_in_order_pkts = 0;
663         c->last_pkt_ticks = *(volatile int *)&ticks;
664         c->delivered = 0;
665         ++st->n_new_stream;
666         /* NB. We don't initialise c->next_seq, and it doesn't matter what
667          * value it has.  Most likely the next packet received for this
668          * connection will not match -- no harm done.
669          */
670 }
671
672 /* Process mbuf and decide whether to dispatch it to the stack now or
673  * later.
674  */
675 static void
676 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
677 {
678         struct sfxge_softc *sc = rxq->sc;
679         struct mbuf *m = rx_buf->mbuf;
680         struct ether_header *eh;
681         struct sfxge_lro_conn *c;
682         uint16_t l2_id;
683         uint16_t l3_proto;
684         void *nh;
685         struct tcphdr *th;
686         uint32_t conn_hash;
687         unsigned bucket;
688
689         /* Get the hardware hash */
690         conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
691                                             EFX_RX_HASHALG_TOEPLITZ,
692                                             mtod(m, uint8_t *));
693
694         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
695         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
696                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
697                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
698                         SFXGE_LRO_L2_ID_VLAN;
699                 l3_proto = veh->evl_proto;
700                 nh = veh + 1;
701         } else {
702                 l2_id = 0;
703                 l3_proto = eh->ether_type;
704                 nh = eh + 1;
705         }
706
707         /* Check whether this is a suitable packet (unfragmented
708          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
709          * length, and compute a hash if necessary.  If not, return.
710          */
711         if (l3_proto == htons(ETHERTYPE_IP)) {
712                 struct ip *iph = nh;
713
714                 KASSERT(iph->ip_p == IPPROTO_TCP,
715                     ("IPv4 protocol is not TCP, but packet marker is set"));
716                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
717                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
718                         goto deliver_now;
719                 th = (struct tcphdr *)(iph + 1);
720         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
721                 struct ip6_hdr *iph = nh;
722
723                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
724                     ("IPv6 next header is not TCP, but packet marker is set"));
725                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
726                 th = (struct tcphdr *)(iph + 1);
727         } else {
728                 goto deliver_now;
729         }
730
731         bucket = conn_hash & rxq->lro.conns_mask;
732
733         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
734                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
735                         continue;
736                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
737                         continue;
738                 if (c->mbuf != NULL) {
739                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
740                                 struct ip *c_iph, *iph = nh;
741                                 c_iph = c->nh;
742                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
743                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
744                                         continue;
745                         } else {
746                                 struct ip6_hdr *c_iph, *iph = nh;
747                                 c_iph = c->nh;
748                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
749                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
750                                         continue;
751                         }
752                 }
753
754                 /* Re-insert at head of list to reduce lookup time. */
755                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
756                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
757
758                 if (c->next_buf.mbuf != NULL) {
759                         if (!sfxge_lro_try_merge(rxq, c))
760                                 goto deliver_now;
761                 } else {
762                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
763                             active_link);
764                 }
765                 c->next_buf = *rx_buf;
766                 c->next_eh = eh;
767                 c->next_nh = nh;
768
769                 rx_buf->mbuf = NULL;
770                 rx_buf->flags = EFX_DISCARD;
771                 return;
772         }
773
774         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
775  deliver_now:
776         sfxge_rx_deliver(sc, rx_buf);
777 }
778
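/* At the end of an event-queue poll, deliver or merge the packets held
 * for all active connections and periodically purge idle connections.
 */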
779 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
780 {
781         struct sfxge_lro_state *st = &rxq->lro;
782         struct sfxge_lro_conn *c;
783         unsigned t;
784
785         while (!LIST_EMPTY(&st->active_conns)) {
786                 c = LIST_FIRST(&st->active_conns);
787                 if (!c->delivered && c->mbuf != NULL)
788                         sfxge_lro_deliver(st, c);
789                 if (sfxge_lro_try_merge(rxq, c)) {
790                         if (c->mbuf != NULL)
791                                 sfxge_lro_deliver(st, c);
792                         LIST_REMOVE(c, active_link);
793                 }
794                 c->delivered = 0;
795         }
796
797         t = *(volatile int *)&ticks;
798         if (__predict_false(t != st->last_purge_ticks))
799                 sfxge_lro_purge_idle(rxq, t);
800 }
801
802 #else   /* !SFXGE_LRO */
803
804 static void
805 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
806 {
807 }
808
809 static void
810 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
811 {
812 }
813
814 #endif  /* SFXGE_LRO */
815
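/* Process received packets up to the pending index: drop discards and
 * loopback packets, apply checksum capabilities, pass packets to LRO or
 * directly to the stack, and top up the queue if it has drained below
 * the refill threshold.
 */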
816 void
817 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
818 {
819         struct sfxge_softc *sc = rxq->sc;
820         int if_capenable = sc->ifnet->if_capenable;
821         int lro_enabled = if_capenable & IFCAP_LRO;
822         unsigned int index;
823         struct sfxge_evq *evq;
824         unsigned int completed;
825         unsigned int level;
826         struct mbuf *m;
827         struct sfxge_rx_sw_desc *prev = NULL;
828
829         index = rxq->index;
830         evq = sc->evq[index];
831
832         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
833
834         completed = rxq->completed;
835         while (completed != rxq->pending) {
836                 unsigned int id;
837                 struct sfxge_rx_sw_desc *rx_desc;
838
839                 id = completed++ & rxq->ptr_mask;
840                 rx_desc = &rxq->queue[id];
841                 m = rx_desc->mbuf;
842
843                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
844                         goto discard;
845
846                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
847                         goto discard;
848
849                 /* Read the length from the pseudo header if required */
850                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
851                         uint16_t tmp_size;
852                         int rc;
853                         rc = efx_psuedo_hdr_pkt_length_get(sc->enp, 
854                                                            mtod(m, uint8_t *),
855                                                            &tmp_size);
856                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
857                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
858                 }
859
860                 prefetch_read_many(mtod(m, caddr_t));
861
862                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
863                 case EFX_PKT_IPV4:
864                         if (~if_capenable & IFCAP_RXCSUM)
865                                 rx_desc->flags &=
866                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
867                         break;
868                 case EFX_PKT_IPV6:
869                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
870                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
871                         break;
872                 case 0:
873                         /* Check for loopback packets */
874                         {
875                                 struct ether_header *etherhp;
876
877                                 /*LINTED*/
878                                 etherhp = mtod(m, struct ether_header *);
879
880                                 if (etherhp->ether_type ==
881                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
882                                         EFSYS_PROBE(loopback);
883
884                                         rxq->loopback++;
885                                         goto discard;
886                                 }
887                         }
888                         break;
889                 default:
890                         KASSERT(B_FALSE,
891                             ("Rx descriptor with both IPv4 and IPv6 flags"));
892                         goto discard;
893                 }
894
895                 /* Pass packet up the stack or into LRO (pipelined) */
896                 if (prev != NULL) {
897                         if (lro_enabled &&
898                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
899                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
900                                 sfxge_lro(rxq, prev);
901                         else
902                                 sfxge_rx_deliver(sc, prev);
903                 }
904                 prev = rx_desc;
905                 continue;
906
907 discard:
908                 /* Return the packet to the pool */
909                 m_free(m);
910                 rx_desc->mbuf = NULL;
911         }
912         rxq->completed = completed;
913
914         level = rxq->added - rxq->completed;
915
916         /* Pass last packet up the stack or into LRO */
917         if (prev != NULL) {
918                 if (lro_enabled &&
919                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
920                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
921                         sfxge_lro(rxq, prev);
922                 else
923                         sfxge_rx_deliver(sc, prev);
924         }
925
926         /*
927          * If there are any pending flows and this is the end of the
928          * poll then they must be completed.
929          */
930         if (eop)
931                 sfxge_lro_end_of_burst(rxq);
932
933         /* Top up the queue if necessary */
934         if (level < rxq->refill_threshold)
935                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
936 }
937
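/* Stop an Rx queue: flush it (retrying on timeout), complete any
 * outstanding descriptors and destroy the common code queue state.
 */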
938 static void
939 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
940 {
941         struct sfxge_rxq *rxq;
942         struct sfxge_evq *evq;
943         unsigned int count;
944         unsigned int retry = 3;
945
946         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
947
948         rxq = sc->rxq[index];
949         evq = sc->evq[index];
950
951         SFXGE_EVQ_LOCK(evq);
952
953         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
954             ("rxq not started"));
955
956         rxq->init_state = SFXGE_RXQ_INITIALIZED;
957
958         callout_stop(&rxq->refill_callout);
959
960         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
961                 rxq->flush_state = SFXGE_FLUSH_PENDING;
962
963                 SFXGE_EVQ_UNLOCK(evq);
964
965                 /* Flush the receive queue */
966                 if (efx_rx_qflush(rxq->common) != 0) {
967                         SFXGE_EVQ_LOCK(evq);
968                         rxq->flush_state = SFXGE_FLUSH_FAILED;
969                         break;
970                 }
971
972                 count = 0;
973                 do {
974                         /* Spin for 100 ms */
975                         DELAY(100000);
976
977                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
978                                 break;
979
980                 } while (++count < 20);
981
982                 SFXGE_EVQ_LOCK(evq);
983
984                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
985                         /* Flush timeout - neither done nor failed */
986                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
987                             device_get_nameunit(sc->dev), index);
988                         rxq->flush_state = SFXGE_FLUSH_DONE;
989                 }
990                 retry--;
991         }
992         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
993                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
994                     device_get_nameunit(sc->dev), index);
995                 rxq->flush_state = SFXGE_FLUSH_DONE;
996         }
997
998         rxq->pending = rxq->added;
999         sfxge_rx_qcomplete(rxq, B_TRUE);
1000
1001         KASSERT(rxq->completed == rxq->pending,
1002             ("rxq->completed != rxq->pending"));
1003
1004         rxq->added = 0;
1005         rxq->pushed = 0;
1006         rxq->pending = 0;
1007         rxq->completed = 0;
1008         rxq->loopback = 0;
1009
1010         /* Destroy the common code receive queue. */
1011         efx_rx_qdestroy(rxq->common);
1012
1013         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1014             EFX_RXQ_NBUFS(sc->rxq_entries));
1015
1016         SFXGE_EVQ_UNLOCK(evq);
1017 }
1018
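/* Start an Rx queue: program the buffer table, create and enable the
 * common code queue and fill it with receive buffers.
 */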
1019 static int
1020 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1021 {
1022         struct sfxge_rxq *rxq;
1023         efsys_mem_t *esmp;
1024         struct sfxge_evq *evq;
1025         int rc;
1026
1027         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1028
1029         rxq = sc->rxq[index];
1030         esmp = &rxq->mem;
1031         evq = sc->evq[index];
1032
1033         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1034             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1035         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1036             ("evq->init_state != SFXGE_EVQ_STARTED"));
1037
1038         /* Program the buffer table. */
1039         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1040             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1041                 return (rc);
1042
1043         /* Create the common code receive queue. */
1044         if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1045             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1046             &rxq->common)) != 0)
1047                 goto fail;
1048
1049         SFXGE_EVQ_LOCK(evq);
1050
1051         /* Enable the receive queue. */
1052         efx_rx_qenable(rxq->common);
1053
1054         rxq->init_state = SFXGE_RXQ_STARTED;
1055         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1056
1057         /* Try to fill the queue from the pool. */
1058         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1059
1060         SFXGE_EVQ_UNLOCK(evq);
1061
1062         return (0);
1063
1064 fail:
1065         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1066             EFX_RXQ_NBUFS(sc->rxq_entries));
1067         return (rc);
1068 }
1069
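/* Stop the receive path: clear the default Rx queue filter, stop each
 * queue and tear down the common code receive module.
 */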
1070 void
1071 sfxge_rx_stop(struct sfxge_softc *sc)
1072 {
1073         int index;
1074
1075         efx_mac_filter_default_rxq_clear(sc->enp);
1076
1077         /* Stop the receive queue(s) */
1078         index = sc->rxq_count;
1079         while (--index >= 0)
1080                 sfxge_rx_qstop(sc, index);
1081
1082         sc->rx_prefix_size = 0;
1083         sc->rx_buffer_size = 0;
1084
1085         efx_rx_fini(sc->enp);
1086 }
1087
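/* Start the receive path: size the packet buffers, set up the RSS
 * indirection table and hash key, start each Rx queue and install the
 * default Rx queue filter.
 */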
1088 int
1089 sfxge_rx_start(struct sfxge_softc *sc)
1090 {
1091         struct sfxge_intr *intr;
1092         const efx_nic_cfg_t *encp;
1093         size_t hdrlen, align, reserved;
1094         int index;
1095         int rc;
1096
1097         intr = &sc->intr;
1098
1099         /* Initialize the common code receive module. */
1100         if ((rc = efx_rx_init(sc->enp)) != 0)
1101                 return (rc);
1102
1103         encp = efx_nic_cfg_get(sc->enp);
1104         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1105
1106         /* Calculate the receive packet buffer size. */ 
1107         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1108
1109         /* Ensure IP headers are 32-bit aligned */
1110         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1111         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1112
1113         sc->rx_buffer_size += sc->rx_buffer_align;
1114
1115         /* Align end of packet buffer for RX DMA end padding */
1116         align = MAX(1, encp->enc_rx_buf_align_end);
1117         EFSYS_ASSERT(ISP2(align));
1118         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1119
1120         /* 
1121          * Standard mbuf zones only guarantee pointer-size alignment;
1122          * we need extra space to align to the cache line
1123          */
1124         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1125
1126         /* Select zone for packet buffers */
1127         if (reserved <= MCLBYTES)
1128                 sc->rx_buffer_zone = zone_clust;
1129         else if (reserved <= MJUMPAGESIZE)
1130                 sc->rx_buffer_zone = zone_jumbop;
1131         else if (reserved <= MJUM9BYTES)
1132                 sc->rx_buffer_zone = zone_jumbo9;
1133         else
1134                 sc->rx_buffer_zone = zone_jumbo16;
1135
1136         /*
1137          * Set up the scale table.  Enable all hash types and hash insertion.
1138          */
1139         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1140                 sc->rx_indir_table[index] = index % sc->rxq_count;
1141         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1142                                        SFXGE_RX_SCALE_MAX)) != 0)
1143                 goto fail;
1144         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1145             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1146             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1147
1148         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1149                                        sizeof(toep_key))) != 0)
1150                 goto fail;
1151
1152         /* Start the receive queue(s). */
1153         for (index = 0; index < sc->rxq_count; index++) {
1154                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1155                         goto fail2;
1156         }
1157
1158         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1159                                             sc->intr.n_alloc > 1);
1160         if (rc != 0)
1161                 goto fail3;
1162
1163         return (0);
1164
1165 fail3:
1166 fail2:
1167         while (--index >= 0)
1168                 sfxge_rx_qstop(sc, index);
1169
1170 fail:
1171         efx_rx_fini(sc->enp);
1172
1173         return (rc);
1174 }
1175
1176 #ifdef SFXGE_LRO
1177
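/* Allocate and initialize the LRO hash table and connection lists for
 * an Rx queue.
 */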
1178 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1179 {
1180         struct sfxge_lro_state *st = &rxq->lro;
1181         unsigned i;
1182
1183         st->conns_mask = lro_table_size - 1;
1184         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1185                 ("lro_table_size must be a power of 2"));
1186         st->sc = rxq->sc;
1187         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1188                            M_SFXGE, M_WAITOK);
1189         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1190                              M_SFXGE, M_WAITOK);
1191         for (i = 0; i <= st->conns_mask; ++i) {
1192                 TAILQ_INIT(&st->conns[i]);
1193                 st->conns_n[i] = 0;
1194         }
1195         LIST_INIT(&st->active_conns);
1196         TAILQ_INIT(&st->free_conns);
1197 }
1198
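/* Release all LRO connection state for an Rx queue. */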
1199 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1200 {
1201         struct sfxge_lro_state *st = &rxq->lro;
1202         struct sfxge_lro_conn *c;
1203         unsigned i;
1204
1205         /* Return cleanly if sfxge_lro_init() has not been called. */
1206         if (st->conns == NULL)
1207                 return;
1208
1209         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1210
1211         for (i = 0; i <= st->conns_mask; ++i) {
1212                 while (!TAILQ_EMPTY(&st->conns[i])) {
1213                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1214                         sfxge_lro_drop(rxq, c);
1215                 }
1216         }
1217
1218         while (!TAILQ_EMPTY(&st->free_conns)) {
1219                 c = TAILQ_FIRST(&st->free_conns);
1220                 TAILQ_REMOVE(&st->free_conns, c, link);
1221                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1222                 free(c, M_SFXGE);
1223         }
1224
1225         free(st->conns_n, M_SFXGE);
1226         free(st->conns, M_SFXGE);
1227         st->conns = NULL;
1228 }
1229
1230 #else
1231
1232 static void
1233 sfxge_lro_init(struct sfxge_rxq *rxq)
1234 {
1235 }
1236
1237 static void
1238 sfxge_lro_fini(struct sfxge_rxq *rxq)
1239 {
1240 }
1241
1242 #endif  /* SFXGE_LRO */
1243
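/* Tear down the software state for one Rx queue. */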
1244 static void
1245 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1246 {
1247         struct sfxge_rxq *rxq;
1248
1249         rxq = sc->rxq[index];
1250
1251         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1252             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1253
1254         /* Free the context array and the flow table. */
1255         free(rxq->queue, M_SFXGE);
1256         sfxge_lro_fini(rxq);
1257
1258         /* Release DMA memory. */
1259         sfxge_dma_free(&rxq->mem);
1260
1261         sc->rxq[index] = NULL;
1262
1263         free(rxq, M_SFXGE);
1264 }
1265
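/* Allocate and initialize the software state, DMA memory and buffer
 * table entries for one Rx queue.
 */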
1266 static int
1267 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1268 {
1269         struct sfxge_rxq *rxq;
1270         struct sfxge_evq *evq;
1271         efsys_mem_t *esmp;
1272         int rc;
1273
1274         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1275
1276         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1277         rxq->sc = sc;
1278         rxq->index = index;
1279         rxq->entries = sc->rxq_entries;
1280         rxq->ptr_mask = rxq->entries - 1;
1281         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1282
1283         sc->rxq[index] = rxq;
1284         esmp = &rxq->mem;
1285
1286         evq = sc->evq[index];
1287
1288         /* Allocate and zero DMA space. */
1289         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1290                 return (rc);
1291
1292         /* Allocate buffer table entries. */
1293         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1294                                  &rxq->buf_base_id);
1295
1296         /* Allocate the context array and the flow table. */
1297         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1298             M_SFXGE, M_WAITOK | M_ZERO);
1299         sfxge_lro_init(rxq);
1300
1301         callout_init(&rxq->refill_callout, 1);
1302
1303         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1304
1305         return (0);
1306 }
1307
1308 static const struct {
1309         const char *name;
1310         size_t offset;
1311 } sfxge_rx_stats[] = {
1312 #define SFXGE_RX_STAT(name, member) \
1313         { #name, offsetof(struct sfxge_rxq, member) }
1314 #ifdef SFXGE_LRO
1315         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1316         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1317         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1318         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1319         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1320         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1321         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1322         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1323 #endif
1324 };
1325
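/* Sysctl handler that reports an Rx statistic summed across all queues. */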
1326 static int
1327 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1328 {
1329         struct sfxge_softc *sc = arg1;
1330         unsigned int id = arg2;
1331         unsigned int sum, index;
1332
1333         /* Sum across all RX queues */
1334         sum = 0;
1335         for (index = 0; index < sc->rxq_count; index++)
1336                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1337                                          sfxge_rx_stats[id].offset);
1338
1339         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1340 }
1341
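/* Register a sysctl for each Rx statistic under the adapter's stats node. */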
1342 static void
1343 sfxge_rx_stat_init(struct sfxge_softc *sc)
1344 {
1345         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346         struct sysctl_oid_list *stat_list;
1347         unsigned int id;
1348
1349         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1350
1351         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1352                 SYSCTL_ADD_PROC(
1353                         ctx, stat_list,
1354                         OID_AUTO, sfxge_rx_stats[id].name,
1355                         CTLTYPE_UINT|CTLFLAG_RD,
1356                         sc, id, sfxge_rx_stat_handler, "IU",
1357                         "");
1358         }
1359 }
1360
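/* Tear down all Rx queues. */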
1361 void
1362 sfxge_rx_fini(struct sfxge_softc *sc)
1363 {
1364         int index;
1365
1366         index = sc->rxq_count;
1367         while (--index >= 0)
1368                 sfxge_rx_qfini(sc, index);
1369
1370         sc->rxq_count = 0;
1371 }
1372
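/* Initialize the receive path: validate the LRO tunables and set up one
 * Rx queue per interrupt, plus the statistics sysctls.
 */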
1373 int
1374 sfxge_rx_init(struct sfxge_softc *sc)
1375 {
1376         struct sfxge_intr *intr;
1377         int index;
1378         int rc;
1379
1380 #ifdef SFXGE_LRO
1381         if (!ISP2(lro_table_size)) {
1382                 log(LOG_ERR, "%s=%u must be a power of 2\n",
1383                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1384                 rc = EINVAL;
1385                 goto fail_lro_table_size;
1386         }
1387
1388         if (lro_idle_ticks == 0)
1389                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1390 #endif
1391
1392         intr = &sc->intr;
1393
1394         sc->rxq_count = intr->n_alloc;
1395
1396         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1397             ("intr->state != SFXGE_INTR_INITIALIZED"));
1398
1399         /* Initialize the receive queue(s) - one per interrupt. */
1400         for (index = 0; index < sc->rxq_count; index++) {
1401                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1402                         goto fail;
1403         }
1404
1405         sfxge_rx_stat_init(sc);
1406
1407         return (0);
1408
1409 fail:
1410         /* Tear down the receive queue(s). */
1411         while (--index >= 0)
1412                 sfxge_rx_qfini(sc, index);
1413
1414         sc->rxq_count = 0;
1415
1416 #ifdef SFXGE_LRO
1417 fail_lro_table_size:
1418 #endif
1419         return (rc);
1420 }