1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49
50 #include <machine/in_cksum.h>
51
52 #include "common/efx.h"
53
54
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57
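/*
 * Refill threshold: the RX queue is topped up once its fill level drops
 * below 90% of the queue limit (see sfxge_rx_qcomplete()).
 */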
58 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
59
60 #ifdef SFXGE_LRO
61
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63             "Large receive offload (LRO) parameters");
64
65 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
66
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73             &lro_table_size, 0,
74             "Size of the LRO hash table (must be a power of 2)");
75
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may outweigh the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82             &lro_chain_max, 0,
83             "The maximum length of a hash chain");
84
85  * Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91             &lro_idle_ticks, 0,
92             "The maximum time (in ticks) that a connection can be idle "
93             "before its LRO state is discarded");
94
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static unsigned lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103             &lro_slow_start_packets, 0,
104             "Number of packets with payload that must arrive in-order before "
105             "a connection is eligible for LRO");
106
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static unsigned lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115             &lro_loss_packets, 0,
116             "Number of packets with payload that must arrive in-order "
117             "following loss before a connection is eligible for LRO");
118
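/*
 * Example (illustrative, not part of the original source): the parameters
 * above are boot-time tunables and read-only sysctls.  Assuming the usual
 * hw.sfxge.lro.* names produced by SFXGE_LRO_PARAM(), they could be set
 * from loader.conf, e.g.:
 *
 *   hw.sfxge.lro.table_size="256"    # larger hash table, more streams
 *   hw.sfxge.lro.chain_max="40"      # tolerate longer hash chains
 *   hw.sfxge.lro.idle_ticks="200"    # keep idle connections around longer
 *
 * and inspected at run time with "sysctl hw.sfxge.lro".
 */
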
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124
125 /* Compare IPv6 addresses, avoiding conditional branches; zero iff equal */
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127                                    const struct in6_addr *right)
128 {
129 #if LONG_BIT == 64
130         const uint64_t *left64 = (const uint64_t *)left;
131         const uint64_t *right64 = (const uint64_t *)right;
132         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134         return (left->s6_addr32[0] - right->s6_addr32[0]) |
135                (left->s6_addr32[1] - right->s6_addr32[1]) |
136                (left->s6_addr32[2] - right->s6_addr32[2]) |
137                (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140
141 #endif  /* SFXGE_LRO */
142
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146
147         rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153
154         rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156
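/*
 * RSS Toeplitz hash key.  This is the well-known 40-byte example key from
 * Microsoft's RSS specification, so hash values match other implementations
 * that use the same default key.
 */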
157 static uint8_t toep_key[] = {
158         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164
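/*
 * Callout handler used to retry refilling an RX queue.  Rather than touching
 * the queue directly (which would require the event queue lock), it posts a
 * software "magic" event; the refill itself then runs from the event
 * processing path (see the SFXGE_MAGIC_RX_QREFILL handling in sfxge_ev.c).
 */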
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168         struct sfxge_rxq *rxq = arg;
169         struct sfxge_softc *sc;
170         unsigned int index;
171         struct sfxge_evq *evq;
172         uint16_t magic;
173
174         sc = rxq->sc;
175         index = rxq->index;
176         evq = sc->evq[index];
177
178         magic = SFXGE_MAGIC_RX_QREFILL | index;
179
180         /* This is guaranteed due to the start/stop order of rx and ev */
181         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182             ("evq not started"));
183         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184             ("rxq not started"));
185         efx_ev_qpost(evq->common, magic);
186 }
187
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191         /* Initially retry after 100 ms, but back off in case of
192          * repeated failures as we probably have to wait for the
193          * administrator to raise the pool limit. */
194         if (retrying)
195                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196         else
197                 rxq->refill_delay = hz / 10;
198
199         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200                              sfxge_rx_post_refill, rxq);
201 }
202
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205         struct mb_args args;
206         struct mbuf *m;
207
208         /* Allocate mbuf structure */
209         args.flags = M_PKTHDR;
210         args.type = MT_DATA;
211         m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212
213         /* Allocate (and attach) packet buffer */
214         if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215                 uma_zfree(zone_mbuf, m);
216                 m = NULL;
217         }
218
219         return (m);
220 }
221
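/*
 * Buffers are posted to the hardware in batches of this size so that the
 * cost of efx_rx_qpost() is amortised across many descriptors.
 */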
222 #define SFXGE_REFILL_BATCH  64
223
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227         struct sfxge_softc *sc;
228         unsigned int index;
229         struct sfxge_evq *evq;
230         unsigned int batch;
231         unsigned int rxfill;
232         unsigned int mblksize;
233         int ntodo;
234         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235
236         sc = rxq->sc;
237         index = rxq->index;
238         evq = sc->evq[index];
239
240         prefetch_read_many(sc->enp);
241         prefetch_read_many(rxq->common);
242
243         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244
245         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246                 return;
247
248         rxfill = rxq->added - rxq->completed;
249         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254
255         if (ntodo == 0)
256                 return;
257
258         batch = 0;
259         mblksize = sc->rx_buffer_size;
260         while (ntodo-- > 0) {
261                 unsigned int id;
262                 struct sfxge_rx_sw_desc *rx_desc;
263                 bus_dma_segment_t seg;
264                 struct mbuf *m;
265
266                 id = (rxq->added + batch) & rxq->ptr_mask;
267                 rx_desc = &rxq->queue[id];
268                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269
270                 rx_desc->flags = EFX_DISCARD;
271                 m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272                 if (m == NULL)
273                         break;
274                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275                 addr[batch++] = seg.ds_addr;
276
277                 if (batch == SFXGE_REFILL_BATCH) {
278                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
279                             rxq->completed, rxq->added);
280                         rxq->added += batch;
281                         batch = 0;
282                 }
283         }
284
285         if (ntodo != 0)
286                 sfxge_rx_schedule_refill(rxq, retrying);
287
288         if (batch != 0) {
289                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
290                     rxq->completed, rxq->added);
291                 rxq->added += batch;
292         }
293
294         /* Make the descriptors visible to the hardware */
295         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296                         BUS_DMASYNC_PREWRITE);
297
298         efx_rx_qpush(rxq->common, rxq->added);
299 }
300
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304
305         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306                 return;
307
308         /* Make sure the queue is full */
309         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314         struct ifnet *ifp = sc->ifnet;
315
316         m->m_pkthdr.rcvif = ifp;
317         m->m_pkthdr.csum_data = 0xffff;
318         ifp->if_input(ifp, m);
319 }
320
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324         struct mbuf *m = rx_desc->mbuf;
325         int flags = rx_desc->flags;
326         int csum_flags;
327
328         /* Convert checksum flags */
329         csum_flags = (flags & EFX_CKSUM_IPV4) ?
330                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331         if (flags & EFX_CKSUM_TCPUDP)
332                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333
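        /*
         * The RSS hash is taken from the hardware-supplied prefix at the
         * start of the buffer; m_data is advanced past the prefix only
         * after the hash has been read.
         */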
334         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335                 m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
336                                                        mtod(m, uint8_t *));
337                 /* The hash covers a 4-tuple for TCP only */
338                 M_HASHTYPE_SET(m,
339                     (flags & EFX_PKT_IPV4) ?
340                         ((flags & EFX_PKT_TCP) ?
341                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
342                         ((flags & EFX_PKT_TCP) ?
343                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
344         }
345         m->m_data += sc->rx_prefix_size;
346         m->m_len = rx_desc->size - sc->rx_prefix_size;
347         m->m_pkthdr.len = m->m_len;
348         m->m_pkthdr.csum_flags = csum_flags;
349         __sfxge_rx_deliver(sc, rx_desc->mbuf);
350
351         rx_desc->flags = EFX_DISCARD;
352         rx_desc->mbuf = NULL;
353 }
354
355 #ifdef SFXGE_LRO
356
357 static void
358 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
359 {
360         struct sfxge_softc *sc = st->sc;
361         struct mbuf *m = c->mbuf;
362         struct tcphdr *c_th;
363         int csum_flags;
364
365         KASSERT(m, ("no mbuf to deliver"));
366
367         ++st->n_bursts;
368
369         /* Finish off packet munging and recalculate IP header checksum. */
370         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
371                 struct ip *iph = c->nh;
372                 iph->ip_len = htons(iph->ip_len);
373                 iph->ip_sum = 0;
374                 iph->ip_sum = in_cksum_hdr(iph);
375                 c_th = (struct tcphdr *)(iph + 1);
376                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
377                               CSUM_IP_CHECKED | CSUM_IP_VALID);
378         } else {
379                 struct ip6_hdr *iph = c->nh;
380                 iph->ip6_plen = htons(iph->ip6_plen);
381                 c_th = (struct tcphdr *)(iph + 1);
382                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
383         }
384
385         c_th->th_win = c->th_last->th_win;
386         c_th->th_ack = c->th_last->th_ack;
387         if (c_th->th_off == c->th_last->th_off) {
388                 /* Copy TCP options (take care to avoid going negative). */
389                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
390                 memcpy(c_th + 1, c->th_last + 1, optlen);
391         }
392
393         m->m_pkthdr.flowid = c->conn_hash;
394         M_HASHTYPE_SET(m,
395             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
396                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
397
398         m->m_pkthdr.csum_flags = csum_flags;
399         __sfxge_rx_deliver(sc, m);
400
401         c->mbuf = NULL;
402         c->delivered = 1;
403 }
404
405 /* Drop the given connection, and add it to the free list. */
406 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
407 {
408         unsigned bucket;
409
410         KASSERT(!c->mbuf, ("found orphaned mbuf"));
411
412         if (c->next_buf.mbuf != NULL) {
413                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
414                 LIST_REMOVE(c, active_link);
415         }
416
417         bucket = c->conn_hash & rxq->lro.conns_mask;
418         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
419         --rxq->lro.conns_n[bucket];
420         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
421         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
422 }
423
424 /* Stop tracking connections that have gone idle in order to keep hash
425  * chains short.
426  */
427 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
428 {
429         struct sfxge_lro_conn *c;
430         unsigned i;
431
432         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
433                 ("found active connections"));
434
435         rxq->lro.last_purge_ticks = now;
436         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
437                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
438                         continue;
439
440                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
441                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
442                         ++rxq->lro.n_drop_idle;
443                         sfxge_lro_drop(rxq, c);
444                 }
445         }
446 }
447
448 static void
449 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
450                 struct mbuf *mbuf, struct tcphdr *th)
451 {
452         struct tcphdr *c_th;
453
454         /* Tack the new mbuf onto the chain. */
455         KASSERT(!mbuf->m_next, ("mbuf already chained"));
456         c->mbuf_tail->m_next = mbuf;
457         c->mbuf_tail = mbuf;
458
459         /* Increase length appropriately */
460         c->mbuf->m_pkthdr.len += mbuf->m_len;
461
462         /* Update the connection state flags */
463         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
464                 struct ip *iph = c->nh;
465                 iph->ip_len += mbuf->m_len;
466                 c_th = (struct tcphdr *)(iph + 1);
467         } else {
468                 struct ip6_hdr *iph = c->nh;
469                 iph->ip6_plen += mbuf->m_len;
470                 c_th = (struct tcphdr *)(iph + 1);
471         }
472         c_th->th_flags |= (th->th_flags & TH_PUSH);
473         c->th_last = th;
474         ++st->n_merges;
475
476         /* Pass the packet up now if another segment could overflow the 16-bit
477          * IP length; 9200 leaves room for one more jumbo-MTU sized segment.
478          */
479         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
480                 sfxge_lro_deliver(st, c);
481 }
482
483 static void
484 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
485                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
486 {
487         /* Start the chain */
488         c->mbuf = mbuf;
489         c->mbuf_tail = c->mbuf;
490         c->nh = nh;
491         c->th_last = th;
492
493         mbuf->m_pkthdr.len = mbuf->m_len;
494
495         /* Mangle header fields for later processing */
496         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
497                 struct ip *iph = nh;
498                 iph->ip_len = ntohs(iph->ip_len);
499         } else {
500                 struct ip6_hdr *iph = nh;
501                 iph->ip6_plen = ntohs(iph->ip6_plen);
502         }
503 }
504
505 /* Try to merge or otherwise hold or deliver (as appropriate) the
506  * packet buffered for this connection (c->next_buf).  Return a flag
507  * indicating whether the connection is still active for LRO purposes.
508  */
509 static int
510 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
511 {
512         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
513         char *eh = c->next_eh;
514         int data_length, hdr_length, dont_merge;
515         unsigned th_seq, pkt_length;
516         struct tcphdr *th;
517         unsigned now;
518
519         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
520                 struct ip *iph = c->next_nh;
521                 th = (struct tcphdr *)(iph + 1);
522                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
523         } else {
524                 struct ip6_hdr *iph = c->next_nh;
525                 th = (struct tcphdr *)(iph + 1);
526                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
527         }
528
529         hdr_length = (char *) th + th->th_off * 4 - eh;
530         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
531                        hdr_length);
532         th_seq = ntohl(th->th_seq);
533         dont_merge = ((data_length <= 0)
534                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
535
536         /* Check for options other than aligned timestamp. */
537         if (th->th_off != 5) {
538                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
539                 if (th->th_off == 8 &&
540                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
541                                         (TCPOPT_NOP << 16) |
542                                         (TCPOPT_TIMESTAMP << 8) |
543                                         TCPOLEN_TIMESTAMP)) {
544                         /* timestamp option -- okay */
545                 } else {
546                         dont_merge = 1;
547                 }
548         }
549
550         if (__predict_false(th_seq != c->next_seq)) {
551                 /* Out-of-order, so start counting again. */
552                 if (c->mbuf != NULL)
553                         sfxge_lro_deliver(&rxq->lro, c);
554                 c->n_in_order_pkts -= lro_loss_packets;
555                 c->next_seq = th_seq + data_length;
556                 ++rxq->lro.n_misorder;
557                 goto deliver_buf_out;
558         }
559         c->next_seq = th_seq + data_length;
560
561         now = ticks;
562         if (now - c->last_pkt_ticks > lro_idle_ticks) {
563                 ++rxq->lro.n_drop_idle;
564                 if (c->mbuf != NULL)
565                         sfxge_lro_deliver(&rxq->lro, c);
566                 sfxge_lro_drop(rxq, c);
567                 return (0);
568         }
569         c->last_pkt_ticks = ticks;
570
571         if (c->n_in_order_pkts < lro_slow_start_packets) {
572                 /* May be in slow-start, so don't merge. */
573                 ++rxq->lro.n_slow_start;
574                 ++c->n_in_order_pkts;
575                 goto deliver_buf_out;
576         }
577
578         if (__predict_false(dont_merge)) {
579                 if (c->mbuf != NULL)
580                         sfxge_lro_deliver(&rxq->lro, c);
581                 if (th->th_flags & (TH_FIN | TH_RST)) {
582                         ++rxq->lro.n_drop_closed;
583                         sfxge_lro_drop(rxq, c);
584                         return (0);
585                 }
586                 goto deliver_buf_out;
587         }
588
589         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
590
591         if (__predict_true(c->mbuf != NULL)) {
592                 /* Remove headers and any padding */
593                 rx_buf->mbuf->m_data += hdr_length;
594                 rx_buf->mbuf->m_len = data_length;
595
596                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
597         } else {
598                 /* Remove any padding */
599                 rx_buf->mbuf->m_len = pkt_length;
600
601                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
602         }
603
604         rx_buf->mbuf = NULL;
605         return (1);
606
607  deliver_buf_out:
608         sfxge_rx_deliver(rxq->sc, rx_buf);
609         return (1);
610 }
611
612 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
613                                uint16_t l2_id, void *nh, struct tcphdr *th)
614 {
615         unsigned bucket = conn_hash & st->conns_mask;
616         struct sfxge_lro_conn *c;
617
618         if (st->conns_n[bucket] >= lro_chain_max) {
619                 ++st->n_too_many;
620                 return;
621         }
622
623         if (!TAILQ_EMPTY(&st->free_conns)) {
624                 c = TAILQ_FIRST(&st->free_conns);
625                 TAILQ_REMOVE(&st->free_conns, c, link);
626         } else {
627                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
628                 if (c == NULL)
629                         return;
630                 c->mbuf = NULL;
631                 c->next_buf.mbuf = NULL;
632         }
633
634         /* Create the connection tracking data */
635         ++st->conns_n[bucket];
636         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
637         c->l2_id = l2_id;
638         c->conn_hash = conn_hash;
639         c->source = th->th_sport;
640         c->dest = th->th_dport;
641         c->n_in_order_pkts = 0;
642         c->last_pkt_ticks = *(volatile int *)&ticks;
643         c->delivered = 0;
644         ++st->n_new_stream;
645         /* NB. We don't initialise c->next_seq, and it doesn't matter what
646          * value it has.  Most likely the next packet received for this
647          * connection will not match -- no harm done.
648          */
649 }
650
651 /* Process mbuf and decide whether to dispatch it to the stack now or
652  * later.
653  */
654 static void
655 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
656 {
657         struct sfxge_softc *sc = rxq->sc;
658         struct mbuf *m = rx_buf->mbuf;
659         struct ether_header *eh;
660         struct sfxge_lro_conn *c;
661         uint16_t l2_id;
662         uint16_t l3_proto;
663         void *nh;
664         struct tcphdr *th;
665         uint32_t conn_hash;
666         unsigned bucket;
667
668         /* Get the hardware hash */
669         conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
670                                       mtod(m, uint8_t *));
671
672         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
673         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
674                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
675                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
676                         SFXGE_LRO_L2_ID_VLAN;
677                 l3_proto = veh->evl_proto;
678                 nh = veh + 1;
679         } else {
680                 l2_id = 0;
681                 l3_proto = eh->ether_type;
682                 nh = eh + 1;
683         }
684
685         /* Check whether this is a suitable packet (unfragmented
686          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
687          * length, and compute a hash if necessary.  If not, return.
688          */
689         if (l3_proto == htons(ETHERTYPE_IP)) {
690                 struct ip *iph = nh;
691
692                 KASSERT(iph->ip_p == IPPROTO_TCP,
693                     ("IPv4 protocol is not TCP, but packet marker is set"));
694                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
695                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
696                         goto deliver_now;
697                 th = (struct tcphdr *)(iph + 1);
698         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
699                 struct ip6_hdr *iph = nh;
700
701                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
702                     ("IPv6 next header is not TCP, but packet marker is set"));
703                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
704                 th = (struct tcphdr *)(iph + 1);
705         } else {
706                 goto deliver_now;
707         }
708
709         bucket = conn_hash & rxq->lro.conns_mask;
710
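        /*
         * Look for an existing connection.  The subtract-and-OR tests below
         * are branch-free equality checks: the expression is zero only if
         * every field matches.
         */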
711         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
712                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
713                         continue;
714                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
715                         continue;
716                 if (c->mbuf != NULL) {
717                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
718                                 struct ip *c_iph, *iph = nh;
719                                 c_iph = c->nh;
720                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
721                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
722                                         continue;
723                         } else {
724                                 struct ip6_hdr *c_iph, *iph = nh;
725                                 c_iph = c->nh;
726                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
727                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
728                                         continue;
729                         }
730                 }
731
732                 /* Re-insert at head of list to reduce lookup time. */
733                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
734                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
735
736                 if (c->next_buf.mbuf != NULL) {
737                         if (!sfxge_lro_try_merge(rxq, c))
738                                 goto deliver_now;
739                 } else {
740                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
741                             active_link);
742                 }
743                 c->next_buf = *rx_buf;
744                 c->next_eh = eh;
745                 c->next_nh = nh;
746
747                 rx_buf->mbuf = NULL;
748                 rx_buf->flags = EFX_DISCARD;
749                 return;
750         }
751
752         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
753  deliver_now:
754         sfxge_rx_deliver(sc, rx_buf);
755 }
756
757 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
758 {
759         struct sfxge_lro_state *st = &rxq->lro;
760         struct sfxge_lro_conn *c;
761         unsigned t;
762
763         while (!LIST_EMPTY(&st->active_conns)) {
764                 c = LIST_FIRST(&st->active_conns);
765                 if (!c->delivered && c->mbuf != NULL)
766                         sfxge_lro_deliver(st, c);
767                 if (sfxge_lro_try_merge(rxq, c)) {
768                         if (c->mbuf != NULL)
769                                 sfxge_lro_deliver(st, c);
770                         LIST_REMOVE(c, active_link);
771                 }
772                 c->delivered = 0;
773         }
774
775         t = *(volatile int *)&ticks;
776         if (__predict_false(t != st->last_purge_ticks))
777                 sfxge_lro_purge_idle(rxq, t);
778 }
779
780 #else   /* !SFXGE_LRO */
781
782 static void
783 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
784 {
785 }
786
787 static void
788 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
789 {
790 }
791
792 #endif  /* SFXGE_LRO */
793
794 void
795 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
796 {
797         struct sfxge_softc *sc = rxq->sc;
798         int if_capenable = sc->ifnet->if_capenable;
799         int lro_enabled = if_capenable & IFCAP_LRO;
800         unsigned int index;
801         struct sfxge_evq *evq;
802         unsigned int completed;
803         unsigned int level;
804         struct mbuf *m;
805         struct sfxge_rx_sw_desc *prev = NULL;
806
807         index = rxq->index;
808         evq = sc->evq[index];
809
810         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
811
812         completed = rxq->completed;
813         while (completed != rxq->pending) {
814                 unsigned int id;
815                 struct sfxge_rx_sw_desc *rx_desc;
816
817                 id = completed++ & rxq->ptr_mask;
818                 rx_desc = &rxq->queue[id];
819                 m = rx_desc->mbuf;
820
821                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
822                         goto discard;
823
824                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
825                         goto discard;
826
827                 prefetch_read_many(mtod(m, caddr_t));
828
829                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
830                 case EFX_PKT_IPV4:
831                         if (~if_capenable & IFCAP_RXCSUM)
832                                 rx_desc->flags &=
833                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
834                         break;
835                 case EFX_PKT_IPV6:
836                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
837                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
838                         break;
839                 case 0:
840                         /* Check for loopback packets */
841                         {
842                                 struct ether_header *etherhp;
843
844                                 /*LINTED*/
845                                 etherhp = mtod(m, struct ether_header *);
846
847                                 if (etherhp->ether_type ==
848                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
849                                         EFSYS_PROBE(loopback);
850
851                                         rxq->loopback++;
852                                         goto discard;
853                                 }
854                         }
855                         break;
856                 default:
857                         KASSERT(B_FALSE,
858                             ("Rx descriptor with both IPv4 and IPv6 flags"));
859                         goto discard;
860                 }
861
862                 /* Pass packet up the stack or into LRO (pipelined) */
863                 if (prev != NULL) {
864                         if (lro_enabled &&
865                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
866                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
867                                 sfxge_lro(rxq, prev);
868                         else
869                                 sfxge_rx_deliver(sc, prev);
870                 }
871                 prev = rx_desc;
872                 continue;
873
874 discard:
875                 /* Return the packet to the pool */
876                 m_free(m);
877                 rx_desc->mbuf = NULL;
878         }
879         rxq->completed = completed;
880
881         level = rxq->added - rxq->completed;
882
883         /* Pass last packet up the stack or into LRO */
884         if (prev != NULL) {
885                 if (lro_enabled &&
886                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
887                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
888                         sfxge_lro(rxq, prev);
889                 else
890                         sfxge_rx_deliver(sc, prev);
891         }
892
893         /*
894          * If there are any pending flows and this is the end of the
895          * poll then they must be completed.
896          */
897         if (eop)
898                 sfxge_lro_end_of_burst(rxq);
899
900         /* Top up the queue if necessary */
901         if (level < rxq->refill_threshold)
902                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
903 }
904
905 static void
906 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
907 {
908         struct sfxge_rxq *rxq;
909         struct sfxge_evq *evq;
910         unsigned int count;
911
912         rxq = sc->rxq[index];
913         evq = sc->evq[index];
914
915         SFXGE_EVQ_LOCK(evq);
916
917         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
918             ("rxq not started"));
919
920         rxq->init_state = SFXGE_RXQ_INITIALIZED;
921
922         callout_stop(&rxq->refill_callout);
923
924 again:
925         rxq->flush_state = SFXGE_FLUSH_PENDING;
926
927         /* Flush the receive queue */
928         efx_rx_qflush(rxq->common);
929
930         SFXGE_EVQ_UNLOCK(evq);
931
932         count = 0;
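        /* Wait up to two seconds (20 polls of 100 ms) for the flush event. */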
933         do {
934                 /* Spin for 100 ms */
935                 DELAY(100000);
936
937                 if (rxq->flush_state != SFXGE_FLUSH_PENDING)
938                         break;
939
940         } while (++count < 20);
941
942         SFXGE_EVQ_LOCK(evq);
943
944         if (rxq->flush_state == SFXGE_FLUSH_FAILED)
945                 goto again;
946
947         rxq->flush_state = SFXGE_FLUSH_DONE;
948
949         rxq->pending = rxq->added;
950         sfxge_rx_qcomplete(rxq, B_TRUE);
951
952         KASSERT(rxq->completed == rxq->pending,
953             ("rxq->completed != rxq->pending"));
954
955         rxq->added = 0;
956         rxq->pending = 0;
957         rxq->completed = 0;
958         rxq->loopback = 0;
959
960         /* Destroy the common code receive queue. */
961         efx_rx_qdestroy(rxq->common);
962
963         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
964             EFX_RXQ_NBUFS(sc->rxq_entries));
965
966         SFXGE_EVQ_UNLOCK(evq);
967 }
968
969 static int
970 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
971 {
972         struct sfxge_rxq *rxq;
973         efsys_mem_t *esmp;
974         struct sfxge_evq *evq;
975         int rc;
976
977         rxq = sc->rxq[index];
978         esmp = &rxq->mem;
979         evq = sc->evq[index];
980
981         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
982             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
983         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
984             ("evq->init_state != SFXGE_EVQ_STARTED"));
985
986         /* Program the buffer table. */
987         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
988             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
989                 return (rc);
990
991         /* Create the common code receive queue. */
992         if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
993             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
994             &rxq->common)) != 0)
995                 goto fail;
996
997         SFXGE_EVQ_LOCK(evq);
998
999         /* Enable the receive queue. */
1000         efx_rx_qenable(rxq->common);
1001
1002         rxq->init_state = SFXGE_RXQ_STARTED;
1003
1004         /* Try to fill the queue from the pool. */
1005         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1006
1007         SFXGE_EVQ_UNLOCK(evq);
1008
1009         return (0);
1010
1011 fail:
1012         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1013             EFX_RXQ_NBUFS(sc->rxq_entries));
1014         return (rc);
1015 }
1016
1017 void
1018 sfxge_rx_stop(struct sfxge_softc *sc)
1019 {
1020         int index;
1021
1022         /* Stop the receive queue(s) */
1023         index = sc->rxq_count;
1024         while (--index >= 0)
1025                 sfxge_rx_qstop(sc, index);
1026
1027         sc->rx_prefix_size = 0;
1028         sc->rx_buffer_size = 0;
1029
1030         efx_rx_fini(sc->enp);
1031 }
1032
1033 int
1034 sfxge_rx_start(struct sfxge_softc *sc)
1035 {
1036         struct sfxge_intr *intr;
1037         int index;
1038         int rc;
1039
1040         intr = &sc->intr;
1041
1042         /* Initialize the common code receive module. */
1043         if ((rc = efx_rx_init(sc->enp)) != 0)
1044                 return (rc);
1045
1046         /* Calculate the receive packet buffer size. */
1047         sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1048         sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1049                               sc->rx_prefix_size);
1050
1051         /* Select zone for packet buffers */
1052         if (sc->rx_buffer_size <= MCLBYTES)
1053                 sc->rx_buffer_zone = zone_clust;
1054         else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1055                 sc->rx_buffer_zone = zone_jumbop;
1056         else if (sc->rx_buffer_size <= MJUM9BYTES)
1057                 sc->rx_buffer_zone = zone_jumbo9;
1058         else
1059                 sc->rx_buffer_zone = zone_jumbo16;
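        /*
         * For example, with the default 1500 byte MTU the PDU plus the
         * hardware prefix fits in a standard 2 KB cluster, so zone_clust is
         * chosen; jumbo MTUs fall through to the larger jumbo zones.
         */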
1060
1061         /*
1062          * Set up the scale table.  Enable all hash types and hash insertion.
1063          */
1064         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1065                 sc->rx_indir_table[index] = index % sc->rxq_count;
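        /* e.g. with 4 RX queues the table holds 0,1,2,3,0,1,2,3,... so the
         * hash values are spread evenly across the queues. */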
1066         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1067                                        SFXGE_RX_SCALE_MAX)) != 0)
1068                 goto fail;
1069         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1070             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1071             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1072
1073         if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1074             sizeof(toep_key))) != 0)
1075                 goto fail;
1076
1077         /* Start the receive queue(s). */
1078         for (index = 0; index < sc->rxq_count; index++) {
1079                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1080                         goto fail2;
1081         }
1082
1083         return (0);
1084
1085 fail2:
1086         while (--index >= 0)
1087                 sfxge_rx_qstop(sc, index);
1088
1089 fail:
1090         efx_rx_fini(sc->enp);
1091
1092         return (rc);
1093 }
1094
1095 #ifdef SFXGE_LRO
1096
1097 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1098 {
1099         struct sfxge_lro_state *st = &rxq->lro;
1100         unsigned i;
1101
1102         st->conns_mask = lro_table_size - 1;
1103         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1104                 ("lro_table_size must be a power of 2"));
1105         st->sc = rxq->sc;
1106         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1107                            M_SFXGE, M_WAITOK);
1108         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1109                              M_SFXGE, M_WAITOK);
1110         for (i = 0; i <= st->conns_mask; ++i) {
1111                 TAILQ_INIT(&st->conns[i]);
1112                 st->conns_n[i] = 0;
1113         }
1114         LIST_INIT(&st->active_conns);
1115         TAILQ_INIT(&st->free_conns);
1116 }
1117
1118 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1119 {
1120         struct sfxge_lro_state *st = &rxq->lro;
1121         struct sfxge_lro_conn *c;
1122         unsigned i;
1123
1124         /* Return cleanly if sfxge_lro_init() has not been called. */
1125         if (st->conns == NULL)
1126                 return;
1127
1128         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1129
1130         for (i = 0; i <= st->conns_mask; ++i) {
1131                 while (!TAILQ_EMPTY(&st->conns[i])) {
1132                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1133                         sfxge_lro_drop(rxq, c);
1134                 }
1135         }
1136
1137         while (!TAILQ_EMPTY(&st->free_conns)) {
1138                 c = TAILQ_FIRST(&st->free_conns);
1139                 TAILQ_REMOVE(&st->free_conns, c, link);
1140                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1141                 free(c, M_SFXGE);
1142         }
1143
1144         free(st->conns_n, M_SFXGE);
1145         free(st->conns, M_SFXGE);
1146         st->conns = NULL;
1147 }
1148
1149 #else
1150
1151 static void
1152 sfxge_lro_init(struct sfxge_rxq *rxq)
1153 {
1154 }
1155
1156 static void
1157 sfxge_lro_fini(struct sfxge_rxq *rxq)
1158 {
1159 }
1160
1161 #endif  /* SFXGE_LRO */
1162
1163 static void
1164 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1165 {
1166         struct sfxge_rxq *rxq;
1167
1168         rxq = sc->rxq[index];
1169
1170         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1171             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1172
1173         /* Free the context array and the flow table. */
1174         free(rxq->queue, M_SFXGE);
1175         sfxge_lro_fini(rxq);
1176
1177         /* Release DMA memory. */
1178         sfxge_dma_free(&rxq->mem);
1179
1180         sc->rxq[index] = NULL;
1181
1182         free(rxq, M_SFXGE);
1183 }
1184
1185 static int
1186 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1187 {
1188         struct sfxge_rxq *rxq;
1189         struct sfxge_evq *evq;
1190         efsys_mem_t *esmp;
1191         int rc;
1192
1193         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1194
1195         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1196         rxq->sc = sc;
1197         rxq->index = index;
1198         rxq->entries = sc->rxq_entries;
1199         rxq->ptr_mask = rxq->entries - 1;
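        /* entries is a power of 2, so entries - 1 is a ring index mask. */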
1200         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1201
1202         sc->rxq[index] = rxq;
1203         esmp = &rxq->mem;
1204
1205         evq = sc->evq[index];
1206
1207         /* Allocate and zero DMA space. */
1208         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1209                 return (rc);
1210
1211         /* Allocate buffer table entries. */
1212         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1213                                  &rxq->buf_base_id);
1214
1215         /* Allocate the context array and the flow table. */
1216         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1217             M_SFXGE, M_WAITOK | M_ZERO);
1218         sfxge_lro_init(rxq);
1219
1220         callout_init(&rxq->refill_callout, 1);
1221
1222         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1223
1224         return (0);
1225 }
1226
1227 static const struct {
1228         const char *name;
1229         size_t offset;
1230 } sfxge_rx_stats[] = {
1231 #define SFXGE_RX_STAT(name, member) \
1232         { #name, offsetof(struct sfxge_rxq, member) }
1233 #ifdef SFXGE_LRO
1234         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1235         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1236         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1237         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1238         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1239         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1240         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1241         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1242 #endif
1243 };
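
/*
 * These counters are summed over all RX queues by sfxge_rx_stat_handler()
 * and exposed as read-only sysctls under the adapter's statistics node
 * (typically dev.sfxge.<unit>.stats.lro_*, depending on how stats_node is
 * created in sfxge.c).
 */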
1244
1245 static int
1246 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1247 {
1248         struct sfxge_softc *sc = arg1;
1249         unsigned int id = arg2;
1250         unsigned int sum, index;
1251
1252         /* Sum across all RX queues */
1253         sum = 0;
1254         for (index = 0; index < sc->rxq_count; index++)
1255                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1256                                          sfxge_rx_stats[id].offset);
1257
1258         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1259 }
1260
1261 static void
1262 sfxge_rx_stat_init(struct sfxge_softc *sc)
1263 {
1264         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1265         struct sysctl_oid_list *stat_list;
1266         unsigned int id;
1267
1268         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1269
1270         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1271                 SYSCTL_ADD_PROC(
1272                         ctx, stat_list,
1273                         OID_AUTO, sfxge_rx_stats[id].name,
1274                         CTLTYPE_UINT|CTLFLAG_RD,
1275                         sc, id, sfxge_rx_stat_handler, "IU",
1276                         "");
1277         }
1278 }
1279
1280 void
1281 sfxge_rx_fini(struct sfxge_softc *sc)
1282 {
1283         int index;
1284
1285         index = sc->rxq_count;
1286         while (--index >= 0)
1287                 sfxge_rx_qfini(sc, index);
1288
1289         sc->rxq_count = 0;
1290 }
1291
1292 int
1293 sfxge_rx_init(struct sfxge_softc *sc)
1294 {
1295         struct sfxge_intr *intr;
1296         int index;
1297         int rc;
1298
1299 #ifdef SFXGE_LRO
1300         if (!ISP2(lro_table_size)) {
1301                 log(LOG_ERR, "%s=%u must be a power of 2",
1302                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1303                 rc = EINVAL;
1304                 goto fail_lro_table_size;
1305         }
1306
1307         if (lro_idle_ticks == 0)
1308                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1309 #endif
1310
1311         intr = &sc->intr;
1312
1313         sc->rxq_count = intr->n_alloc;
1314
1315         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1316             ("intr->state != SFXGE_INTR_INITIALIZED"));
1317
1318         /* Initialize the receive queue(s) - one per interrupt. */
1319         for (index = 0; index < sc->rxq_count; index++) {
1320                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1321                         goto fail;
1322         }
1323
1324         sfxge_rx_stat_init(sc);
1325
1326         return (0);
1327
1328 fail:
1329         /* Tear down the receive queue(s). */
1330         while (--index >= 0)
1331                 sfxge_rx_qfini(sc, index);
1332
1333         sc->rxq_count = 0;
1334
1335 #ifdef SFXGE_LRO
1336 fail_lro_table_size:
1337 #endif
1338         return (rc);
1339 }