1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49
50 #include <machine/in_cksum.h>
51
52 #include "common/efx.h"
53
54
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57
58 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
59
60 #ifdef SFXGE_LRO
61
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63             "Large receive offload (LRO) parameters");
64
65 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
66
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73             &lro_table_size, 0,
74             "Size of the LRO hash table (must be a power of 2)");
75
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may exceed the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82             &lro_chain_max, 0,
83             "The maximum length of a hash chain");
84
85 /* Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91             &lro_idle_ticks, 0,
92             "The maximum time (in ticks) that a connection can be idle "
93             "before its LRO state is discarded");
94
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static int lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103             &lro_slow_start_packets, 0,
104             "Number of packets with payload that must arrive in-order before "
105             "a connection is eligible for LRO");
106
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static int lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115             &lro_loss_packets, 0,
116             "Number of packets with payload that must arrive in-order "
117             "following loss before a connection is eligible for LRO");
118
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124
125 /* Compare IPv6 addresses, avoiding conditional branches */
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127                                    const struct in6_addr *right)
128 {
129 #if LONG_BIT == 64
130         const uint64_t *left64 = (const uint64_t *)left;
131         const uint64_t *right64 = (const uint64_t *)right;
132         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134         return (left->s6_addr32[0] - right->s6_addr32[0]) |
135                (left->s6_addr32[1] - right->s6_addr32[1]) |
136                (left->s6_addr32[2] - right->s6_addr32[2]) |
137                (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140
141 #endif  /* SFXGE_LRO */
142
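/* Flush-event callbacks.  These are expected to be invoked from event
 * processing when the controller reports that a receive queue flush has
 * completed or failed; sfxge_rx_qstop() polls flush_state and retries
 * the flush if it failed.
 */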
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146
147         rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153
154         rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156
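/* Toeplitz hash key programmed into the controller for RSS by
 * sfxge_rx_start().  This appears to be the conventional 40-byte
 * reference key used by many RSS implementations.
 */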
157 static uint8_t toep_key[] = {
158         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164
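/* Callout handler armed by sfxge_rx_schedule_refill().  Rather than
 * refilling here, post a software ("magic") event to the queue's event
 * queue; the event handler is expected to perform the actual refill
 * (see sfxge_rx_qrefill()) from the event-processing path.
 */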
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168         struct sfxge_rxq *rxq = arg;
169         struct sfxge_softc *sc;
170         unsigned int index;
171         struct sfxge_evq *evq;
172         uint16_t magic;
173
174         sc = rxq->sc;
175         index = rxq->index;
176         evq = sc->evq[index];
177
178         magic = SFXGE_MAGIC_RX_QREFILL | index;
179
180         /* This is guaranteed due to the start/stop order of rx and ev */
181         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182             ("evq not started"));
183         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184             ("rxq not started"));
185         efx_ev_qpost(evq->common, magic);
186 }
187
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191         /* Initially retry after 100 ms, but back off in case of
192          * repeated failures as we probably have to wait for the
193          * administrator to raise the pool limit. */
194         if (retrying)
195                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196         else
197                 rxq->refill_delay = hz / 10;
198
199         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200                              sfxge_rx_post_refill, rxq);
201 }
202
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205         struct mb_args args;
206         struct mbuf *m;
207
208         /* Allocate mbuf structure */
209         args.flags = M_PKTHDR;
210         args.type = MT_DATA;
211         m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212
213         /* Allocate (and attach) packet buffer */
214         if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215                 uma_zfree(zone_mbuf, m);
216                 m = NULL;
217         }
218
219         return (m);
220 }
221
222 #define SFXGE_REFILL_BATCH  64
223
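/* Fill the receive ring towards 'target' entries: allocate and map an
 * mbuf for each free descriptor, posting the DMA addresses to the common
 * code in batches of SFXGE_REFILL_BATCH, then push the updated producer
 * index.  If mbuf allocation runs short, schedule a delayed retry.
 * Called with the event-queue lock held.
 */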
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227         struct sfxge_softc *sc;
228         unsigned int index;
229         struct sfxge_evq *evq;
230         unsigned int batch;
231         unsigned int rxfill;
232         unsigned int mblksize;
233         int ntodo;
234         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235
236         sc = rxq->sc;
237         index = rxq->index;
238         evq = sc->evq[index];
239
240         prefetch_read_many(sc->enp);
241         prefetch_read_many(rxq->common);
242
243         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244
245         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246                 return;
247
248         rxfill = rxq->added - rxq->completed;
249         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254
255         if (ntodo == 0)
256                 return;
257
258         batch = 0;
259         mblksize = sc->rx_buffer_size;
260         while (ntodo-- > 0) {
261                 unsigned int id;
262                 struct sfxge_rx_sw_desc *rx_desc;
263                 bus_dma_segment_t seg;
264                 struct mbuf *m;
265
266                 id = (rxq->added + batch) & rxq->ptr_mask;
267                 rx_desc = &rxq->queue[id];
268                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269
270                 rx_desc->flags = EFX_DISCARD;
271                 m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272                 if (m == NULL)
273                         break;
274                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275                 addr[batch++] = seg.ds_addr;
276
277                 if (batch == SFXGE_REFILL_BATCH) {
278                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
279                             rxq->completed, rxq->added);
280                         rxq->added += batch;
281                         batch = 0;
282                 }
283         }
284
285         if (ntodo != 0)
286                 sfxge_rx_schedule_refill(rxq, retrying);
287
288         if (batch != 0) {
289                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
290                     rxq->completed, rxq->added);
291                 rxq->added += batch;
292         }
293
294         /* Make the descriptors visible to the hardware */
295         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296                         BUS_DMASYNC_PREWRITE);
297
298         efx_rx_qpush(rxq->common, rxq->added);
299 }
300
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304
305         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306                 return;
307
308         /* Make sure the queue is full */
309         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311
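/* Hand a completed packet to the network stack. */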
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314         struct ifnet *ifp = sc->ifnet;
315
316         m->m_pkthdr.rcvif = ifp;
317         m->m_pkthdr.csum_data = 0xffff;
318         ifp->if_input(ifp, m);
319 }
320
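/* Deliver one received packet: translate the hardware completion flags
 * into mbuf checksum flags and an RSS hash type, take the flow hash from
 * the RX prefix, strip that prefix, and pass the mbuf up the stack.
 */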
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324         struct mbuf *m = rx_desc->mbuf;
325         int csum_flags;
326
327         /* Convert checksum flags */
328         csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
329                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
330         if (rx_desc->flags & EFX_CKSUM_TCPUDP)
331                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
332
333         if (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
334                 m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
335                                                        mtod(m, uint8_t *));
336                 /* The hash covers a 4-tuple for TCP only */
337                 M_HASHTYPE_SET(m,
338                     (rx_desc->flags & EFX_PKT_IPV4) ?
339                         ((rx_desc->flags & EFX_PKT_TCP) ?
340                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
341                         ((rx_desc->flags & EFX_PKT_TCP) ?
342                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
343         }
344         m->m_data += sc->rx_prefix_size;
345         m->m_len = rx_desc->size - sc->rx_prefix_size;
346         m->m_pkthdr.len = m->m_len;
347         m->m_pkthdr.csum_flags = csum_flags;
348         __sfxge_rx_deliver(sc, rx_desc->mbuf);
349
350         rx_desc->flags = EFX_DISCARD;
351         rx_desc->mbuf = NULL;
352 }
353
354 #ifdef SFXGE_LRO
355
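/* Deliver the coalesced packet held by connection 'c'.  Undo the header
 * mangling performed while merging (restore network byte order and the
 * IPv4 header checksum), copy the latest window, ACK and TCP options
 * (when the header lengths match) from the last segment's TCP header,
 * then pass the mbuf chain up the stack.
 */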
356 static void
357 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
358 {
359         struct sfxge_softc *sc = st->sc;
360         struct mbuf *m = c->mbuf;
361         struct tcphdr *c_th;
362         int csum_flags;
363
364         KASSERT(m, ("no mbuf to deliver"));
365
366         ++st->n_bursts;
367
368         /* Finish off packet munging and recalculate IP header checksum. */
369         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
370                 struct ip *iph = c->nh;
371                 iph->ip_len = htons(iph->ip_len);
372                 iph->ip_sum = 0;
373                 iph->ip_sum = in_cksum_hdr(iph);
374                 c_th = (struct tcphdr *)(iph + 1);
375                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
376                               CSUM_IP_CHECKED | CSUM_IP_VALID);
377         } else {
378                 struct ip6_hdr *iph = c->nh;
379                 iph->ip6_plen = htons(iph->ip6_plen);
380                 c_th = (struct tcphdr *)(iph + 1);
381                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
382         }
383
384         c_th->th_win = c->th_last->th_win;
385         c_th->th_ack = c->th_last->th_ack;
386         if (c_th->th_off == c->th_last->th_off) {
387                 /* Copy TCP options (take care to avoid going negative). */
388                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
389                 memcpy(c_th + 1, c->th_last + 1, optlen);
390         }
391
392         m->m_pkthdr.flowid = c->conn_hash;
393         M_HASHTYPE_SET(m,
394             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
395                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
396
397         m->m_pkthdr.csum_flags = csum_flags;
398         __sfxge_rx_deliver(sc, m);
399
400         c->mbuf = NULL;
401         c->delivered = 1;
402 }
403
404 /* Drop the given connection, and add it to the free list. */
405 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
406 {
407         unsigned bucket;
408
409         KASSERT(!c->mbuf, ("found orphaned mbuf"));
410
411         if (c->next_buf.mbuf != NULL) {
412                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
413                 LIST_REMOVE(c, active_link);
414         }
415
416         bucket = c->conn_hash & rxq->lro.conns_mask;
417         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
418         --rxq->lro.conns_n[bucket];
419         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
420         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
421 }
422
423 /* Stop tracking connections that have gone idle in order to keep hash
424  * chains short.
425  */
426 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
427 {
428         struct sfxge_lro_conn *c;
429         unsigned i;
430
431         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
432                 ("found active connections"));
433
434         rxq->lro.last_purge_ticks = now;
435         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
436                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
437                         continue;
438
439                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
440                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
441                         ++rxq->lro.n_drop_idle;
442                         sfxge_lro_drop(rxq, c);
443                 }
444         }
445 }
446
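/* Append a new in-order segment to the connection's coalesced packet:
 * chain the mbuf, add its length to the packet and IP payload lengths,
 * carry over TH_PUSH, and deliver early if a further segment could
 * overflow the 16-bit IP length field.
 */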
447 static void
448 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
449                 struct mbuf *mbuf, struct tcphdr *th)
450 {
451         struct tcphdr *c_th;
452
453         /* Tack the new mbuf onto the chain. */
454         KASSERT(!mbuf->m_next, ("mbuf already chained"));
455         c->mbuf_tail->m_next = mbuf;
456         c->mbuf_tail = mbuf;
457
458         /* Increase length appropriately */
459         c->mbuf->m_pkthdr.len += mbuf->m_len;
460
461         /* Update the connection state flags */
462         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
463                 struct ip *iph = c->nh;
464                 iph->ip_len += mbuf->m_len;
465                 c_th = (struct tcphdr *)(iph + 1);
466         } else {
467                 struct ip6_hdr *iph = c->nh;
468                 iph->ip6_plen += mbuf->m_len;
469                 c_th = (struct tcphdr *)(iph + 1);
470         }
471         c_th->th_flags |= (th->th_flags & TH_PUSH);
472         c->th_last = th;
473         ++st->n_merges;
474
475         /* Pass packet up now if another segment could overflow the IP
476          * length.
477          */
478         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
479                 sfxge_lro_deliver(st, c);
480 }
481
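/* Begin a new coalesced packet with this segment.  The IP length field
 * is converted to host byte order here so that merging can accumulate
 * into it; sfxge_lro_deliver() converts it back before delivery.
 */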
482 static void
483 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
484                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
485 {
486         /* Start the chain */
487         c->mbuf = mbuf;
488         c->mbuf_tail = c->mbuf;
489         c->nh = nh;
490         c->th_last = th;
491
492         mbuf->m_pkthdr.len = mbuf->m_len;
493
494         /* Mangle header fields for later processing */
495         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
496                 struct ip *iph = nh;
497                 iph->ip_len = ntohs(iph->ip_len);
498         } else {
499                 struct ip6_hdr *iph = nh;
500                 iph->ip6_plen = ntohs(iph->ip6_plen);
501         }
502 }
503
504 /* Try to merge or otherwise hold or deliver (as appropriate) the
505  * packet buffered for this connection (c->next_buf).  Return a flag
506  * indicating whether the connection is still active for LRO purposes.
507  */
508 static int
509 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
510 {
511         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
512         char *eh = c->next_eh;
513         int data_length, hdr_length, dont_merge;
514         unsigned th_seq, pkt_length;
515         struct tcphdr *th;
516         unsigned now;
517
518         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
519                 struct ip *iph = c->next_nh;
520                 th = (struct tcphdr *)(iph + 1);
521                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
522         } else {
523                 struct ip6_hdr *iph = c->next_nh;
524                 th = (struct tcphdr *)(iph + 1);
525                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
526         }
527
528         hdr_length = (char *) th + th->th_off * 4 - eh;
529         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
530                        hdr_length);
531         th_seq = ntohl(th->th_seq);
532         dont_merge = ((data_length <= 0)
533                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
534
535         /* Check for options other than aligned timestamp. */
536         if (th->th_off != 5) {
537                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
538                 if (th->th_off == 8 &&
539                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
540                                         (TCPOPT_NOP << 16) |
541                                         (TCPOPT_TIMESTAMP << 8) |
542                                         TCPOLEN_TIMESTAMP)) {
543                         /* timestamp option -- okay */
544                 } else {
545                         dont_merge = 1;
546                 }
547         }
548
549         if (__predict_false(th_seq != c->next_seq)) {
550                 /* Out-of-order, so start counting again. */
551                 if (c->mbuf != NULL)
552                         sfxge_lro_deliver(&rxq->lro, c);
553                 c->n_in_order_pkts -= lro_loss_packets;
554                 c->next_seq = th_seq + data_length;
555                 ++rxq->lro.n_misorder;
556                 goto deliver_buf_out;
557         }
558         c->next_seq = th_seq + data_length;
559
560         now = ticks;
561         if (now - c->last_pkt_ticks > lro_idle_ticks) {
562                 ++rxq->lro.n_drop_idle;
563                 if (c->mbuf != NULL)
564                         sfxge_lro_deliver(&rxq->lro, c);
565                 sfxge_lro_drop(rxq, c);
566                 return (0);
567         }
568         c->last_pkt_ticks = ticks;
569
570         if (c->n_in_order_pkts < lro_slow_start_packets) {
571                 /* May be in slow-start, so don't merge. */
572                 ++rxq->lro.n_slow_start;
573                 ++c->n_in_order_pkts;
574                 goto deliver_buf_out;
575         }
576
577         if (__predict_false(dont_merge)) {
578                 if (c->mbuf != NULL)
579                         sfxge_lro_deliver(&rxq->lro, c);
580                 if (th->th_flags & (TH_FIN | TH_RST)) {
581                         ++rxq->lro.n_drop_closed;
582                         sfxge_lro_drop(rxq, c);
583                         return (0);
584                 }
585                 goto deliver_buf_out;
586         }
587
588         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
589
590         if (__predict_true(c->mbuf != NULL)) {
591                 /* Remove headers and any padding */
592                 rx_buf->mbuf->m_data += hdr_length;
593                 rx_buf->mbuf->m_len = data_length;
594
595                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
596         } else {
597                 /* Remove any padding */
598                 rx_buf->mbuf->m_len = pkt_length;
599
600                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
601         }
602
603         rx_buf->mbuf = NULL;
604         return (1);
605
606  deliver_buf_out:
607         sfxge_rx_deliver(rxq->sc, rx_buf);
608         return (1);
609 }
610
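/* Start tracking a new connection, either by recycling an entry from
 * the free list or by allocating one, unless the hash chain has already
 * reached lro_chain_max entries.
 */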
611 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
612                                uint16_t l2_id, void *nh, struct tcphdr *th)
613 {
614         unsigned bucket = conn_hash & st->conns_mask;
615         struct sfxge_lro_conn *c;
616
617         if (st->conns_n[bucket] >= lro_chain_max) {
618                 ++st->n_too_many;
619                 return;
620         }
621
622         if (!TAILQ_EMPTY(&st->free_conns)) {
623                 c = TAILQ_FIRST(&st->free_conns);
624                 TAILQ_REMOVE(&st->free_conns, c, link);
625         } else {
626                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
627                 if (c == NULL)
628                         return;
629                 c->mbuf = NULL;
630                 c->next_buf.mbuf = NULL;
631         }
632
633         /* Create the connection tracking data */
634         ++st->conns_n[bucket];
635         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
636         c->l2_id = l2_id;
637         c->conn_hash = conn_hash;
638         c->source = th->th_sport;
639         c->dest = th->th_dport;
640         c->n_in_order_pkts = 0;
641         c->last_pkt_ticks = *(volatile int *)&ticks;
642         c->delivered = 0;
643         ++st->n_new_stream;
644         /* NB. We don't initialise c->next_seq, and it doesn't matter what
645          * value it has.  Most likely the next packet received for this
646          * connection will not match -- no harm done.
647          */
648 }
649
650 /* Process mbuf and decide whether to dispatch it to the stack now or
651  * later.
652  */
653 static void
654 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
655 {
656         struct sfxge_softc *sc = rxq->sc;
657         struct mbuf *m = rx_buf->mbuf;
658         struct ether_header *eh;
659         struct sfxge_lro_conn *c;
660         uint16_t l2_id;
661         uint16_t l3_proto;
662         void *nh;
663         struct tcphdr *th;
664         uint32_t conn_hash;
665         unsigned bucket;
666
667         /* Get the hardware hash */
668         conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
669                                       mtod(m, uint8_t *));
670
671         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
672         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
673                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
674                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
675                         SFXGE_LRO_L2_ID_VLAN;
676                 l3_proto = veh->evl_proto;
677                 nh = veh + 1;
678         } else {
679                 l2_id = 0;
680                 l3_proto = eh->ether_type;
681                 nh = eh + 1;
682         }
683
684         /* Check whether this is a suitable packet (unfragmented
685          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
686          * length, and compute a hash if necessary.  If not, return.
687          */
688         if (l3_proto == htons(ETHERTYPE_IP)) {
689                 struct ip *iph = nh;
690                 if ((iph->ip_p - IPPROTO_TCP) |
691                     (iph->ip_hl - (sizeof(*iph) >> 2u)) |
692                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
693                         goto deliver_now;
694                 th = (struct tcphdr *)(iph + 1);
695         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
696                 struct ip6_hdr *iph = nh;
697                 if (iph->ip6_nxt != IPPROTO_TCP)
698                         goto deliver_now;
699                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
700                 th = (struct tcphdr *)(iph + 1);
701         } else {
702                 goto deliver_now;
703         }
704
705         bucket = conn_hash & rxq->lro.conns_mask;
706
707         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
708                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
709                         continue;
710                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
711                         continue;
712                 if (c->mbuf != NULL) {
713                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
714                                 struct ip *c_iph, *iph = nh;
715                                 c_iph = c->nh;
716                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
717                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
718                                         continue;
719                         } else {
720                                 struct ip6_hdr *c_iph, *iph = nh;
721                                 c_iph = c->nh;
722                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
723                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
724                                         continue;
725                         }
726                 }
727
728                 /* Re-insert at head of list to reduce lookup time. */
729                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
730                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
731
732                 if (c->next_buf.mbuf != NULL) {
733                         if (!sfxge_lro_try_merge(rxq, c))
734                                 goto deliver_now;
735                 } else {
736                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
737                             active_link);
738                 }
739                 c->next_buf = *rx_buf;
740                 c->next_eh = eh;
741                 c->next_nh = nh;
742
743                 rx_buf->mbuf = NULL;
744                 rx_buf->flags = EFX_DISCARD;
745                 return;
746         }
747
748         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
749  deliver_now:
750         sfxge_rx_deliver(sc, rx_buf);
751 }
752
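/* Called at the end of an event-queue poll: try to merge or deliver any
 * buffered segments on active connections, and periodically purge
 * connections that have been idle for longer than lro_idle_ticks.
 */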
753 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
754 {
755         struct sfxge_lro_state *st = &rxq->lro;
756         struct sfxge_lro_conn *c;
757         unsigned t;
758
759         while (!LIST_EMPTY(&st->active_conns)) {
760                 c = LIST_FIRST(&st->active_conns);
761                 if (!c->delivered && c->mbuf != NULL)
762                         sfxge_lro_deliver(st, c);
763                 if (sfxge_lro_try_merge(rxq, c)) {
764                         if (c->mbuf != NULL)
765                                 sfxge_lro_deliver(st, c);
766                         LIST_REMOVE(c, active_link);
767                 }
768                 c->delivered = 0;
769         }
770
771         t = *(volatile int *)&ticks;
772         if (__predict_false(t != st->last_purge_ticks))
773                 sfxge_lro_purge_idle(rxq, t);
774 }
775
776 #else   /* !SFXGE_LRO */
777
778 static void
779 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
780 {
781 }
782
783 static void
784 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
785 {
786 }
787
788 #endif  /* SFXGE_LRO */
789
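/* Process the descriptors completed since the last call (from
 * rxq->completed up to rxq->pending): discard errored or loopback
 * packets, pass the rest to LRO or straight up the stack (delivery is
 * pipelined one packet behind so the current mbuf can be prefetched),
 * and top up the ring if it has dropped below the refill threshold.
 */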
790 void
791 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
792 {
793         struct sfxge_softc *sc = rxq->sc;
794         int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
795         unsigned int index;
796         struct sfxge_evq *evq;
797         unsigned int completed;
798         unsigned int level;
799         struct mbuf *m;
800         struct sfxge_rx_sw_desc *prev = NULL;
801
802         index = rxq->index;
803         evq = sc->evq[index];
804
805         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
806
807         completed = rxq->completed;
808         while (completed != rxq->pending) {
809                 unsigned int id;
810                 struct sfxge_rx_sw_desc *rx_desc;
811
812                 id = completed++ & rxq->ptr_mask;
813                 rx_desc = &rxq->queue[id];
814                 m = rx_desc->mbuf;
815
816                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
817                         goto discard;
818
819                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
820                         goto discard;
821
822                 prefetch_read_many(mtod(m, caddr_t));
823
824                 /* Check for loopback packets */
825                 if (!(rx_desc->flags & EFX_PKT_IPV4) &&
826                     !(rx_desc->flags & EFX_PKT_IPV6)) {
827                         struct ether_header *etherhp;
828
829                         /*LINTED*/
830                         etherhp = mtod(m, struct ether_header *);
831
832                         if (etherhp->ether_type ==
833                             htons(SFXGE_ETHERTYPE_LOOPBACK)) {
834                                 EFSYS_PROBE(loopback);
835
836                                 rxq->loopback++;
837                                 goto discard;
838                         }
839                 }
840
841                 /* Pass packet up the stack or into LRO (pipelined) */
842                 if (prev != NULL) {
843                         if (lro_enabled)
844                                 sfxge_lro(rxq, prev);
845                         else
846                                 sfxge_rx_deliver(sc, prev);
847                 }
848                 prev = rx_desc;
849                 continue;
850
851 discard:
852                 /* Return the packet to the pool */
853                 m_free(m);
854                 rx_desc->mbuf = NULL;
855         }
856         rxq->completed = completed;
857
858         level = rxq->added - rxq->completed;
859
860         /* Pass last packet up the stack or into LRO */
861         if (prev != NULL) {
862                 if (lro_enabled)
863                         sfxge_lro(rxq, prev);
864                 else
865                         sfxge_rx_deliver(sc, prev);
866         }
867
868         /*
869          * If there are any pending flows and this is the end of the
870          * poll then they must be completed.
871          */
872         if (eop)
873                 sfxge_lro_end_of_burst(rxq);
874
875         /* Top up the queue if necessary */
876         if (level < rxq->refill_threshold)
877                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
878 }
879
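/* Stop a receive queue: cancel the refill callout, flush the hardware
 * queue (retrying if the flush fails), complete any outstanding
 * descriptors, and destroy the common-code queue and its buffer-table
 * entries.
 */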
880 static void
881 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
882 {
883         struct sfxge_rxq *rxq;
884         struct sfxge_evq *evq;
885         unsigned int count;
886
887         rxq = sc->rxq[index];
888         evq = sc->evq[index];
889
890         SFXGE_EVQ_LOCK(evq);
891
892         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
893             ("rxq not started"));
894
895         rxq->init_state = SFXGE_RXQ_INITIALIZED;
896
897         callout_stop(&rxq->refill_callout);
898
899 again:
900         rxq->flush_state = SFXGE_FLUSH_PENDING;
901
902         /* Flush the receive queue */
903         efx_rx_qflush(rxq->common);
904
905         SFXGE_EVQ_UNLOCK(evq);
906
907         count = 0;
908         do {
909                 /* Spin for 100 ms */
910                 DELAY(100000);
911
912                 if (rxq->flush_state != SFXGE_FLUSH_PENDING)
913                         break;
914
915         } while (++count < 20);
916
917         SFXGE_EVQ_LOCK(evq);
918
919         if (rxq->flush_state == SFXGE_FLUSH_FAILED)
920                 goto again;
921
922         rxq->flush_state = SFXGE_FLUSH_DONE;
923
924         rxq->pending = rxq->added;
925         sfxge_rx_qcomplete(rxq, B_TRUE);
926
927         KASSERT(rxq->completed == rxq->pending,
928             ("rxq->completed != rxq->pending"));
929
930         rxq->added = 0;
931         rxq->pending = 0;
932         rxq->completed = 0;
933         rxq->loopback = 0;
934
935         /* Destroy the common code receive queue. */
936         efx_rx_qdestroy(rxq->common);
937
938         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
939             EFX_RXQ_NBUFS(sc->rxq_entries));
940
941         SFXGE_EVQ_UNLOCK(evq);
942 }
943
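/* Start a receive queue: program the buffer table, create and enable
 * the common-code queue, and perform the initial fill from the mbuf
 * pool.
 */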
944 static int
945 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
946 {
947         struct sfxge_rxq *rxq;
948         efsys_mem_t *esmp;
949         struct sfxge_evq *evq;
950         int rc;
951
952         rxq = sc->rxq[index];
953         esmp = &rxq->mem;
954         evq = sc->evq[index];
955
956         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
957             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
958         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
959             ("evq->init_state != SFXGE_EVQ_STARTED"));
960
961         /* Program the buffer table. */
962         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
963             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
964                 return (rc);
965
966         /* Create the common code receive queue. */
967         if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
968             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
969             &rxq->common)) != 0)
970                 goto fail;
971
972         SFXGE_EVQ_LOCK(evq);
973
974         /* Enable the receive queue. */
975         efx_rx_qenable(rxq->common);
976
977         rxq->init_state = SFXGE_RXQ_STARTED;
978
979         /* Try to fill the queue from the pool. */
980         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
981
982         SFXGE_EVQ_UNLOCK(evq);
983
984         return (0);
985
986 fail:
987         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
988             EFX_RXQ_NBUFS(sc->rxq_entries));
989         return (rc);
990 }
991
992 void
993 sfxge_rx_stop(struct sfxge_softc *sc)
994 {
995         int index;
996
997         /* Stop the receive queue(s) */
998         index = sc->rxq_count;
999         while (--index >= 0)
1000                 sfxge_rx_qstop(sc, index);
1001
1002         sc->rx_prefix_size = 0;
1003         sc->rx_buffer_size = 0;
1004
1005         efx_rx_fini(sc->enp);
1006 }
1007
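/* Start the receive path: initialise the common-code RX module, size
 * the packet buffers and pick an mbuf zone for the current MTU, program
 * the RSS indirection table, hash types and Toeplitz key, then start
 * each receive queue.
 */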
1008 int
1009 sfxge_rx_start(struct sfxge_softc *sc)
1010 {
1011         struct sfxge_intr *intr;
1012         int index;
1013         int rc;
1014
1015         intr = &sc->intr;
1016
1017         /* Initialize the common code receive module. */
1018         if ((rc = efx_rx_init(sc->enp)) != 0)
1019                 return (rc);
1020
1021         /* Calculate the receive packet buffer size. */
1022         sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1023         sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1024                               sc->rx_prefix_size);
1025
1026         /* Select zone for packet buffers */
1027         if (sc->rx_buffer_size <= MCLBYTES)
1028                 sc->rx_buffer_zone = zone_clust;
1029         else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1030                 sc->rx_buffer_zone = zone_jumbop;
1031         else if (sc->rx_buffer_size <= MJUM9BYTES)
1032                 sc->rx_buffer_zone = zone_jumbo9;
1033         else
1034                 sc->rx_buffer_zone = zone_jumbo16;
1035
1036         /*
1037          * Set up the scale table.  Enable all hash types and hash insertion.
1038          */
1039         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1040                 sc->rx_indir_table[index] = index % sc->rxq_count;
1041         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1042                                        SFXGE_RX_SCALE_MAX)) != 0)
1043                 goto fail;
1044         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1045             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1046             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1047
1048         if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1049             sizeof(toep_key))) != 0)
1050                 goto fail;
1051
1052         /* Start the receive queue(s). */
1053         for (index = 0; index < sc->rxq_count; index++) {
1054                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1055                         goto fail2;
1056         }
1057
1058         return (0);
1059
1060 fail2:
1061         while (--index >= 0)
1062                 sfxge_rx_qstop(sc, index);
1063
1064 fail:
1065         efx_rx_fini(sc->enp);
1066
1067         return (rc);
1068 }
1069
1070 #ifdef SFXGE_LRO
1071
1072 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1073 {
1074         struct sfxge_lro_state *st = &rxq->lro;
1075         unsigned i;
1076
1077         st->conns_mask = lro_table_size - 1;
1078         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1079                 ("lro_table_size must be a power of 2"));
1080         st->sc = rxq->sc;
1081         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1082                            M_SFXGE, M_WAITOK);
1083         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1084                              M_SFXGE, M_WAITOK);
1085         for (i = 0; i <= st->conns_mask; ++i) {
1086                 TAILQ_INIT(&st->conns[i]);
1087                 st->conns_n[i] = 0;
1088         }
1089         LIST_INIT(&st->active_conns);
1090         TAILQ_INIT(&st->free_conns);
1091 }
1092
1093 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1094 {
1095         struct sfxge_lro_state *st = &rxq->lro;
1096         struct sfxge_lro_conn *c;
1097         unsigned i;
1098
1099         /* Return cleanly if sfxge_lro_init() has not been called. */
1100         if (st->conns == NULL)
1101                 return;
1102
1103         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1104
1105         for (i = 0; i <= st->conns_mask; ++i) {
1106                 while (!TAILQ_EMPTY(&st->conns[i])) {
1107                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1108                         sfxge_lro_drop(rxq, c);
1109                 }
1110         }
1111
1112         while (!TAILQ_EMPTY(&st->free_conns)) {
1113                 c = TAILQ_FIRST(&st->free_conns);
1114                 TAILQ_REMOVE(&st->free_conns, c, link);
1115                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1116                 free(c, M_SFXGE);
1117         }
1118
1119         free(st->conns_n, M_SFXGE);
1120         free(st->conns, M_SFXGE);
1121         st->conns = NULL;
1122 }
1123
1124 #else
1125
1126 static void
1127 sfxge_lro_init(struct sfxge_rxq *rxq)
1128 {
1129 }
1130
1131 static void
1132 sfxge_lro_fini(struct sfxge_rxq *rxq)
1133 {
1134 }
1135
1136 #endif  /* SFXGE_LRO */
1137
1138 static void
1139 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1140 {
1141         struct sfxge_rxq *rxq;
1142
1143         rxq = sc->rxq[index];
1144
1145         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1146             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1147
1148         /* Free the context array and the flow table. */
1149         free(rxq->queue, M_SFXGE);
1150         sfxge_lro_fini(rxq);
1151
1152         /* Release DMA memory. */
1153         sfxge_dma_free(&rxq->mem);
1154
1155         sc->rxq[index] = NULL;
1156
1157         free(rxq, M_SFXGE);
1158 }
1159
1160 static int
1161 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1162 {
1163         struct sfxge_rxq *rxq;
1164         struct sfxge_evq *evq;
1165         efsys_mem_t *esmp;
1166         int rc;
1167
1168         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1169
1170         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1171         rxq->sc = sc;
1172         rxq->index = index;
1173         rxq->entries = sc->rxq_entries;
1174         rxq->ptr_mask = rxq->entries - 1;
1175         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1176
1177         sc->rxq[index] = rxq;
1178         esmp = &rxq->mem;
1179
1180         evq = sc->evq[index];
1181
1182         /* Allocate and zero DMA space. */
1183         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1184                 return (rc);
1185
1186         /* Allocate buffer table entries. */
1187         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1188                                  &rxq->buf_base_id);
1189
1190         /* Allocate the context array and the flow table. */
1191         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1192             M_SFXGE, M_WAITOK | M_ZERO);
1193         sfxge_lro_init(rxq);
1194
1195         callout_init(&rxq->refill_callout, B_TRUE);
1196
1197         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1198
1199         return (0);
1200 }
1201
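/* Per-queue statistics exported via sysctl; sfxge_rx_stat_handler()
 * sums each counter across all receive queues.
 */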
1202 static const struct {
1203         const char *name;
1204         size_t offset;
1205 } sfxge_rx_stats[] = {
1206 #define SFXGE_RX_STAT(name, member) \
1207         { #name, offsetof(struct sfxge_rxq, member) }
1208 #ifdef SFXGE_LRO
1209         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1210         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1211         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1212         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1213         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1214         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1215         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1216         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1217 #endif
1218 };
1219
1220 static int
1221 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1222 {
1223         struct sfxge_softc *sc = arg1;
1224         unsigned int id = arg2;
1225         unsigned int sum, index;
1226
1227         /* Sum across all RX queues */
1228         sum = 0;
1229         for (index = 0; index < sc->rxq_count; index++)
1230                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1231                                          sfxge_rx_stats[id].offset);
1232
1233         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1234 }
1235
1236 static void
1237 sfxge_rx_stat_init(struct sfxge_softc *sc)
1238 {
1239         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1240         struct sysctl_oid_list *stat_list;
1241         unsigned int id;
1242
1243         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1244
1245         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1246                 SYSCTL_ADD_PROC(
1247                         ctx, stat_list,
1248                         OID_AUTO, sfxge_rx_stats[id].name,
1249                         CTLTYPE_UINT|CTLFLAG_RD,
1250                         sc, id, sfxge_rx_stat_handler, "IU",
1251                         "");
1252         }
1253 }
1254
1255 void
1256 sfxge_rx_fini(struct sfxge_softc *sc)
1257 {
1258         int index;
1259
1260         index = sc->rxq_count;
1261         while (--index >= 0)
1262                 sfxge_rx_qfini(sc, index);
1263
1264         sc->rxq_count = 0;
1265 }
1266
1267 int
1268 sfxge_rx_init(struct sfxge_softc *sc)
1269 {
1270         struct sfxge_intr *intr;
1271         int index;
1272         int rc;
1273
1274 #ifdef SFXGE_LRO
1275         if (!ISP2(lro_table_size)) {
1276                 log(LOG_ERR, "%s=%u must be power of 2",
1277                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1278                 rc = EINVAL;
1279                 goto fail_lro_table_size;
1280         }
1281
1282         if (lro_idle_ticks == 0)
1283                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1284 #endif
1285
1286         intr = &sc->intr;
1287
1288         sc->rxq_count = intr->n_alloc;
1289
1290         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1291             ("intr->state != SFXGE_INTR_INITIALIZED"));
1292
1293         /* Initialize the receive queue(s) - one per interrupt. */
1294         for (index = 0; index < sc->rxq_count; index++) {
1295                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1296                         goto fail;
1297         }
1298
1299         sfxge_rx_stat_init(sc);
1300
1301         return (0);
1302
1303 fail:
1304         /* Tear down the receive queue(s). */
1305         while (--index >= 0)
1306                 sfxge_rx_qfini(sc, index);
1307
1308         sc->rxq_count = 0;
1309
1310 #ifdef SFXGE_LRO
1311 fail_lro_table_size:
1312 #endif
1313         return (rc);
1314 }