[FreeBSD/stable/10] sys/dev/sfxge/sfxge_rx.c (MFC r300607)
1 /*-
2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/types.h>
38 #include <sys/mbuf.h>
39 #include <sys/smp.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/syslog.h>
43 #include <sys/limits.h>
45
46 #include <net/ethernet.h>
47 #include <net/if.h>
48 #include <net/if_vlan_var.h>
49
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/tcp.h>
54
55 #include <machine/in_cksum.h>
56
57 #include "common/efx.h"
58
60 #include "sfxge.h"
61 #include "sfxge_rx.h"
62
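/* Refill threshold: sfxge_rx_qcomplete() tops the queue back up once the
 * outstanding fill level drops below 90% of the queue limit.
 */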
63 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
64
65 #ifdef SFXGE_LRO
66
67 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
68             "Large receive offload (LRO) parameters");
69
70 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
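/* SFXGE_PARAM() produces a "hw.sfxge."-prefixed name (see sfxge.h), so the
 * CTLFLAG_RDTUN parameters below are settable as loader tunables, e.g. in
 * loader.conf:
 *   hw.sfxge.lro.table_size=256
 */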
71
72 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
73  * means we can accelerate a larger number of streams.
74  */
75 static unsigned lro_table_size = 128;
76 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
77 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
78             &lro_table_size, 0,
79             "Size of the LRO hash table (must be a power of 2)");
80
81 /* Maximum length of a hash chain.  If chains get too long then the lookup
82  * time increases and may exceed the benefit of LRO.
83  */
84 static unsigned lro_chain_max = 20;
85 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
87             &lro_chain_max, 0,
88             "The maximum length of a hash chain");
89
90 /* Maximum time (in ticks) that a connection can be idle before its LRO
91  * state is discarded.
92  */
93 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
94 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
96             &lro_idle_ticks, 0,
97             "The maximum time (in ticks) that a connection can be idle "
98             "before it's LRO state is discarded");
99
100 /* Number of packets with payload that must arrive in-order before a
101  * connection is eligible for LRO.  The idea is we should avoid coalescing
102  * segments when the sender is in slow-start because reducing the ACK rate
103  * can damage performance.
104  */
105 static int lro_slow_start_packets = 2000;
106 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
107 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
108             &lro_slow_start_packets, 0,
109             "Number of packets with payload that must arrive in-order before "
110             "a connection is eligible for LRO");
111
112 /* Number of packets with payload that must arrive in-order following loss
113  * before a connection is eligible for LRO.  The idea is we should avoid
114  * coalescing segments when the sender is recovering from loss, because
115  * reducing the ACK rate can damage performance.
116  */
117 static int lro_loss_packets = 20;
118 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
119 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
120             &lro_loss_packets, 0,
121             "Number of packets with payload that must arrive in-order "
122             "following loss before a connection is eligible for LRO");
123
124 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
125 #define SFXGE_LRO_L2_ID_VLAN 0x4000
126 #define SFXGE_LRO_L2_ID_IPV6 0x8000
127 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
128 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
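/* EVL_VLID_MASK is 0x0FFF, so the flag bits above cannot clash with a
 * stored VLAN ID.
 */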
129
130 /* Compare IPv6 addresses, avoiding conditional branches */
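/* The result is zero iff the addresses are equal; callers only test it for
 * zero/non-zero (see the connection lookup in sfxge_lro()).
 */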
131 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
132                                    const struct in6_addr *right)
133 {
134 #if LONG_BIT == 64
135         const uint64_t *left64 = (const uint64_t *)left;
136         const uint64_t *right64 = (const uint64_t *)right;
137         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
138 #else
139         return (left->s6_addr32[0] - right->s6_addr32[0]) |
140                (left->s6_addr32[1] - right->s6_addr32[1]) |
141                (left->s6_addr32[2] - right->s6_addr32[2]) |
142                (left->s6_addr32[3] - right->s6_addr32[3]);
143 #endif
144 }
145
146 #endif  /* SFXGE_LRO */
147
148 void
149 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
150 {
151
152         rxq->flush_state = SFXGE_FLUSH_DONE;
153 }
154
155 void
156 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
157 {
158
159         rxq->flush_state = SFXGE_FLUSH_FAILED;
160 }
161
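/* Default RSS hash key, passed to efx_rx_scale_key_set() in sfxge_rx_start().
 * This appears to be the standard 40-byte Toeplitz example key from the
 * Microsoft RSS specification.
 */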
162 static uint8_t toep_key[] = {
163         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
168 };
169
170 static void
171 sfxge_rx_post_refill(void *arg)
172 {
173         struct sfxge_rxq *rxq = arg;
174         struct sfxge_softc *sc;
175         unsigned int index;
176         struct sfxge_evq *evq;
177         uint16_t magic;
178
179         sc = rxq->sc;
180         index = rxq->index;
181         evq = sc->evq[index];
182         magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
183
184         /* This is guaranteed due to the start/stop order of rx and ev */
185         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
186             ("evq not started"));
187         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
188             ("rxq not started"));
189         efx_ev_qpost(evq->common, magic);
190 }
191
192 static void
193 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
194 {
195         /* Initially retry after 100 ms, but back off in case of
196          * repeated failures as we probably have to wait for the
197          * administrator to raise the pool limit. */
198         if (retrying)
199                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
200         else
201                 rxq->refill_delay = hz / 10;
202
203         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
204                              sfxge_rx_post_refill, rxq);
205 }
206
207 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
208 {
209         struct mb_args args;
210         struct mbuf *m;
211
212         /* Allocate mbuf structure */
213         args.flags = M_PKTHDR;
214         args.type = MT_DATA;
215         m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
216
217         /* Allocate (and attach) packet buffer */
218         if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
219                 uma_zfree(zone_mbuf, m);
220                 m = NULL;
221         }
222
223         return (m);
224 }
225
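/* Maximum number of RX descriptors posted per efx_rx_qpost() call in
 * sfxge_rx_qfill().
 */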
226 #define SFXGE_REFILL_BATCH  64
227
228 static void
229 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
230 {
231         struct sfxge_softc *sc;
232         unsigned int index;
233         struct sfxge_evq *evq;
234         unsigned int batch;
235         unsigned int rxfill;
236         unsigned int mblksize;
237         int ntodo;
238         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
239
240         sc = rxq->sc;
241         index = rxq->index;
242         evq = sc->evq[index];
243
244         prefetch_read_many(sc->enp);
245         prefetch_read_many(rxq->common);
246
247         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
248
249         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
250                 return;
251
252         rxfill = rxq->added - rxq->completed;
253         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
254             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
255         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
256         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
257             ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
258
259         if (ntodo == 0)
260                 return;
261
262         batch = 0;
263         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
264         while (ntodo-- > 0) {
265                 unsigned int id;
266                 struct sfxge_rx_sw_desc *rx_desc;
267                 bus_dma_segment_t seg;
268                 struct mbuf *m;
269
270                 id = (rxq->added + batch) & rxq->ptr_mask;
271                 rx_desc = &rxq->queue[id];
272                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
273
274                 rx_desc->flags = EFX_DISCARD;
275                 m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
276                 if (m == NULL)
277                         break;
278
279                 /* m_len specifies length of area to be mapped for DMA */
280                 m->m_len  = mblksize;
281                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
282                 m->m_data += sc->rx_buffer_align;
283
284                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
285                 addr[batch++] = seg.ds_addr;
286
287                 if (batch == SFXGE_REFILL_BATCH) {
288                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
289                             rxq->completed, rxq->added);
290                         rxq->added += batch;
291                         batch = 0;
292                 }
293         }
294
295         if (ntodo != 0)
296                 sfxge_rx_schedule_refill(rxq, retrying);
297
298         if (batch != 0) {
299                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
300                     rxq->completed, rxq->added);
301                 rxq->added += batch;
302         }
303
304         /* Make the descriptors visible to the hardware */
305         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
306                         BUS_DMASYNC_PREWRITE);
307
308         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
309
310         /* The queue could still be empty if no descriptors were actually
311          * pushed, in which case there will be no event to cause the next
312          * refill, so we must schedule a refill ourselves.
313          */
314         if (rxq->pushed == rxq->completed) {
315                 sfxge_rx_schedule_refill(rxq, retrying);
316         }
317 }
318
319 void
320 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
321 {
322
323         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
324                 return;
325
326         /* Make sure the queue is full */
327         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
328 }
329
330 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
331 {
332         struct ifnet *ifp = sc->ifnet;
333
334         m->m_pkthdr.rcvif = ifp;
335         m->m_pkthdr.csum_data = 0xffff;
336         ifp->if_input(ifp, m);
337 }
338
339 static void
340 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
341 {
342         struct mbuf *m = rx_desc->mbuf;
343         int flags = rx_desc->flags;
344         int csum_flags;
345
346         /* Convert checksum flags */
347         csum_flags = (flags & EFX_CKSUM_IPV4) ?
348                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
349         if (flags & EFX_CKSUM_TCPUDP)
350                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
351
352         /* The hash covers a 4-tuple for TCP only */
353         if (flags & EFX_PKT_TCP) {
354                 m->m_pkthdr.flowid =
355                         efx_psuedo_hdr_hash_get(sc->enp,
356                                                 EFX_RX_HASHALG_TOEPLITZ,
357                                                 mtod(m, uint8_t *));
358                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
359         }
360         m->m_data += sc->rx_prefix_size;
361         m->m_len = rx_desc->size - sc->rx_prefix_size;
362         m->m_pkthdr.len = m->m_len;
363         m->m_pkthdr.csum_flags = csum_flags;
364         __sfxge_rx_deliver(sc, rx_desc->mbuf);
365
366         rx_desc->flags = EFX_DISCARD;
367         rx_desc->mbuf = NULL;
368 }
369
370 #ifdef SFXGE_LRO
371
372 static void
373 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
374 {
375         struct sfxge_softc *sc = st->sc;
376         struct mbuf *m = c->mbuf;
377         struct tcphdr *c_th;
378         int csum_flags;
379
380         KASSERT(m, ("no mbuf to deliver"));
381
382         ++st->n_bursts;
383
384         /* Finish off packet munging and recalculate IP header checksum. */
385         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
386                 struct ip *iph = c->nh;
387                 iph->ip_len = htons(iph->ip_len);
388                 iph->ip_sum = 0;
389                 iph->ip_sum = in_cksum_hdr(iph);
390                 c_th = (struct tcphdr *)(iph + 1);
391                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
392                               CSUM_IP_CHECKED | CSUM_IP_VALID);
393         } else {
394                 struct ip6_hdr *iph = c->nh;
395                 iph->ip6_plen = htons(iph->ip6_plen);
396                 c_th = (struct tcphdr *)(iph + 1);
397                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
398         }
399
400         c_th->th_win = c->th_last->th_win;
401         c_th->th_ack = c->th_last->th_ack;
402         if (c_th->th_off == c->th_last->th_off) {
403                 /* Copy TCP options (take care to avoid going negative). */
404                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
405                 memcpy(c_th + 1, c->th_last + 1, optlen);
406         }
407
408         m->m_pkthdr.flowid = c->conn_hash;
409         M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
410
411         m->m_pkthdr.csum_flags = csum_flags;
412         __sfxge_rx_deliver(sc, m);
413
414         c->mbuf = NULL;
415         c->delivered = 1;
416 }
417
418 /* Drop the given connection, and add it to the free list. */
419 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
420 {
421         unsigned bucket;
422
423         KASSERT(!c->mbuf, ("found orphaned mbuf"));
424
425         if (c->next_buf.mbuf != NULL) {
426                 sfxge_rx_deliver(rxq->sc, &c->next_buf);
427                 LIST_REMOVE(c, active_link);
428         }
429
430         bucket = c->conn_hash & rxq->lro.conns_mask;
431         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
432         --rxq->lro.conns_n[bucket];
433         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
434         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
435 }
436
437 /* Stop tracking connections that have gone idle in order to keep hash
438  * chains short.
439  */
440 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
441 {
442         struct sfxge_lro_conn *c;
443         unsigned i;
444
445         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
446                 ("found active connections"));
447
448         rxq->lro.last_purge_ticks = now;
449         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
450                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
451                         continue;
452
453                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
454                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
455                         ++rxq->lro.n_drop_idle;
456                         sfxge_lro_drop(rxq, c);
457                 }
458         }
459 }
460
461 static void
462 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
463                 struct mbuf *mbuf, struct tcphdr *th)
464 {
465         struct tcphdr *c_th;
466
467         /* Tack the new mbuf onto the chain. */
468         KASSERT(!mbuf->m_next, ("mbuf already chained"));
469         c->mbuf_tail->m_next = mbuf;
470         c->mbuf_tail = mbuf;
471
472         /* Increase length appropriately */
473         c->mbuf->m_pkthdr.len += mbuf->m_len;
474
475         /* Update the connection state flags */
476         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
477                 struct ip *iph = c->nh;
478                 iph->ip_len += mbuf->m_len;
479                 c_th = (struct tcphdr *)(iph + 1);
480         } else {
481                 struct ip6_hdr *iph = c->nh;
482                 iph->ip6_plen += mbuf->m_len;
483                 c_th = (struct tcphdr *)(iph + 1);
484         }
485         c_th->th_flags |= (th->th_flags & TH_PUSH);
486         c->th_last = th;
487         ++st->n_merges;
488
489         /* Pass packet up now if another segment could overflow the IP
490          * length.
491          */
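        /*
         * 65536 is the 16-bit IP total length limit; the 9200 margin
         * presumably leaves headroom for one more jumbo-MTU sized segment.
         */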
492         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
493                 sfxge_lro_deliver(st, c);
494 }
495
496 static void
497 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
498                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
499 {
500         /* Start the chain */
501         c->mbuf = mbuf;
502         c->mbuf_tail = c->mbuf;
503         c->nh = nh;
504         c->th_last = th;
505
506         mbuf->m_pkthdr.len = mbuf->m_len;
507
508         /* Mangle header fields for later processing */
509         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
510                 struct ip *iph = nh;
511                 iph->ip_len = ntohs(iph->ip_len);
512         } else {
513                 struct ip6_hdr *iph = nh;
514                 iph->ip6_plen = ntohs(iph->ip6_plen);
515         }
516 }
517
518 /* Try to merge or otherwise hold or deliver (as appropriate) the
519  * packet buffered for this connection (c->next_buf).  Return a flag
520  * indicating whether the connection is still active for LRO purposes.
521  */
522 static int
523 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
524 {
525         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
526         char *eh = c->next_eh;
527         int data_length, hdr_length, dont_merge;
528         unsigned th_seq, pkt_length;
529         struct tcphdr *th;
530         unsigned now;
531
532         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
533                 struct ip *iph = c->next_nh;
534                 th = (struct tcphdr *)(iph + 1);
535                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
536         } else {
537                 struct ip6_hdr *iph = c->next_nh;
538                 th = (struct tcphdr *)(iph + 1);
539                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
540         }
541
542         hdr_length = (char *) th + th->th_off * 4 - eh;
543         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
544                        hdr_length);
545         th_seq = ntohl(th->th_seq);
546         dont_merge = ((data_length <= 0)
547                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
548
549         /* Check for options other than aligned timestamp. */
550         if (th->th_off != 5) {
551                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
552                 if (th->th_off == 8 &&
553                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
554                                         (TCPOPT_NOP << 16) |
555                                         (TCPOPT_TIMESTAMP << 8) |
556                                         TCPOLEN_TIMESTAMP)) {
557                         /* timestamp option -- okay */
558                 } else {
559                         dont_merge = 1;
560                 }
561         }
562
563         if (__predict_false(th_seq != c->next_seq)) {
564                 /* Out-of-order, so start counting again. */
565                 if (c->mbuf != NULL)
566                         sfxge_lro_deliver(&rxq->lro, c);
567                 c->n_in_order_pkts -= lro_loss_packets;
568                 c->next_seq = th_seq + data_length;
569                 ++rxq->lro.n_misorder;
570                 goto deliver_buf_out;
571         }
572         c->next_seq = th_seq + data_length;
573
574         now = ticks;
575         if (now - c->last_pkt_ticks > lro_idle_ticks) {
576                 ++rxq->lro.n_drop_idle;
577                 if (c->mbuf != NULL)
578                         sfxge_lro_deliver(&rxq->lro, c);
579                 sfxge_lro_drop(rxq, c);
580                 return (0);
581         }
582         c->last_pkt_ticks = ticks;
583
584         if (c->n_in_order_pkts < lro_slow_start_packets) {
585                 /* May be in slow-start, so don't merge. */
586                 ++rxq->lro.n_slow_start;
587                 ++c->n_in_order_pkts;
588                 goto deliver_buf_out;
589         }
590
591         if (__predict_false(dont_merge)) {
592                 if (c->mbuf != NULL)
593                         sfxge_lro_deliver(&rxq->lro, c);
594                 if (th->th_flags & (TH_FIN | TH_RST)) {
595                         ++rxq->lro.n_drop_closed;
596                         sfxge_lro_drop(rxq, c);
597                         return (0);
598                 }
599                 goto deliver_buf_out;
600         }
601
602         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
603
604         if (__predict_true(c->mbuf != NULL)) {
605                 /* Remove headers and any padding */
606                 rx_buf->mbuf->m_data += hdr_length;
607                 rx_buf->mbuf->m_len = data_length;
608
609                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
610         } else {
611                 /* Remove any padding */
612                 rx_buf->mbuf->m_len = pkt_length;
613
614                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
615         }
616
617         rx_buf->mbuf = NULL;
618         return (1);
619
620  deliver_buf_out:
621         sfxge_rx_deliver(rxq->sc, rx_buf);
622         return (1);
623 }
624
625 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
626                                uint16_t l2_id, void *nh, struct tcphdr *th)
627 {
628         unsigned bucket = conn_hash & st->conns_mask;
629         struct sfxge_lro_conn *c;
630
631         if (st->conns_n[bucket] >= lro_chain_max) {
632                 ++st->n_too_many;
633                 return;
634         }
635
636         if (!TAILQ_EMPTY(&st->free_conns)) {
637                 c = TAILQ_FIRST(&st->free_conns);
638                 TAILQ_REMOVE(&st->free_conns, c, link);
639         } else {
640                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
641                 if (c == NULL)
642                         return;
643                 c->mbuf = NULL;
644                 c->next_buf.mbuf = NULL;
645         }
646
647         /* Create the connection tracking data */
648         ++st->conns_n[bucket];
649         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
650         c->l2_id = l2_id;
651         c->conn_hash = conn_hash;
652         c->source = th->th_sport;
653         c->dest = th->th_dport;
654         c->n_in_order_pkts = 0;
655         c->last_pkt_ticks = *(volatile int *)&ticks;
656         c->delivered = 0;
657         ++st->n_new_stream;
658         /* NB. We don't initialise c->next_seq, and it doesn't matter what
659          * value it has.  Most likely the next packet received for this
660          * connection will not match -- no harm done.
661          */
662 }
663
664 /* Process mbuf and decide whether to dispatch it to the stack now or
665  * later.
666  */
667 static void
668 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
669 {
670         struct sfxge_softc *sc = rxq->sc;
671         struct mbuf *m = rx_buf->mbuf;
672         struct ether_header *eh;
673         struct sfxge_lro_conn *c;
674         uint16_t l2_id;
675         uint16_t l3_proto;
676         void *nh;
677         struct tcphdr *th;
678         uint32_t conn_hash;
679         unsigned bucket;
680
681         /* Get the hardware hash */
682         conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
683                                             EFX_RX_HASHALG_TOEPLITZ,
684                                             mtod(m, uint8_t *));
685
686         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
687         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
688                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
689                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
690                         SFXGE_LRO_L2_ID_VLAN;
691                 l3_proto = veh->evl_proto;
692                 nh = veh + 1;
693         } else {
694                 l2_id = 0;
695                 l3_proto = eh->ether_type;
696                 nh = eh + 1;
697         }
698
699         /* Check whether this is a suitable packet (unfragmented
700          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
701          * length, and compute a hash if necessary.  If not, return.
702          */
703         if (l3_proto == htons(ETHERTYPE_IP)) {
704                 struct ip *iph = nh;
705
706                 KASSERT(iph->ip_p == IPPROTO_TCP,
707                     ("IPv4 protocol is not TCP, but packet marker is set"));
708                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
709                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
710                         goto deliver_now;
711                 th = (struct tcphdr *)(iph + 1);
712         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
713                 struct ip6_hdr *iph = nh;
714
715                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
716                     ("IPv6 next header is not TCP, but packet marker is set"));
717                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
718                 th = (struct tcphdr *)(iph + 1);
719         } else {
720                 goto deliver_now;
721         }
722
723         bucket = conn_hash & rxq->lro.conns_mask;
724
725         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
726                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
727                         continue;
728                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
729                         continue;
730                 if (c->mbuf != NULL) {
731                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
732                                 struct ip *c_iph, *iph = nh;
733                                 c_iph = c->nh;
734                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
735                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
736                                         continue;
737                         } else {
738                                 struct ip6_hdr *c_iph, *iph = nh;
739                                 c_iph = c->nh;
740                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
741                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
742                                         continue;
743                         }
744                 }
745
746                 /* Re-insert at head of list to reduce lookup time. */
747                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
748                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
749
750                 if (c->next_buf.mbuf != NULL) {
751                         if (!sfxge_lro_try_merge(rxq, c))
752                                 goto deliver_now;
753                 } else {
754                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
755                             active_link);
756                 }
757                 c->next_buf = *rx_buf;
758                 c->next_eh = eh;
759                 c->next_nh = nh;
760
761                 rx_buf->mbuf = NULL;
762                 rx_buf->flags = EFX_DISCARD;
763                 return;
764         }
765
766         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
767  deliver_now:
768         sfxge_rx_deliver(sc, rx_buf);
769 }
770
771 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
772 {
773         struct sfxge_lro_state *st = &rxq->lro;
774         struct sfxge_lro_conn *c;
775         unsigned t;
776
777         while (!LIST_EMPTY(&st->active_conns)) {
778                 c = LIST_FIRST(&st->active_conns);
779                 if (!c->delivered && c->mbuf != NULL)
780                         sfxge_lro_deliver(st, c);
781                 if (sfxge_lro_try_merge(rxq, c)) {
782                         if (c->mbuf != NULL)
783                                 sfxge_lro_deliver(st, c);
784                         LIST_REMOVE(c, active_link);
785                 }
786                 c->delivered = 0;
787         }
788
789         t = *(volatile int *)&ticks;
790         if (__predict_false(t != st->last_purge_ticks))
791                 sfxge_lro_purge_idle(rxq, t);
792 }
793
794 #else   /* !SFXGE_LRO */
795
796 static void
797 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
798 {
799 }
800
801 static void
802 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
803 {
804 }
805
806 #endif  /* SFXGE_LRO */
807
808 void
809 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
810 {
811         struct sfxge_softc *sc = rxq->sc;
812         int if_capenable = sc->ifnet->if_capenable;
813         int lro_enabled = if_capenable & IFCAP_LRO;
814         unsigned int index;
815         struct sfxge_evq *evq;
816         unsigned int completed;
817         unsigned int level;
818         struct mbuf *m;
819         struct sfxge_rx_sw_desc *prev = NULL;
820
821         index = rxq->index;
822         evq = sc->evq[index];
823
824         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
825
826         completed = rxq->completed;
827         while (completed != rxq->pending) {
828                 unsigned int id;
829                 struct sfxge_rx_sw_desc *rx_desc;
830
831                 id = completed++ & rxq->ptr_mask;
832                 rx_desc = &rxq->queue[id];
833                 m = rx_desc->mbuf;
834
835                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
836                         goto discard;
837
838                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
839                         goto discard;
840
841                 /* Read the length from the pseudo header if required */
842                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
843                         uint16_t tmp_size;
844                         int rc;
845                         rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
846                                                            mtod(m, uint8_t *),
847                                                            &tmp_size);
848                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
849                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
850                 }
851
852                 prefetch_read_many(mtod(m, caddr_t));
853
854                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
855                 case EFX_PKT_IPV4:
856                         if (~if_capenable & IFCAP_RXCSUM)
857                                 rx_desc->flags &=
858                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
859                         break;
860                 case EFX_PKT_IPV6:
861                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
862                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
863                         break;
864                 case 0:
865                         /* Check for loopback packets */
866                         {
867                                 struct ether_header *etherhp;
868
869                                 /*LINTED*/
870                                 etherhp = mtod(m, struct ether_header *);
871
872                                 if (etherhp->ether_type ==
873                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
874                                         EFSYS_PROBE(loopback);
875
876                                         rxq->loopback++;
877                                         goto discard;
878                                 }
879                         }
880                         break;
881                 default:
882                         KASSERT(B_FALSE,
883                             ("Rx descriptor with both IPv4 and IPv6 flags"));
884                         goto discard;
885                 }
886
887                 /* Pass packet up the stack or into LRO (pipelined) */
888                 if (prev != NULL) {
889                         if (lro_enabled &&
890                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
891                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
892                                 sfxge_lro(rxq, prev);
893                         else
894                                 sfxge_rx_deliver(sc, prev);
895                 }
896                 prev = rx_desc;
897                 continue;
898
899 discard:
900                 /* Return the packet to the pool */
901                 m_free(m);
902                 rx_desc->mbuf = NULL;
903         }
904         rxq->completed = completed;
905
906         level = rxq->added - rxq->completed;
907
908         /* Pass last packet up the stack or into LRO */
909         if (prev != NULL) {
910                 if (lro_enabled &&
911                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
912                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
913                         sfxge_lro(rxq, prev);
914                 else
915                         sfxge_rx_deliver(sc, prev);
916         }
917
918         /*
919          * If there are any pending flows and this is the end of the
920          * poll then they must be completed.
921          */
922         if (eop)
923                 sfxge_lro_end_of_burst(rxq);
924
925         /* Top up the queue if necessary */
926         if (level < rxq->refill_threshold)
927                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
928 }
929
930 static void
931 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
932 {
933         struct sfxge_rxq *rxq;
934         struct sfxge_evq *evq;
935         unsigned int count;
936         unsigned int retry = 3;
937
938         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
939
940         rxq = sc->rxq[index];
941         evq = sc->evq[index];
942
943         SFXGE_EVQ_LOCK(evq);
944
945         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
946             ("rxq not started"));
947
948         rxq->init_state = SFXGE_RXQ_INITIALIZED;
949
950         callout_stop(&rxq->refill_callout);
951
952         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
953                 rxq->flush_state = SFXGE_FLUSH_PENDING;
954
955                 SFXGE_EVQ_UNLOCK(evq);
956
957                 /* Flush the receive queue */
958                 if (efx_rx_qflush(rxq->common) != 0) {
959                         SFXGE_EVQ_LOCK(evq);
960                         rxq->flush_state = SFXGE_FLUSH_FAILED;
961                         break;
962                 }
963
964                 count = 0;
965                 do {
966                         /* Spin for 100 ms */
967                         DELAY(100000);
968
969                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
970                                 break;
971
972                 } while (++count < 20);
973
974                 SFXGE_EVQ_LOCK(evq);
975
976                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
977                         /* Flush timeout - neither done nor failed */
978                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
979                             device_get_nameunit(sc->dev), index);
980                         rxq->flush_state = SFXGE_FLUSH_DONE;
981                 }
982                 retry--;
983         }
984         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
985                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
986                     device_get_nameunit(sc->dev), index);
987                 rxq->flush_state = SFXGE_FLUSH_DONE;
988         }
989
990         rxq->pending = rxq->added;
991         sfxge_rx_qcomplete(rxq, B_TRUE);
992
993         KASSERT(rxq->completed == rxq->pending,
994             ("rxq->completed != rxq->pending"));
995
996         rxq->added = 0;
997         rxq->pushed = 0;
998         rxq->pending = 0;
999         rxq->completed = 0;
1000         rxq->loopback = 0;
1001
1002         /* Destroy the common code receive queue. */
1003         efx_rx_qdestroy(rxq->common);
1004
1005         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1006             EFX_RXQ_NBUFS(sc->rxq_entries));
1007
1008         SFXGE_EVQ_UNLOCK(evq);
1009 }
1010
1011 static int
1012 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1013 {
1014         struct sfxge_rxq *rxq;
1015         efsys_mem_t *esmp;
1016         struct sfxge_evq *evq;
1017         int rc;
1018
1019         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1020
1021         rxq = sc->rxq[index];
1022         esmp = &rxq->mem;
1023         evq = sc->evq[index];
1024
1025         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1026             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1027         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1028             ("evq->init_state != SFXGE_EVQ_STARTED"));
1029
1030         /* Program the buffer table. */
1031         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1032             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1033                 return (rc);
1034
1035         /* Create the common code receive queue. */
1036         if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1037             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1038             &rxq->common)) != 0)
1039                 goto fail;
1040
1041         SFXGE_EVQ_LOCK(evq);
1042
1043         /* Enable the receive queue. */
1044         efx_rx_qenable(rxq->common);
1045
1046         rxq->init_state = SFXGE_RXQ_STARTED;
1047         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1048
1049         /* Try to fill the queue from the pool. */
1050         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1051
1052         SFXGE_EVQ_UNLOCK(evq);
1053
1054         return (0);
1055
1056 fail:
1057         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1058             EFX_RXQ_NBUFS(sc->rxq_entries));
1059         return (rc);
1060 }
1061
1062 void
1063 sfxge_rx_stop(struct sfxge_softc *sc)
1064 {
1065         int index;
1066
1067         efx_mac_filter_default_rxq_clear(sc->enp);
1068
1069         /* Stop the receive queue(s) */
1070         index = sc->rxq_count;
1071         while (--index >= 0)
1072                 sfxge_rx_qstop(sc, index);
1073
1074         sc->rx_prefix_size = 0;
1075         sc->rx_buffer_size = 0;
1076
1077         efx_rx_fini(sc->enp);
1078 }
1079
1080 int
1081 sfxge_rx_start(struct sfxge_softc *sc)
1082 {
1083         struct sfxge_intr *intr;
1084         const efx_nic_cfg_t *encp;
1085         size_t hdrlen, align, reserved;
1086         int index;
1087         int rc;
1088
1089         intr = &sc->intr;
1090
1091         /* Initialize the common code receive module. */
1092         if ((rc = efx_rx_init(sc->enp)) != 0)
1093                 return (rc);
1094
1095         encp = efx_nic_cfg_get(sc->enp);
1096         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1097
1098         /* Calculate the receive packet buffer size. */ 
1099         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1100
1101         /* Ensure IP headers are 32-bit aligned */
1102         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1103         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
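        /*
         * For example, with a (hypothetical) 16-byte hardware RX prefix and a
         * 14-byte Ethernet header, hdrlen is 30, so rx_buffer_align is 2 and
         * the IP header ends up on a 4-byte boundary.
         */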
1104
1105         sc->rx_buffer_size += sc->rx_buffer_align;
1106
1107         /* Align end of packet buffer for RX DMA end padding */
1108         align = MAX(1, encp->enc_rx_buf_align_end);
1109         EFSYS_ASSERT(ISP2(align));
1110         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1111
1112         /*
1113          * Standard mbuf zones only guarantee pointer-size alignment;
1114          * we need extra space to align to the cache line
1115          */
1116         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1117
1118         /* Select zone for packet buffers */
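        /*
         * On most configurations this chooses between 2 KB clusters,
         * page-sized (typically 4 KB) jumbo buffers, and the 9 KB / 16 KB
         * jumbo buffer zones.
         */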
1119         if (reserved <= MCLBYTES)
1120                 sc->rx_buffer_zone = zone_clust;
1121         else if (reserved <= MJUMPAGESIZE)
1122                 sc->rx_buffer_zone = zone_jumbop;
1123         else if (reserved <= MJUM9BYTES)
1124                 sc->rx_buffer_zone = zone_jumbo9;
1125         else
1126                 sc->rx_buffer_zone = zone_jumbo16;
1127
1128         /*
1129          * Set up the scale table.  Enable all hash types and hash insertion.
1130          */
1131         for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1132                 sc->rx_indir_table[index] = index % sc->rxq_count;
1133         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1134                                        SFXGE_RX_SCALE_MAX)) != 0)
1135                 goto fail;
1136         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1137             (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1138             (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1139
1140         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1141                                        sizeof(toep_key))) != 0)
1142                 goto fail;
1143
1144         /* Start the receive queue(s). */
1145         for (index = 0; index < sc->rxq_count; index++) {
1146                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1147                         goto fail2;
1148         }
1149
1150         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1151                                             sc->intr.n_alloc > 1);
1152         if (rc != 0)
1153                 goto fail3;
1154
1155         return (0);
1156
1157 fail3:
1158 fail2:
1159         while (--index >= 0)
1160                 sfxge_rx_qstop(sc, index);
1161
1162 fail:
1163         efx_rx_fini(sc->enp);
1164
1165         return (rc);
1166 }
1167
1168 #ifdef SFXGE_LRO
1169
1170 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1171 {
1172         struct sfxge_lro_state *st = &rxq->lro;
1173         unsigned i;
1174
1175         st->conns_mask = lro_table_size - 1;
1176         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1177                 ("lro_table_size must be a power of 2"));
1178         st->sc = rxq->sc;
1179         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1180                            M_SFXGE, M_WAITOK);
1181         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1182                              M_SFXGE, M_WAITOK);
1183         for (i = 0; i <= st->conns_mask; ++i) {
1184                 TAILQ_INIT(&st->conns[i]);
1185                 st->conns_n[i] = 0;
1186         }
1187         LIST_INIT(&st->active_conns);
1188         TAILQ_INIT(&st->free_conns);
1189 }
1190
1191 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1192 {
1193         struct sfxge_lro_state *st = &rxq->lro;
1194         struct sfxge_lro_conn *c;
1195         unsigned i;
1196
1197         /* Return cleanly if sfxge_lro_init() has not been called. */
1198         if (st->conns == NULL)
1199                 return;
1200
1201         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1202
1203         for (i = 0; i <= st->conns_mask; ++i) {
1204                 while (!TAILQ_EMPTY(&st->conns[i])) {
1205                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1206                         sfxge_lro_drop(rxq, c);
1207                 }
1208         }
1209
1210         while (!TAILQ_EMPTY(&st->free_conns)) {
1211                 c = TAILQ_FIRST(&st->free_conns);
1212                 TAILQ_REMOVE(&st->free_conns, c, link);
1213                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1214                 free(c, M_SFXGE);
1215         }
1216
1217         free(st->conns_n, M_SFXGE);
1218         free(st->conns, M_SFXGE);
1219         st->conns = NULL;
1220 }
1221
1222 #else
1223
1224 static void
1225 sfxge_lro_init(struct sfxge_rxq *rxq)
1226 {
1227 }
1228
1229 static void
1230 sfxge_lro_fini(struct sfxge_rxq *rxq)
1231 {
1232 }
1233
1234 #endif  /* SFXGE_LRO */
1235
1236 static void
1237 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1238 {
1239         struct sfxge_rxq *rxq;
1240
1241         rxq = sc->rxq[index];
1242
1243         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1244             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1245
1246         /* Free the context array and the flow table. */
1247         free(rxq->queue, M_SFXGE);
1248         sfxge_lro_fini(rxq);
1249
1250         /* Release DMA memory. */
1251         sfxge_dma_free(&rxq->mem);
1252
1253         sc->rxq[index] = NULL;
1254
1255         free(rxq, M_SFXGE);
1256 }
1257
1258 static int
1259 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1260 {
1261         struct sfxge_rxq *rxq;
1262         struct sfxge_evq *evq;
1263         efsys_mem_t *esmp;
1264         int rc;
1265
1266         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1267
1268         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1269         rxq->sc = sc;
1270         rxq->index = index;
1271         rxq->entries = sc->rxq_entries;
1272         rxq->ptr_mask = rxq->entries - 1;
1273         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1274
1275         sc->rxq[index] = rxq;
1276         esmp = &rxq->mem;
1277
1278         evq = sc->evq[index];
1279
1280         /* Allocate and zero DMA space. */
1281         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1282                 return (rc);
1283
1284         /* Allocate buffer table entries. */
1285         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1286                                  &rxq->buf_base_id);
1287
1288         /* Allocate the context array and the flow table. */
1289         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1290             M_SFXGE, M_WAITOK | M_ZERO);
1291         sfxge_lro_init(rxq);
1292
1293         callout_init(&rxq->refill_callout, B_TRUE);
1294
1295         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1296
1297         return (0);
1298 }
1299
1300 static const struct {
1301         const char *name;
1302         size_t offset;
1303 } sfxge_rx_stats[] = {
1304 #define SFXGE_RX_STAT(name, member) \
1305         { #name, offsetof(struct sfxge_rxq, member) }
1306 #ifdef SFXGE_LRO
1307         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1308         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1309         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1310         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1311         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1312         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1313         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1314         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1315 #endif
1316 };
1317
1318 static int
1319 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1320 {
1321         struct sfxge_softc *sc = arg1;
1322         unsigned int id = arg2;
1323         unsigned int sum, index;
1324
1325         /* Sum across all RX queues */
1326         sum = 0;
1327         for (index = 0; index < sc->rxq_count; index++)
1328                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1329                                          sfxge_rx_stats[id].offset);
1330
1331         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1332 }
1333
1334 static void
1335 sfxge_rx_stat_init(struct sfxge_softc *sc)
1336 {
1337         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1338         struct sysctl_oid_list *stat_list;
1339         unsigned int id;
1340
1341         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1342
1343         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1344                 SYSCTL_ADD_PROC(
1345                         ctx, stat_list,
1346                         OID_AUTO, sfxge_rx_stats[id].name,
1347                         CTLTYPE_UINT|CTLFLAG_RD,
1348                         sc, id, sfxge_rx_stat_handler, "IU",
1349                         "");
1350         }
1351 }
1352
1353 void
1354 sfxge_rx_fini(struct sfxge_softc *sc)
1355 {
1356         int index;
1357
1358         index = sc->rxq_count;
1359         while (--index >= 0)
1360                 sfxge_rx_qfini(sc, index);
1361
1362         sc->rxq_count = 0;
1363 }
1364
1365 int
1366 sfxge_rx_init(struct sfxge_softc *sc)
1367 {
1368         struct sfxge_intr *intr;
1369         int index;
1370         int rc;
1371
1372 #ifdef SFXGE_LRO
1373         if (!ISP2(lro_table_size)) {
1374                 log(LOG_ERR, "%s=%u must be power of 2",
1375                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1376                 rc = EINVAL;
1377                 goto fail_lro_table_size;
1378         }
1379
1380         if (lro_idle_ticks == 0)
1381                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1382 #endif
1383
1384         intr = &sc->intr;
1385
1386         sc->rxq_count = intr->n_alloc;
1387
1388         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1389             ("intr->state != SFXGE_INTR_INITIALIZED"));
1390
1391         /* Initialize the receive queue(s) - one per interrupt. */
1392         for (index = 0; index < sc->rxq_count; index++) {
1393                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1394                         goto fail;
1395         }
1396
1397         sfxge_rx_stat_init(sc);
1398
1399         return (0);
1400
1401 fail:
1402         /* Tear down the receive queue(s). */
1403         while (--index >= 0)
1404                 sfxge_rx_qfini(sc, index);
1405
1406         sc->rxq_count = 0;
1407
1408 #ifdef SFXGE_LRO
1409 fail_lro_table_size:
1410 #endif
1411         return (rc);
1412 }