/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
            "Large receive offload (LRO) parameters");

#define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
            &lro_table_size, 0,
            "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
            &lro_chain_max, 0,
            "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
            &lro_idle_ticks, 0,
            "The maximum time (in ticks) that a connection can be idle "
            "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
            &lro_slow_start_packets, 0,
            "Number of packets with payload that must arrive in-order before "
            "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
            &lro_loss_packets, 0,
            "Number of packets with payload that must arrive in-order "
            "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
                                   const struct in6_addr *right)
{
#if LONG_BIT == 64
        const uint64_t *left64 = (const uint64_t *)left;
        const uint64_t *right64 = (const uint64_t *)right;
        return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
        return (left->s6_addr32[0] - right->s6_addr32[0]) |
               (left->s6_addr32[1] - right->s6_addr32[1]) |
               (left->s6_addr32[2] - right->s6_addr32[2]) |
               (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif  /* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_FAILED;
}

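/* RSS hash key programmed into the NIC (Toeplitz) by sfxge_rx_start() */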
static uint8_t toep_key[] = {
        0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
        0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
        0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
        0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
        0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

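/*
 * Callout handler: post a software event to the RX queue's event queue so
 * that the refill itself is performed in event queue context.
 */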
static void
sfxge_rx_post_refill(void *arg)
{
        struct sfxge_rxq *rxq = arg;
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        uint16_t magic;

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];
        magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

        /* This is guaranteed due to the start/stop order of rx and ev */
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq not started"));
        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));
        efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
        /* Initially retry after 100 ms, but back off in case of
         * repeated failures as we probably have to wait for the
         * administrator to raise the pool limit. */
        if (retrying)
                rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
        else
                rxq->refill_delay = hz / 10;

        callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
                             sfxge_rx_post_refill, rxq);
}

#define SFXGE_REFILL_BATCH  64

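/*
 * Allocate mbuf clusters for up to 'target' receive descriptors and post
 * them to the hardware in batches of SFXGE_REFILL_BATCH.  If allocation
 * fails, a retry is scheduled via the refill callout.
 */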
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];

        prefetch_read_many(sc->enp);
        prefetch_read_many(rxq->common);

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        rxfill = rxq->added - rxq->completed;
        KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
            ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
        ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
        KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
            ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

        if (ntodo == 0)
                return;

        batch = 0;
        mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
        while (ntodo-- > 0) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;
                bus_dma_segment_t seg;
                struct mbuf *m;

                id = (rxq->added + batch) & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

                rx_desc->flags = EFX_DISCARD;
                m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
                    sc->rx_cluster_size);
                if (m == NULL)
                        break;

                /* m_len specifies length of area to be mapped for DMA */
                m->m_len  = mblksize;
                m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
                m->m_data += sc->rx_buffer_align;

                sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
                addr[batch++] = seg.ds_addr;

                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(rxq->common, addr, mblksize, batch,
                            rxq->completed, rxq->added);
                        rxq->added += batch;
                        batch = 0;
                }
        }

        if (ntodo != 0)
                sfxge_rx_schedule_refill(rxq, retrying);

        if (batch != 0) {
                efx_rx_qpost(rxq->common, addr, mblksize, batch,
                    rxq->completed, rxq->added);
                rxq->added += batch;
        }

        /* Make the descriptors visible to the hardware */
        bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
                        BUS_DMASYNC_PREWRITE);

        efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

        /* The queue could still be empty if no descriptors were actually
         * pushed, in which case there will be no event to cause the next
         * refill, so we must schedule a refill ourselves.
         */
        if (rxq->pushed == rxq->completed) {
                sfxge_rx_schedule_refill(rxq, retrying);
        }
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        /* Make sure the queue is full */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
        struct ifnet *ifp = sc->ifnet;

        m->m_pkthdr.rcvif = ifp;
        m->m_pkthdr.csum_data = 0xffff;
        ifp->if_input(ifp, m);
}

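/*
 * Hand a received packet to the stack: translate the hardware flags into
 * mbuf checksum flags and an RSS hash type, strip the RX prefix, and call
 * if_input() via __sfxge_rx_deliver().
 */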
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
        struct mbuf *m = rx_desc->mbuf;
        int flags = rx_desc->flags;
        int csum_flags;

        /* Convert checksum flags */
        csum_flags = (flags & EFX_CKSUM_IPV4) ?
                (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
        if (flags & EFX_CKSUM_TCPUDP)
                csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

        if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                m->m_pkthdr.flowid =
                        efx_psuedo_hdr_hash_get(sc->enp,
                                                EFX_RX_HASHALG_TOEPLITZ,
                                                mtod(m, uint8_t *));
                /* The hash covers a 4-tuple for TCP only */
                M_HASHTYPE_SET(m,
                    (flags & EFX_PKT_IPV4) ?
                        ((flags & EFX_PKT_TCP) ?
                            M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
                        ((flags & EFX_PKT_TCP) ?
                            M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
        }
        m->m_data += sc->rx_prefix_size;
        m->m_len = rx_desc->size - sc->rx_prefix_size;
        m->m_pkthdr.len = m->m_len;
        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, rx_desc->mbuf);

        rx_desc->flags = EFX_DISCARD;
        rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

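/*
 * Deliver a coalesced LRO packet: restore the IP and TCP header fields that
 * were mangled during merging, then pass the mbuf chain up the stack.
 */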
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
        struct sfxge_softc *sc = st->sc;
        struct mbuf *m = c->mbuf;
        struct tcphdr *c_th;
        int csum_flags;

        KASSERT(m, ("no mbuf to deliver"));

        ++st->n_bursts;

        /* Finish off packet munging and recalculate IP header checksum. */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len = htons(iph->ip_len);
                iph->ip_sum = 0;
                iph->ip_sum = in_cksum_hdr(iph);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                              CSUM_IP_CHECKED | CSUM_IP_VALID);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen = htons(iph->ip6_plen);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        }

        c_th->th_win = c->th_last->th_win;
        c_th->th_ack = c->th_last->th_ack;
        if (c_th->th_off == c->th_last->th_off) {
                /* Copy TCP options (take care to avoid going negative). */
                int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
                memcpy(c_th + 1, c->th_last + 1, optlen);
        }

        m->m_pkthdr.flowid = c->conn_hash;
        M_HASHTYPE_SET(m,
            SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
                M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, m);

        c->mbuf = NULL;
        c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        unsigned bucket;

        KASSERT(!c->mbuf, ("found orphaned mbuf"));

        if (c->next_buf.mbuf != NULL) {
                sfxge_rx_deliver(rxq->sc, &c->next_buf);
                LIST_REMOVE(c, active_link);
        }

        bucket = c->conn_hash & rxq->lro.conns_mask;
        KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
        --rxq->lro.conns_n[bucket];
        TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
        TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
        struct sfxge_lro_conn *c;
        unsigned i;

        KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
                ("found active connections"));

        rxq->lro.last_purge_ticks = now;
        for (i = 0; i <= rxq->lro.conns_mask; ++i) {
                if (TAILQ_EMPTY(&rxq->lro.conns[i]))
                        continue;

                c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
                if (now - c->last_pkt_ticks > lro_idle_ticks) {
                        ++rxq->lro.n_drop_idle;
                        sfxge_lro_drop(rxq, c);
                }
        }
}

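/*
 * Append a new segment to an existing LRO chain, updating the IP length and
 * final TCP header state.  Deliver the chain early if another segment could
 * overflow the maximum IP datagram length.
 */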
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
                struct mbuf *mbuf, struct tcphdr *th)
{
        struct tcphdr *c_th;

        /* Tack the new mbuf onto the chain. */
        KASSERT(!mbuf->m_next, ("mbuf already chained"));
        c->mbuf_tail->m_next = mbuf;
        c->mbuf_tail = mbuf;

        /* Increase length appropriately */
        c->mbuf->m_pkthdr.len += mbuf->m_len;

        /* Update the connection state flags */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        }
        c_th->th_flags |= (th->th_flags & TH_PUSH);
        c->th_last = th;
        ++st->n_merges;

        /* Pass packet up now if another segment could overflow the IP
         * length.
         */
        if (c->mbuf->m_pkthdr.len > 65536 - 9200)
                sfxge_lro_deliver(st, c);
}

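/* Begin a new LRO chain with this packet as its head. */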
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
                struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
        /* Start the chain */
        c->mbuf = mbuf;
        c->mbuf_tail = c->mbuf;
        c->nh = nh;
        c->th_last = th;

        mbuf->m_pkthdr.len = mbuf->m_len;

        /* Mangle header fields for later processing */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = nh;
                iph->ip_len = ntohs(iph->ip_len);
        } else {
                struct ip6_hdr *iph = nh;
                iph->ip6_plen = ntohs(iph->ip6_plen);
        }
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
        char *eh = c->next_eh;
        int data_length, hdr_length, dont_merge;
        unsigned th_seq, pkt_length;
        struct tcphdr *th;
        unsigned now;

        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
        } else {
                struct ip6_hdr *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
        }

        hdr_length = (char *) th + th->th_off * 4 - eh;
        data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
                       hdr_length);
        th_seq = ntohl(th->th_seq);
        dont_merge = ((data_length <= 0)
                      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

        /* Check for options other than aligned timestamp. */
        if (th->th_off != 5) {
                const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
                if (th->th_off == 8 &&
                    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
                                        (TCPOPT_NOP << 16) |
                                        (TCPOPT_TIMESTAMP << 8) |
                                        TCPOLEN_TIMESTAMP)) {
                        /* timestamp option -- okay */
                } else {
                        dont_merge = 1;
                }
        }

        if (__predict_false(th_seq != c->next_seq)) {
                /* Out-of-order, so start counting again. */
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                c->n_in_order_pkts -= lro_loss_packets;
                c->next_seq = th_seq + data_length;
                ++rxq->lro.n_misorder;
                goto deliver_buf_out;
        }
        c->next_seq = th_seq + data_length;

        now = ticks;
        if (now - c->last_pkt_ticks > lro_idle_ticks) {
                ++rxq->lro.n_drop_idle;
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                sfxge_lro_drop(rxq, c);
                return (0);
        }
        c->last_pkt_ticks = ticks;

        if (c->n_in_order_pkts < lro_slow_start_packets) {
                /* May be in slow-start, so don't merge. */
                ++rxq->lro.n_slow_start;
                ++c->n_in_order_pkts;
                goto deliver_buf_out;
        }

        if (__predict_false(dont_merge)) {
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                if (th->th_flags & (TH_FIN | TH_RST)) {
                        ++rxq->lro.n_drop_closed;
                        sfxge_lro_drop(rxq, c);
                        return (0);
                }
                goto deliver_buf_out;
        }

        rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

        if (__predict_true(c->mbuf != NULL)) {
                /* Remove headers and any padding */
                rx_buf->mbuf->m_data += hdr_length;
                rx_buf->mbuf->m_len = data_length;

                sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
        } else {
                /* Remove any padding */
                rx_buf->mbuf->m_len = pkt_length;

                sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
        }

        rx_buf->mbuf = NULL;
        return (1);

 deliver_buf_out:
        sfxge_rx_deliver(rxq->sc, rx_buf);
        return (1);
}

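/*
 * Start tracking a new connection, recycling an entry from the free list if
 * possible.  Tracking is skipped if the hash bucket is already too long.
 */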
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
                               uint16_t l2_id, void *nh, struct tcphdr *th)
{
        unsigned bucket = conn_hash & st->conns_mask;
        struct sfxge_lro_conn *c;

        if (st->conns_n[bucket] >= lro_chain_max) {
                ++st->n_too_many;
                return;
        }

        if (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
        } else {
                c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
                if (c == NULL)
                        return;
                c->mbuf = NULL;
                c->next_buf.mbuf = NULL;
        }

        /* Create the connection tracking data */
        ++st->conns_n[bucket];
        TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
        c->l2_id = l2_id;
        c->conn_hash = conn_hash;
        c->source = th->th_sport;
        c->dest = th->th_dport;
        c->n_in_order_pkts = 0;
        c->last_pkt_ticks = *(volatile int *)&ticks;
        c->delivered = 0;
        ++st->n_new_stream;
        /* NB. We don't initialise c->next_seq, and it doesn't matter what
         * value it has.  Most likely the next packet received for this
         * connection will not match -- no harm done.
         */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_buf->mbuf;
        struct ether_header *eh;
        struct sfxge_lro_conn *c;
        uint16_t l2_id;
        uint16_t l3_proto;
        void *nh;
        struct tcphdr *th;
        uint32_t conn_hash;
        unsigned bucket;

        /* Get the hardware hash */
        conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
                                            EFX_RX_HASHALG_TOEPLITZ,
                                            mtod(m, uint8_t *));

        eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
                l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
                        SFXGE_LRO_L2_ID_VLAN;
                l3_proto = veh->evl_proto;
                nh = veh + 1;
        } else {
                l2_id = 0;
                l3_proto = eh->ether_type;
                nh = eh + 1;
        }

        /* Check whether this is a suitable packet (unfragmented
         * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
         * length, and compute a hash if necessary.  If not, return.
         */
        if (l3_proto == htons(ETHERTYPE_IP)) {
                struct ip *iph = nh;

                KASSERT(iph->ip_p == IPPROTO_TCP,
                    ("IPv4 protocol is not TCP, but packet marker is set"));
                if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
                    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
                        goto deliver_now;
                th = (struct tcphdr *)(iph + 1);
        } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
                struct ip6_hdr *iph = nh;

                KASSERT(iph->ip6_nxt == IPPROTO_TCP,
                    ("IPv6 next header is not TCP, but packet marker is set"));
                l2_id |= SFXGE_LRO_L2_ID_IPV6;
                th = (struct tcphdr *)(iph + 1);
        } else {
                goto deliver_now;
        }

        bucket = conn_hash & rxq->lro.conns_mask;

        TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
                if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
                        continue;
                if ((c->source - th->th_sport) | (c->dest - th->th_dport))
                        continue;
                if (c->mbuf != NULL) {
                        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                                struct ip *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
                                    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
                                        continue;
                        } else {
                                struct ip6_hdr *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
                                    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
                                        continue;
                        }
                }

                /* Re-insert at head of list to reduce lookup time. */
                TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
                TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

                if (c->next_buf.mbuf != NULL) {
                        if (!sfxge_lro_try_merge(rxq, c))
                                goto deliver_now;
                } else {
                        LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
                            active_link);
                }
                c->next_buf = *rx_buf;
                c->next_eh = eh;
                c->next_nh = nh;

                rx_buf->mbuf = NULL;
                rx_buf->flags = EFX_DISCARD;
                return;
        }

        sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
        sfxge_rx_deliver(sc, rx_buf);
}

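/*
 * At the end of a poll, try to merge or deliver packets held on active
 * connections, and periodically purge idle connection state.
 */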
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned t;

        while (!LIST_EMPTY(&st->active_conns)) {
                c = LIST_FIRST(&st->active_conns);
                if (!c->delivered && c->mbuf != NULL)
                        sfxge_lro_deliver(st, c);
                if (sfxge_lro_try_merge(rxq, c)) {
                        if (c->mbuf != NULL)
                                sfxge_lro_deliver(st, c);
                        LIST_REMOVE(c, active_link);
                }
                c->delivered = 0;
        }

        t = *(volatile int *)&ticks;
        if (__predict_false(t != st->last_purge_ticks))
                sfxge_lro_purge_idle(rxq, t);
}

#else   /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif  /* SFXGE_LRO */

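/*
 * Process descriptors completed by the hardware: drop discards and loopback
 * packets, pass each packet either into LRO or directly up the stack, and
 * top up the queue if it has fallen below the refill threshold.
 */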
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
        struct sfxge_softc *sc = rxq->sc;
        int if_capenable = sc->ifnet->if_capenable;
        int lro_enabled = if_capenable & IFCAP_LRO;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int completed;
        unsigned int level;
        struct mbuf *m;
        struct sfxge_rx_sw_desc *prev = NULL;

        index = rxq->index;
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        completed = rxq->completed;
        while (completed != rxq->pending) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;

                id = completed++ & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                m = rx_desc->mbuf;

                if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                        goto discard;

                if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
                        goto discard;

                /* Read the length from the pseudo header if required */
                if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
                        uint16_t tmp_size;
                        int rc;
                        rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
                                                           mtod(m, uint8_t *),
                                                           &tmp_size);
                        KASSERT(rc == 0, ("cannot get packet length: %d", rc));
                        rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
                }

                prefetch_read_many(mtod(m, caddr_t));

                switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                case EFX_PKT_IPV4:
                        if (~if_capenable & IFCAP_RXCSUM)
                                rx_desc->flags &=
                                    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
                        break;
                case EFX_PKT_IPV6:
                        if (~if_capenable & IFCAP_RXCSUM_IPV6)
                                rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
                        break;
                case 0:
                        /* Check for loopback packets */
                        {
                                struct ether_header *etherhp;

                                /*LINTED*/
                                etherhp = mtod(m, struct ether_header *);

                                if (etherhp->ether_type ==
                                    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                                        EFSYS_PROBE(loopback);

                                        rxq->loopback++;
                                        goto discard;
                                }
                        }
                        break;
                default:
                        KASSERT(B_FALSE,
                            ("Rx descriptor with both IPv4 and IPv6 flags"));
                        goto discard;
                }

                /* Pass packet up the stack or into LRO (pipelined) */
                if (prev != NULL) {
                        if (lro_enabled &&
                            ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                             (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                                sfxge_lro(rxq, prev);
                        else
                                sfxge_rx_deliver(sc, prev);
                }
                prev = rx_desc;
                continue;

discard:
                /* Return the packet to the pool */
                m_free(m);
                rx_desc->mbuf = NULL;
        }
        rxq->completed = completed;

        level = rxq->added - rxq->completed;

        /* Pass last packet up the stack or into LRO */
        if (prev != NULL) {
                if (lro_enabled &&
                    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                        sfxge_lro(rxq, prev);
                else
                        sfxge_rx_deliver(sc, prev);
        }

        /*
         * If there are any pending flows and this is the end of the
         * poll then they must be completed.
         */
        if (eop)
                sfxge_lro_end_of_burst(rxq);

        /* Top up the queue if necessary */
        if (level < rxq->refill_threshold)
                sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

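/*
 * Stop an RX queue: flush it (retrying on timeout), complete any pending
 * descriptors and destroy the common code queue state.
 */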
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        unsigned int count;
        unsigned int retry = 3;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK(evq);

        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        callout_stop(&rxq->refill_callout);

        while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
                rxq->flush_state = SFXGE_FLUSH_PENDING;

                SFXGE_EVQ_UNLOCK(evq);

                /* Flush the receive queue */
                if (efx_rx_qflush(rxq->common) != 0) {
                        SFXGE_EVQ_LOCK(evq);
                        rxq->flush_state = SFXGE_FLUSH_FAILED;
                        break;
                }

                count = 0;
                do {
                        /* Spin for 100 ms */
                        DELAY(100000);

                        if (rxq->flush_state != SFXGE_FLUSH_PENDING)
                                break;

                } while (++count < 20);

                SFXGE_EVQ_LOCK(evq);

                if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
                        /* Flush timeout - neither done nor failed */
                        log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
                            device_get_nameunit(sc->dev), index);
                        rxq->flush_state = SFXGE_FLUSH_DONE;
                }
                retry--;
        }
        if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
                log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
                    device_get_nameunit(sc->dev), index);
                rxq->flush_state = SFXGE_FLUSH_DONE;
        }

        rxq->pending = rxq->added;
        sfxge_rx_qcomplete(rxq, B_TRUE);

        KASSERT(rxq->completed == rxq->pending,
            ("rxq->completed != rxq->pending"));

        rxq->added = 0;
        rxq->pushed = 0;
        rxq->pending = 0;
        rxq->completed = 0;
        rxq->loopback = 0;

        /* Destroy the common code receive queue. */
        efx_rx_qdestroy(rxq->common);

        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));

        SFXGE_EVQ_UNLOCK(evq);
}

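/*
 * Start an RX queue: program the buffer table, create and enable the common
 * code queue, then fill it with receive buffers.
 */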
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        efsys_mem_t *esmp;
        struct sfxge_evq *evq;
        int rc;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        esmp = &rxq->mem;
        evq = sc->evq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq->init_state != SFXGE_EVQ_STARTED"));

        /* Program the buffer table. */
        if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
            EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
                return (rc);

        /* Create the common code receive queue. */
        if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
            esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
            &rxq->common)) != 0)
                goto fail;

        SFXGE_EVQ_LOCK(evq);

        /* Enable the receive queue. */
        efx_rx_qenable(rxq->common);

        rxq->init_state = SFXGE_RXQ_STARTED;
        rxq->flush_state = SFXGE_FLUSH_REQUIRED;

        /* Try to fill the queue from the pool. */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

        SFXGE_EVQ_UNLOCK(evq);

        return (0);

fail:
        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));
        return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
        int index;

        efx_mac_filter_default_rxq_clear(sc->enp);

        /* Stop the receive queue(s) */
        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

        sc->rx_prefix_size = 0;
        sc->rx_buffer_size = 0;

        efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        const efx_nic_cfg_t *encp;
        size_t hdrlen, align, reserved;
        int index;
        int rc;

        intr = &sc->intr;

        /* Initialize the common code receive module. */
        if ((rc = efx_rx_init(sc->enp)) != 0)
                return (rc);

        encp = efx_nic_cfg_get(sc->enp);
        sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

        /* Calculate the receive packet buffer size. */
        sc->rx_prefix_size = encp->enc_rx_prefix_size;

        /* Ensure IP headers are 32bit aligned */
        hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
        sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

        sc->rx_buffer_size += sc->rx_buffer_align;

        /* Align end of packet buffer for RX DMA end padding */
        align = MAX(1, encp->enc_rx_buf_align_end);
        EFSYS_ASSERT(ISP2(align));
        sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

        /*
         * Standard mbuf zones only guarantee pointer-size alignment;
         * we need extra space to align to the cache line
         */
        reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

        /* Select zone for packet buffers */
        if (reserved <= MCLBYTES)
                sc->rx_cluster_size = MCLBYTES;
        else if (reserved <= MJUMPAGESIZE)
                sc->rx_cluster_size = MJUMPAGESIZE;
        else if (reserved <= MJUM9BYTES)
                sc->rx_cluster_size = MJUM9BYTES;
        else
                sc->rx_cluster_size = MJUM16BYTES;

        /*
         * Set up the scale table.  Enable all hash types and hash insertion.
         */
        for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
                sc->rx_indir_table[index] = index % sc->rxq_count;
        if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
                                       SFXGE_RX_SCALE_MAX)) != 0)
                goto fail;
        (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
            (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
            (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

        if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
                                       sizeof(toep_key))) != 0)
                goto fail;

        /* Start the receive queue(s). */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qstart(sc, index)) != 0)
                        goto fail2;
        }

        rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
                                            sc->intr.n_alloc > 1);
        if (rc != 0)
                goto fail3;

        return (0);

fail3:
fail2:
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

fail:
        efx_rx_fini(sc->enp);

        return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        unsigned i;

        st->conns_mask = lro_table_size - 1;
        KASSERT(!((st->conns_mask + 1) & st->conns_mask),
                ("lro_table_size must be a power of 2"));
        st->sc = rxq->sc;
        st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
                           M_SFXGE, M_WAITOK);
        st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
                             M_SFXGE, M_WAITOK);
        for (i = 0; i <= st->conns_mask; ++i) {
                TAILQ_INIT(&st->conns[i]);
                st->conns_n[i] = 0;
        }
        LIST_INIT(&st->active_conns);
        TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned i;

        /* Return cleanly if sfxge_lro_init() has not been called. */
        if (st->conns == NULL)
                return;

        KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

        for (i = 0; i <= st->conns_mask; ++i) {
                while (!TAILQ_EMPTY(&st->conns[i])) {
                        c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
                        sfxge_lro_drop(rxq, c);
                }
        }

        while (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
                KASSERT(!c->mbuf, ("found orphaned mbuf"));
                free(c, M_SFXGE);
        }

        free(st->conns_n, M_SFXGE);
        free(st->conns, M_SFXGE);
        st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif  /* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;

        rxq = sc->rxq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

        /* Free the context array and the flow table. */
        free(rxq->queue, M_SFXGE);
        sfxge_lro_fini(rxq);

        /* Release DMA memory. */
        sfxge_dma_free(&rxq->mem);

        sc->rxq[index] = NULL;

        free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        efsys_mem_t *esmp;
        int rc;

        KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

        rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
        rxq->sc = sc;
        rxq->index = index;
        rxq->entries = sc->rxq_entries;
        rxq->ptr_mask = rxq->entries - 1;
        rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

        sc->rxq[index] = rxq;
        esmp = &rxq->mem;

        evq = sc->evq[index];

        /* Allocate and zero DMA space. */
        if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
                return (rc);

        /* Allocate buffer table entries. */
        sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
                                 &rxq->buf_base_id);

        /* Allocate the context array and the flow table. */
        rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
            M_SFXGE, M_WAITOK | M_ZERO);
        sfxge_lro_init(rxq);

        callout_init(&rxq->refill_callout, 1);

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        return (0);
}

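/* Per-queue LRO statistics exported via sysctl (summed across queues) */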
static const struct {
        const char *name;
        size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
        { #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
        SFXGE_RX_STAT(lro_merges, lro.n_merges),
        SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
        SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
        SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
        SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
        SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
        SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
        SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
        struct sfxge_softc *sc = arg1;
        unsigned int id = arg2;
        unsigned int sum, index;

        /* Sum across all RX queues */
        sum = 0;
        for (index = 0; index < sc->rxq_count; index++)
                sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
                                         sfxge_rx_stats[id].offset);

        return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
        struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
        struct sysctl_oid_list *stat_list;
        unsigned int id;

        stat_list = SYSCTL_CHILDREN(sc->stats_node);

        for (id = 0; id < nitems(sfxge_rx_stats); id++) {
                SYSCTL_ADD_PROC(
                        ctx, stat_list,
                        OID_AUTO, sfxge_rx_stats[id].name,
                        CTLTYPE_UINT|CTLFLAG_RD,
                        sc, id, sfxge_rx_stat_handler, "IU",
                        "");
        }
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
        int index;

        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        int index;
        int rc;

#ifdef SFXGE_LRO
        if (!ISP2(lro_table_size)) {
                log(LOG_ERR, "%s=%u must be power of 2",
                    SFXGE_LRO_PARAM(table_size), lro_table_size);
                rc = EINVAL;
                goto fail_lro_table_size;
        }

        if (lro_idle_ticks == 0)
                lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

        intr = &sc->intr;

        sc->rxq_count = intr->n_alloc;

        KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
            ("intr->state != SFXGE_INTR_INITIALIZED"));

        /* Initialize the receive queue(s) - one per interrupt. */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qinit(sc, index)) != 0)
                        goto fail;
        }

        sfxge_rx_stat_init(sc);

        return (0);

fail:
        /* Tear down the receive queue(s). */
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
        return (rc);
}