1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
5  * All rights reserved.
6  *
7  * This software was developed in part by Philip Paeps under contract for
8  * Solarflare Communications, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice,
14  *    this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  *    this list of conditions and the following disclaimer in the documentation
17  *    and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are
32  * those of the authors and should not be interpreted as representing official
33  * policies, either expressed or implied, of the FreeBSD Project.
34  */
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include "opt_rss.h"
40
41 #include <sys/param.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/smp.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/limits.h>
50
51 #include <net/ethernet.h>
52 #include <net/if.h>
53 #include <net/if_vlan_var.h>
54
55 #include <netinet/in.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet/tcp.h>
59
60 #include <machine/in_cksum.h>
61
62 #ifdef RSS
63 #include <net/rss_config.h>
64 #endif
65
66 #include "common/efx.h"
67
68 #include "sfxge.h"
69 #include "sfxge_rx.h"
70
71 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
72
73 #ifdef SFXGE_LRO
74
75 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
76     "Large receive offload (LRO) parameters");
77
78 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
79
80 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
81  * means we can accelerate a larger number of streams.
82  */
83 static unsigned lro_table_size = 128;
84 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
85 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
86             &lro_table_size, 0,
87             "Size of the LRO hash table (must be a power of 2)");
88
89 /* Maximum length of a hash chain.  If chains get too long then the lookup
90  * time increases and may exceed the benefit of LRO.
91  */
92 static unsigned lro_chain_max = 20;
93 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
94 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
95             &lro_chain_max, 0,
96             "The maximum length of a hash chain");
97
98 /* Maximum time (in ticks) that a connection can be idle before its LRO
99  * state is discarded.
100  */
101 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
102 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
103 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
104             &lro_idle_ticks, 0,
105             "The maximum time (in ticks) that a connection can be idle "
106             "before its LRO state is discarded");
107
108 /* Number of packets with payload that must arrive in-order before a
109  * connection is eligible for LRO.  The idea is we should avoid coalescing
110  * segments when the sender is in slow-start because reducing the ACK rate
111  * can damage performance.
112  */
113 static int lro_slow_start_packets = 2000;
114 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
115 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
116             &lro_slow_start_packets, 0,
117             "Number of packets with payload that must arrive in-order before "
118             "a connection is eligible for LRO");
119
120 /* Number of packets with payload that must arrive in-order following loss
121  * before a connection is eligible for LRO.  The idea is we should avoid
122  * coalescing segments when the sender is recovering from loss, because
123  * reducing the ACK rate can damage performance.
124  */
125 static int lro_loss_packets = 20;
126 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
127 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
128             &lro_loss_packets, 0,
129             "Number of packets with payload that must arrive in-order "
130             "following loss before a connection is eligible for LRO");
131
132 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
133 #define SFXGE_LRO_L2_ID_VLAN 0x4000
134 #define SFXGE_LRO_L2_ID_IPV6 0x8000
135 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
136 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
137
138 /* Compare IPv6 addresses, avoiding conditional branches */
139 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
140                                    const struct in6_addr *right)
141 {
142 #if LONG_BIT == 64
143         const uint64_t *left64 = (const uint64_t *)left;
144         const uint64_t *right64 = (const uint64_t *)right;
145         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
146 #else
147         return (left->s6_addr32[0] - right->s6_addr32[0]) |
148                (left->s6_addr32[1] - right->s6_addr32[1]) |
149                (left->s6_addr32[2] - right->s6_addr32[2]) |
150                (left->s6_addr32[3] - right->s6_addr32[3]);
151 #endif
152 }
153
154 #endif  /* SFXGE_LRO */
155
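/* Flush state notifications: record the outcome of an RX queue flush.
 * sfxge_rx_qstop() polls the flush state after requesting a flush.
 */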
156 void
157 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
158 {
159
160         rxq->flush_state = SFXGE_FLUSH_DONE;
161 }
162
163 void
164 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
165 {
166
167         rxq->flush_state = SFXGE_FLUSH_FAILED;
168 }
169
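/* RSS Toeplitz hash key: copied from the kernel RSS configuration in
 * sfxge_rx_start() when the RSS option is enabled, otherwise a fixed
 * default key.
 */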
170 #ifdef RSS
171 static uint8_t toep_key[RSS_KEYSIZE];
172 #else
173 static uint8_t toep_key[] = {
174         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
175         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
176         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
177         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
178         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
179 };
180 #endif
181
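/* Callout handler: request an RX queue refill by posting a software
 * event to the queue's event queue.
 */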
182 static void
183 sfxge_rx_post_refill(void *arg)
184 {
185         struct sfxge_rxq *rxq = arg;
186         struct sfxge_softc *sc;
187         unsigned int index;
188         struct sfxge_evq *evq;
189         uint16_t magic;
190
191         sc = rxq->sc;
192         index = rxq->index;
193         evq = sc->evq[index];
194         magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
195
196         /* This is guaranteed due to the start/stop order of rx and ev */
197         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
198             ("evq not started"));
199         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
200             ("rxq not started"));
201         efx_ev_qpost(evq->common, magic);
202 }
203
204 static void
205 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
206 {
207         /* Initially retry after 100 ms, but back off in case of
208          * repeated failures as we probably have to wait for the
209          * administrator to raise the pool limit. */
210         if (retrying)
211                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
212         else
213                 rxq->refill_delay = hz / 10;
214
215         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
216                              sfxge_rx_post_refill, rxq);
217 }
218
219 #define SFXGE_REFILL_BATCH  64
220
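/* Allocate mbuf clusters and post them to the hardware RX queue in
 * batches of SFXGE_REFILL_BATCH until the requested fill level is
 * reached or allocation fails.
 */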
221 static void
222 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
223 {
224         struct sfxge_softc *sc;
225         unsigned int index;
226         struct sfxge_evq *evq;
227         unsigned int batch;
228         unsigned int rxfill;
229         unsigned int mblksize;
230         int ntodo;
231         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
232
233         sc = rxq->sc;
234         index = rxq->index;
235         evq = sc->evq[index];
236
237         prefetch_read_many(sc->enp);
238         prefetch_read_many(rxq->common);
239
240         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
241
242         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
243                 return;
244
245         rxfill = rxq->added - rxq->completed;
246         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
247             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
248         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
249         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
250             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
251
252         if (ntodo == 0)
253                 return;
254
255         batch = 0;
256         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
257         while (ntodo-- > 0) {
258                 unsigned int id;
259                 struct sfxge_rx_sw_desc *rx_desc;
260                 bus_dma_segment_t seg;
261                 struct mbuf *m;
262
263                 id = (rxq->added + batch) & rxq->ptr_mask;
264                 rx_desc = &rxq->queue[id];
265                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
266
267                 rx_desc->flags = EFX_DISCARD;
268                 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
269                     sc->rx_cluster_size);
270                 if (m == NULL)
271                         break;
272
273                 /* m_len specifies length of area to be mapped for DMA */
274                 m->m_len  = mblksize;
275                 m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
276                                                    CACHE_LINE_SIZE);
277                 m->m_data += sc->rx_buffer_align;
278
279                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
280                 addr[batch++] = seg.ds_addr;
281
282                 if (batch == SFXGE_REFILL_BATCH) {
283                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
284                             rxq->completed, rxq->added);
285                         rxq->added += batch;
286                         batch = 0;
287                 }
288         }
289
290         if (ntodo != 0)
291                 sfxge_rx_schedule_refill(rxq, retrying);
292
293         if (batch != 0) {
294                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
295                     rxq->completed, rxq->added);
296                 rxq->added += batch;
297         }
298
299         /* Make the descriptors visible to the hardware */
300         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
301                         BUS_DMASYNC_PREWRITE);
302
303         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
304
305         /* The queue could still be empty if no descriptors were actually
306          * pushed, in which case there will be no event to cause the next
307          * refill, so we must schedule a refill ourselves.
308          */
309         if (rxq->pushed == rxq->completed) {
310                 sfxge_rx_schedule_refill(rxq, retrying);
311         }
312 }
313
314 void
315 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
316 {
317
318         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
319                 return;
320
321         /* Make sure the queue is full */
322         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
323 }
324
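/* Hand a received packet to the network stack. */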
325 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
326 {
327         struct ifnet *ifp = sc->ifnet;
328
329         m->m_pkthdr.rcvif = ifp;
330         m->m_pkthdr.csum_data = 0xffff;
331         ifp->if_input(ifp, m);
332 }
333
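/* Translate hardware receive flags into mbuf checksum and RSS metadata,
 * strip the hardware prefix and deliver the packet to the stack.
 */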
334 static void
335 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
336 {
337         struct sfxge_softc *sc = rxq->sc;
338         struct mbuf *m = rx_desc->mbuf;
339         int flags = rx_desc->flags;
340         int csum_flags;
341
342         /* Convert checksum flags */
343         csum_flags = (flags & EFX_CKSUM_IPV4) ?
344                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
345         if (flags & EFX_CKSUM_TCPUDP)
346                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
347
348         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
349                 m->m_pkthdr.flowid =
350                         efx_pseudo_hdr_hash_get(rxq->common,
351                                                 EFX_RX_HASHALG_TOEPLITZ,
352                                                 mtod(m, uint8_t *));
353                 /* The hash covers a 4-tuple for TCP only */
354                 M_HASHTYPE_SET(m,
355                     (flags & EFX_PKT_IPV4) ?
356                         ((flags & EFX_PKT_TCP) ?
357                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
358                         ((flags & EFX_PKT_TCP) ?
359                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
360         }
361         m->m_data += sc->rx_prefix_size;
362         m->m_len = rx_desc->size - sc->rx_prefix_size;
363         m->m_pkthdr.len = m->m_len;
364         m->m_pkthdr.csum_flags = csum_flags;
365         __sfxge_rx_deliver(sc, rx_desc->mbuf);
366
367         rx_desc->flags = EFX_DISCARD;
368         rx_desc->mbuf = NULL;
369 }
370
371 #ifdef SFXGE_LRO
372
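/* Deliver a connection's coalesced packet, restoring the IP length field,
 * recomputing the IPv4 header checksum and copying the latest TCP window,
 * ACK and options into the head of the chain.
 */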
373 static void
374 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
375 {
376         struct sfxge_softc *sc = st->sc;
377         struct mbuf *m = c->mbuf;
378         struct tcphdr *c_th;
379         int csum_flags;
380
381         KASSERT(m, ("no mbuf to deliver"));
382
383         ++st->n_bursts;
384
385         /* Finish off packet munging and recalculate IP header checksum. */
386         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
387                 struct ip *iph = c->nh;
388                 iph->ip_len = htons(iph->ip_len);
389                 iph->ip_sum = 0;
390                 iph->ip_sum = in_cksum_hdr(iph);
391                 c_th = (struct tcphdr *)(iph + 1);
392                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
393                               CSUM_IP_CHECKED | CSUM_IP_VALID);
394         } else {
395                 struct ip6_hdr *iph = c->nh;
396                 iph->ip6_plen = htons(iph->ip6_plen);
397                 c_th = (struct tcphdr *)(iph + 1);
398                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
399         }
400
401         c_th->th_win = c->th_last->th_win;
402         c_th->th_ack = c->th_last->th_ack;
403         if (c_th->th_off == c->th_last->th_off) {
404                 /* Copy TCP options (take care to avoid going negative). */
405                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
406                 memcpy(c_th + 1, c->th_last + 1, optlen);
407         }
408
409         m->m_pkthdr.flowid = c->conn_hash;
410         M_HASHTYPE_SET(m,
411             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
412                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
413
414         m->m_pkthdr.csum_flags = csum_flags;
415         __sfxge_rx_deliver(sc, m);
416
417         c->mbuf = NULL;
418         c->delivered = 1;
419 }
420
421 /* Drop the given connection, and add it to the free list. */
422 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
423 {
424         unsigned bucket;
425
426         KASSERT(!c->mbuf, ("found orphaned mbuf"));
427
428         if (c->next_buf.mbuf != NULL) {
429                 sfxge_rx_deliver(rxq, &c->next_buf);
430                 LIST_REMOVE(c, active_link);
431         }
432
433         bucket = c->conn_hash & rxq->lro.conns_mask;
434         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
435         --rxq->lro.conns_n[bucket];
436         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
437         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
438 }
439
440 /* Stop tracking connections that have gone idle in order to keep hash
441  * chains short.
442  */
443 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
444 {
445         struct sfxge_lro_conn *c;
446         unsigned i;
447
448         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
449                 ("found active connections"));
450
451         rxq->lro.last_purge_ticks = now;
452         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
453                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
454                         continue;
455
456                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
457                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
458                         ++rxq->lro.n_drop_idle;
459                         sfxge_lro_drop(rxq, c);
460                 }
461         }
462 }
463
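/* Append a new in-order segment to the connection's coalesced packet,
 * delivering it early if a further segment could overflow the IP
 * length field.
 */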
464 static void
465 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
466                 struct mbuf *mbuf, struct tcphdr *th)
467 {
468         struct tcphdr *c_th;
469
470         /* Tack the new mbuf onto the chain. */
471         KASSERT(!mbuf->m_next, ("mbuf already chained"));
472         c->mbuf_tail->m_next = mbuf;
473         c->mbuf_tail = mbuf;
474
475         /* Increase length appropriately */
476         c->mbuf->m_pkthdr.len += mbuf->m_len;
477
478         /* Update the connection state flags */
479         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
480                 struct ip *iph = c->nh;
481                 iph->ip_len += mbuf->m_len;
482                 c_th = (struct tcphdr *)(iph + 1);
483         } else {
484                 struct ip6_hdr *iph = c->nh;
485                 iph->ip6_plen += mbuf->m_len;
486                 c_th = (struct tcphdr *)(iph + 1);
487         }
488         c_th->th_flags |= (th->th_flags & TH_PUSH);
489         c->th_last = th;
490         ++st->n_merges;
491
492         /* Pass packet up now if another segment could overflow the IP
493          * length.
494          */
495         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
496                 sfxge_lro_deliver(st, c);
497 }
498
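/* Start a new coalesced packet for this connection, converting the IP
 * length field to host byte order so it can be updated as segments are
 * merged.
 */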
499 static void
500 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
501                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
502 {
503         /* Start the chain */
504         c->mbuf = mbuf;
505         c->mbuf_tail = c->mbuf;
506         c->nh = nh;
507         c->th_last = th;
508
509         mbuf->m_pkthdr.len = mbuf->m_len;
510
511         /* Mangle header fields for later processing */
512         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
513                 struct ip *iph = nh;
514                 iph->ip_len = ntohs(iph->ip_len);
515         } else {
516                 struct ip6_hdr *iph = nh;
517                 iph->ip6_plen = ntohs(iph->ip6_plen);
518         }
519 }
520
521 /* Try to merge or otherwise hold or deliver (as appropriate) the
522  * packet buffered for this connection (c->next_buf).  Return a flag
523  * indicating whether the connection is still active for LRO purposes.
524  */
525 static int
526 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
527 {
528         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
529         char *eh = c->next_eh;
530         int data_length, hdr_length, dont_merge;
531         unsigned th_seq, pkt_length;
532         struct tcphdr *th;
533         unsigned now;
534
535         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
536                 struct ip *iph = c->next_nh;
537                 th = (struct tcphdr *)(iph + 1);
538                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
539         } else {
540                 struct ip6_hdr *iph = c->next_nh;
541                 th = (struct tcphdr *)(iph + 1);
542                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
543         }
544
545         hdr_length = (char *) th + th->th_off * 4 - eh;
546         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
547                        hdr_length);
548         th_seq = ntohl(th->th_seq);
549         dont_merge = ((data_length <= 0)
550                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
551
552         /* Check for options other than aligned timestamp. */
553         if (th->th_off != 5) {
554                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
555                 if (th->th_off == 8 &&
556                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
557                                         (TCPOPT_NOP << 16) |
558                                         (TCPOPT_TIMESTAMP << 8) |
559                                         TCPOLEN_TIMESTAMP)) {
560                         /* timestamp option -- okay */
561                 } else {
562                         dont_merge = 1;
563                 }
564         }
565
566         if (__predict_false(th_seq != c->next_seq)) {
567                 /* Out-of-order, so start counting again. */
568                 if (c->mbuf != NULL)
569                         sfxge_lro_deliver(&rxq->lro, c);
570                 c->n_in_order_pkts -= lro_loss_packets;
571                 c->next_seq = th_seq + data_length;
572                 ++rxq->lro.n_misorder;
573                 goto deliver_buf_out;
574         }
575         c->next_seq = th_seq + data_length;
576
577         now = ticks;
578         if (now - c->last_pkt_ticks > lro_idle_ticks) {
579                 ++rxq->lro.n_drop_idle;
580                 if (c->mbuf != NULL)
581                         sfxge_lro_deliver(&rxq->lro, c);
582                 sfxge_lro_drop(rxq, c);
583                 return (0);
584         }
585         c->last_pkt_ticks = ticks;
586
587         if (c->n_in_order_pkts < lro_slow_start_packets) {
588                 /* May be in slow-start, so don't merge. */
589                 ++rxq->lro.n_slow_start;
590                 ++c->n_in_order_pkts;
591                 goto deliver_buf_out;
592         }
593
594         if (__predict_false(dont_merge)) {
595                 if (c->mbuf != NULL)
596                         sfxge_lro_deliver(&rxq->lro, c);
597                 if (th->th_flags & (TH_FIN | TH_RST)) {
598                         ++rxq->lro.n_drop_closed;
599                         sfxge_lro_drop(rxq, c);
600                         return (0);
601                 }
602                 goto deliver_buf_out;
603         }
604
605         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
606
607         if (__predict_true(c->mbuf != NULL)) {
608                 /* Remove headers and any padding */
609                 rx_buf->mbuf->m_data += hdr_length;
610                 rx_buf->mbuf->m_len = data_length;
611
612                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
613         } else {
614                 /* Remove any padding */
615                 rx_buf->mbuf->m_len = pkt_length;
616
617                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
618         }
619
620         rx_buf->mbuf = NULL;
621         return (1);
622
623  deliver_buf_out:
624         sfxge_rx_deliver(rxq, rx_buf);
625         return (1);
626 }
627
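/* Begin tracking a new connection, recycling an entry from the free list
 * where possible, so that subsequent in-order segments can be coalesced.
 */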
628 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
629                                uint16_t l2_id, void *nh, struct tcphdr *th)
630 {
631         unsigned bucket = conn_hash & st->conns_mask;
632         struct sfxge_lro_conn *c;
633
634         if (st->conns_n[bucket] >= lro_chain_max) {
635                 ++st->n_too_many;
636                 return;
637         }
638
639         if (!TAILQ_EMPTY(&st->free_conns)) {
640                 c = TAILQ_FIRST(&st->free_conns);
641                 TAILQ_REMOVE(&st->free_conns, c, link);
642         } else {
643                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
644                 if (c == NULL)
645                         return;
646                 c->mbuf = NULL;
647                 c->next_buf.mbuf = NULL;
648         }
649
650         /* Create the connection tracking data */
651         ++st->conns_n[bucket];
652         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
653         c->l2_id = l2_id;
654         c->conn_hash = conn_hash;
655         c->source = th->th_sport;
656         c->dest = th->th_dport;
657         c->n_in_order_pkts = 0;
658         c->last_pkt_ticks = *(volatile int *)&ticks;
659         c->delivered = 0;
660         ++st->n_new_stream;
661         /* NB. We don't initialise c->next_seq, and it doesn't matter what
662          * value it has.  Most likely the next packet received for this
663          * connection will not match -- no harm done.
664          */
665 }
666
667 /* Process mbuf and decide whether to dispatch it to the stack now or
668  * later.
669  */
670 static void
671 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
672 {
673         struct sfxge_softc *sc = rxq->sc;
674         struct mbuf *m = rx_buf->mbuf;
675         struct ether_header *eh;
676         struct sfxge_lro_conn *c;
677         uint16_t l2_id;
678         uint16_t l3_proto;
679         void *nh;
680         struct tcphdr *th;
681         uint32_t conn_hash;
682         unsigned bucket;
683
684         /* Get the hardware hash */
685         conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
686                                             EFX_RX_HASHALG_TOEPLITZ,
687                                             mtod(m, uint8_t *));
688
689         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
690         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
691                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
692                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
693                         SFXGE_LRO_L2_ID_VLAN;
694                 l3_proto = veh->evl_proto;
695                 nh = veh + 1;
696         } else {
697                 l2_id = 0;
698                 l3_proto = eh->ether_type;
699                 nh = eh + 1;
700         }
701
702         /* Check whether this is a suitable packet (unfragmented
703          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
704          * length, and compute a hash if necessary.  If not, return.
705          */
706         if (l3_proto == htons(ETHERTYPE_IP)) {
707                 struct ip *iph = nh;
708
709                 KASSERT(iph->ip_p == IPPROTO_TCP,
710                     ("IPv4 protocol is not TCP, but packet marker is set"));
711                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
712                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
713                         goto deliver_now;
714                 th = (struct tcphdr *)(iph + 1);
715         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
716                 struct ip6_hdr *iph = nh;
717
718                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
719                     ("IPv6 next header is not TCP, but packet marker is set"));
720                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
721                 th = (struct tcphdr *)(iph + 1);
722         } else {
723                 goto deliver_now;
724         }
725
726         bucket = conn_hash & rxq->lro.conns_mask;
727
728         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
729                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
730                         continue;
731                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
732                         continue;
733                 if (c->mbuf != NULL) {
734                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
735                                 struct ip *c_iph, *iph = nh;
736                                 c_iph = c->nh;
737                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
738                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
739                                         continue;
740                         } else {
741                                 struct ip6_hdr *c_iph, *iph = nh;
742                                 c_iph = c->nh;
743                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
744                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
745                                         continue;
746                         }
747                 }
748
749                 /* Re-insert at head of list to reduce lookup time. */
750                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
751                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
752
753                 if (c->next_buf.mbuf != NULL) {
754                         if (!sfxge_lro_try_merge(rxq, c))
755                                 goto deliver_now;
756                 } else {
757                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
758                             active_link);
759                 }
760                 c->next_buf = *rx_buf;
761                 c->next_eh = eh;
762                 c->next_nh = nh;
763
764                 rx_buf->mbuf = NULL;
765                 rx_buf->flags = EFX_DISCARD;
766                 return;
767         }
768
769         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
770  deliver_now:
771         sfxge_rx_deliver(rxq, rx_buf);
772 }
773
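/* At the end of an event queue poll, process the packet buffered for each
 * active connection, deliver anything still held, and periodically purge
 * idle connection state.
 */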
774 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
775 {
776         struct sfxge_lro_state *st = &rxq->lro;
777         struct sfxge_lro_conn *c;
778         unsigned t;
779
780         while (!LIST_EMPTY(&st->active_conns)) {
781                 c = LIST_FIRST(&st->active_conns);
782                 if (!c->delivered && c->mbuf != NULL)
783                         sfxge_lro_deliver(st, c);
784                 if (sfxge_lro_try_merge(rxq, c)) {
785                         if (c->mbuf != NULL)
786                                 sfxge_lro_deliver(st, c);
787                         LIST_REMOVE(c, active_link);
788                 }
789                 c->delivered = 0;
790         }
791
792         t = *(volatile int *)&ticks;
793         if (__predict_false(t != st->last_purge_ticks))
794                 sfxge_lro_purge_idle(rxq, t);
795 }
796
797 #else   /* !SFXGE_LRO */
798
799 static void
800 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
801 {
802 }
803
804 static void
805 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
806 {
807 }
808
809 #endif  /* SFXGE_LRO */
810
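/* Process packets completed by the hardware: discard bad or mismatched
 * packets, honour the interface checksum capabilities, pass packets to
 * LRO or directly to the stack, and top up the queue if it has drained
 * below the refill threshold.
 */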
811 void
812 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
813 {
814         struct sfxge_softc *sc = rxq->sc;
815         int if_capenable = sc->ifnet->if_capenable;
816         int lro_enabled = if_capenable & IFCAP_LRO;
817         unsigned int index;
818         struct sfxge_evq *evq;
819         unsigned int completed;
820         unsigned int level;
821         struct mbuf *m;
822         struct sfxge_rx_sw_desc *prev = NULL;
823
824         index = rxq->index;
825         evq = sc->evq[index];
826
827         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
828
829         completed = rxq->completed;
830         while (completed != rxq->pending) {
831                 unsigned int id;
832                 struct sfxge_rx_sw_desc *rx_desc;
833
834                 id = completed++ & rxq->ptr_mask;
835                 rx_desc = &rxq->queue[id];
836                 m = rx_desc->mbuf;
837
838                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
839                         goto discard;
840
841                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
842                         goto discard;
843
844                 /* Read the length from the pseudo header if required */
845                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
846                         uint16_t tmp_size;
847                         int rc;
848                         rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
849                                                            mtod(m, uint8_t *),
850                                                            &tmp_size);
851                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
852                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
853                 }
854
855                 prefetch_read_many(mtod(m, caddr_t));
856
857                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
858                 case EFX_PKT_IPV4:
859                         if (~if_capenable & IFCAP_RXCSUM)
860                                 rx_desc->flags &=
861                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
862                         break;
863                 case EFX_PKT_IPV6:
864                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
865                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
866                         break;
867                 case 0:
868                         /* Check for loopback packets */
869                         {
870                                 struct ether_header *etherhp;
871
872                                 /*LINTED*/
873                                 etherhp = mtod(m, struct ether_header *);
874
875                                 if (etherhp->ether_type ==
876                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
877                                         EFSYS_PROBE(loopback);
878
879                                         rxq->loopback++;
880                                         goto discard;
881                                 }
882                         }
883                         break;
884                 default:
885                         KASSERT(B_FALSE,
886                             ("Rx descriptor with both IPv4 and IPv6 flags"));
887                         goto discard;
888                 }
889
890                 /* Pass packet up the stack or into LRO (pipelined) */
891                 if (prev != NULL) {
892                         if (lro_enabled &&
893                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
894                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
895                                 sfxge_lro(rxq, prev);
896                         else
897                                 sfxge_rx_deliver(rxq, prev);
898                 }
899                 prev = rx_desc;
900                 continue;
901
902 discard:
903                 /* Return the packet to the pool */
904                 m_free(m);
905                 rx_desc->mbuf = NULL;
906         }
907         rxq->completed = completed;
908
909         level = rxq->added - rxq->completed;
910
911         /* Pass last packet up the stack or into LRO */
912         if (prev != NULL) {
913                 if (lro_enabled &&
914                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
915                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
916                         sfxge_lro(rxq, prev);
917                 else
918                         sfxge_rx_deliver(rxq, prev);
919         }
920
921         /*
922          * If there are any pending flows and this is the end of the
923          * poll then they must be completed.
924          */
925         if (eop)
926                 sfxge_lro_end_of_burst(rxq);
927
928         /* Top up the queue if necessary */
929         if (level < rxq->refill_threshold)
930                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
931 }
932
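/* Stop a started RX queue: flush it (with a limited number of retries),
 * drain outstanding completions and tear down the common code queue and
 * buffer table entries.
 */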
933 static void
934 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
935 {
936         struct sfxge_rxq *rxq;
937         struct sfxge_evq *evq;
938         unsigned int count;
939         unsigned int retry = 3;
940
941         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
942
943         rxq = sc->rxq[index];
944         evq = sc->evq[index];
945
946         SFXGE_EVQ_LOCK(evq);
947
948         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
949             ("rxq not started"));
950
951         rxq->init_state = SFXGE_RXQ_INITIALIZED;
952
953         callout_stop(&rxq->refill_callout);
954
955         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
956                 rxq->flush_state = SFXGE_FLUSH_PENDING;
957
958                 SFXGE_EVQ_UNLOCK(evq);
959
960                 /* Flush the receive queue */
961                 if (efx_rx_qflush(rxq->common) != 0) {
962                         SFXGE_EVQ_LOCK(evq);
963                         rxq->flush_state = SFXGE_FLUSH_FAILED;
964                         break;
965                 }
966
967                 count = 0;
968                 do {
969                         /* Spin for 100 ms */
970                         DELAY(100000);
971
972                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
973                                 break;
974
975                 } while (++count < 20);
976
977                 SFXGE_EVQ_LOCK(evq);
978
979                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
980                         /* Flush timeout - neither done nor failed */
981                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
982                             device_get_nameunit(sc->dev), index);
983                         rxq->flush_state = SFXGE_FLUSH_DONE;
984                 }
985                 retry--;
986         }
987         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
988                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
989                     device_get_nameunit(sc->dev), index);
990                 rxq->flush_state = SFXGE_FLUSH_DONE;
991         }
992
993         rxq->pending = rxq->added;
994         sfxge_rx_qcomplete(rxq, B_TRUE);
995
996         KASSERT(rxq->completed == rxq->pending,
997             ("rxq->completed != rxq->pending"));
998
999         rxq->added = 0;
1000         rxq->pushed = 0;
1001         rxq->pending = 0;
1002         rxq->completed = 0;
1003         rxq->loopback = 0;
1004
1005         /* Destroy the common code receive queue. */
1006         efx_rx_qdestroy(rxq->common);
1007
1008         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1009             EFX_RXQ_NBUFS(sc->rxq_entries));
1010
1011         SFXGE_EVQ_UNLOCK(evq);
1012 }
1013
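/* Start an initialized RX queue: program the buffer table, create and
 * enable the common code queue and fill it with receive buffers.
 */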
1014 static int
1015 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1016 {
1017         struct sfxge_rxq *rxq;
1018         efsys_mem_t *esmp;
1019         struct sfxge_evq *evq;
1020         int rc;
1021
1022         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1023
1024         rxq = sc->rxq[index];
1025         esmp = &rxq->mem;
1026         evq = sc->evq[index];
1027
1028         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1029             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1030         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1031             ("evq->init_state != SFXGE_EVQ_STARTED"));
1032
1033         /* Program the buffer table. */
1034         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1035             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1036                 return (rc);
1037
1038         /* Create the common code receive queue. */
1039         if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1040             esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
1041             evq->common, &rxq->common)) != 0)
1042                 goto fail;
1043
1044         SFXGE_EVQ_LOCK(evq);
1045
1046         /* Enable the receive queue. */
1047         efx_rx_qenable(rxq->common);
1048
1049         rxq->init_state = SFXGE_RXQ_STARTED;
1050         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1051
1052         /* Try to fill the queue from the pool. */
1053         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1054
1055         SFXGE_EVQ_UNLOCK(evq);
1056
1057         return (0);
1058
1059 fail:
1060         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1061             EFX_RXQ_NBUFS(sc->rxq_entries));
1062         return (rc);
1063 }
1064
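/* Stop all RX queues, clear the default RX queue filter and finalize
 * common code receive support.
 */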
1065 void
1066 sfxge_rx_stop(struct sfxge_softc *sc)
1067 {
1068         int index;
1069
1070         efx_mac_filter_default_rxq_clear(sc->enp);
1071
1072         /* Stop the receive queue(s) */
1073         index = sc->rxq_count;
1074         while (--index >= 0)
1075                 sfxge_rx_qstop(sc, index);
1076
1077         sc->rx_prefix_size = 0;
1078         sc->rx_buffer_size = 0;
1079
1080         efx_rx_fini(sc->enp);
1081 }
1082
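/* Initialize common code receive support, calculate the receive buffer
 * size and mbuf cluster zone, configure RSS scaling and start every
 * RX queue.
 */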
1083 int
1084 sfxge_rx_start(struct sfxge_softc *sc)
1085 {
1086         struct sfxge_intr *intr;
1087         const efx_nic_cfg_t *encp;
1088         size_t hdrlen, align, reserved;
1089         int index;
1090         int rc;
1091
1092         intr = &sc->intr;
1093
1094         /* Initialize the common code receive module. */
1095         if ((rc = efx_rx_init(sc->enp)) != 0)
1096                 return (rc);
1097
1098         encp = efx_nic_cfg_get(sc->enp);
1099         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1100
1101         /* Calculate the receive packet buffer size. */
1102         sc->rx_prefix_size = encp->enc_rx_prefix_size;
1103
1104         /* Ensure IP headers are 32bit aligned */
1105         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1106         sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1107
1108         sc->rx_buffer_size += sc->rx_buffer_align;
1109
1110         /* Align end of packet buffer for RX DMA end padding */
1111         align = MAX(1, encp->enc_rx_buf_align_end);
1112         EFSYS_ASSERT(ISP2(align));
1113         sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1114
1115         /*
1116          * Standard mbuf zones only guarantee pointer-size alignment;
1117          * we need extra space to align to the cache line
1118          */
1119         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1120
1121         /* Select zone for packet buffers */
1122         if (reserved <= MCLBYTES)
1123                 sc->rx_cluster_size = MCLBYTES;
1124         else if (reserved <= MJUMPAGESIZE)
1125                 sc->rx_cluster_size = MJUMPAGESIZE;
1126         else if (reserved <= MJUM9BYTES)
1127                 sc->rx_cluster_size = MJUM9BYTES;
1128         else
1129                 sc->rx_cluster_size = MJUM16BYTES;
1130
1131         /*
1132          * Set up the scale table.  Enable all hash types and hash insertion.
1133          */
1134         for (index = 0; index < nitems(sc->rx_indir_table); index++)
1135 #ifdef RSS
1136                 sc->rx_indir_table[index] =
1137                         rss_get_indirection_to_bucket(index) % sc->rxq_count;
1138 #else
1139                 sc->rx_indir_table[index] = index % sc->rxq_count;
1140 #endif
1141         if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1142                                        sc->rx_indir_table,
1143                                        nitems(sc->rx_indir_table))) != 0)
1144                 goto fail;
1145         (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1146             EFX_RX_HASHALG_TOEPLITZ,
1147             EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1148             EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1149
1150 #ifdef RSS
1151         rss_getkey(toep_key);
1152 #endif
1153         if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1154                                        toep_key,
1155                                        sizeof(toep_key))) != 0)
1156                 goto fail;
1157
1158         /* Start the receive queue(s). */
1159         for (index = 0; index < sc->rxq_count; index++) {
1160                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1161                         goto fail2;
1162         }
1163
1164         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1165                                             sc->intr.n_alloc > 1);
1166         if (rc != 0)
1167                 goto fail3;
1168
1169         return (0);
1170
1171 fail3:
1172 fail2:
1173         while (--index >= 0)
1174                 sfxge_rx_qstop(sc, index);
1175
1176 fail:
1177         efx_rx_fini(sc->enp);
1178
1179         return (rc);
1180 }
1181
1182 #ifdef SFXGE_LRO
1183
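/* Allocate and initialize the per-queue LRO connection hash table. */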
1184 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1185 {
1186         struct sfxge_lro_state *st = &rxq->lro;
1187         unsigned i;
1188
1189         st->conns_mask = lro_table_size - 1;
1190         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1191                 ("lro_table_size must be a power of 2"));
1192         st->sc = rxq->sc;
1193         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1194                            M_SFXGE, M_WAITOK);
1195         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1196                              M_SFXGE, M_WAITOK);
1197         for (i = 0; i <= st->conns_mask; ++i) {
1198                 TAILQ_INIT(&st->conns[i]);
1199                 st->conns_n[i] = 0;
1200         }
1201         LIST_INIT(&st->active_conns);
1202         TAILQ_INIT(&st->free_conns);
1203 }
1204
1205 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1206 {
1207         struct sfxge_lro_state *st = &rxq->lro;
1208         struct sfxge_lro_conn *c;
1209         unsigned i;
1210
1211         /* Return cleanly if sfxge_lro_init() has not been called. */
1212         if (st->conns == NULL)
1213                 return;
1214
1215         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1216
1217         for (i = 0; i <= st->conns_mask; ++i) {
1218                 while (!TAILQ_EMPTY(&st->conns[i])) {
1219                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1220                         sfxge_lro_drop(rxq, c);
1221                 }
1222         }
1223
1224         while (!TAILQ_EMPTY(&st->free_conns)) {
1225                 c = TAILQ_FIRST(&st->free_conns);
1226                 TAILQ_REMOVE(&st->free_conns, c, link);
1227                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1228                 free(c, M_SFXGE);
1229         }
1230
1231         free(st->conns_n, M_SFXGE);
1232         free(st->conns, M_SFXGE);
1233         st->conns = NULL;
1234 }
1235
1236 #else
1237
1238 static void
1239 sfxge_lro_init(struct sfxge_rxq *rxq)
1240 {
1241 }
1242
1243 static void
1244 sfxge_lro_fini(struct sfxge_rxq *rxq)
1245 {
1246 }
1247
1248 #endif  /* SFXGE_LRO */
1249
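/* Free the software state for an RX queue: descriptor array, LRO state
 * and DMA memory.
 */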
1250 static void
1251 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1252 {
1253         struct sfxge_rxq *rxq;
1254
1255         rxq = sc->rxq[index];
1256
1257         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1258             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1259
1260         /* Free the context array and the flow table. */
1261         free(rxq->queue, M_SFXGE);
1262         sfxge_lro_fini(rxq);
1263
1264         /* Release DMA memory. */
1265         sfxge_dma_free(&rxq->mem);
1266
1267         sc->rxq[index] = NULL;
1268
1269         free(rxq, M_SFXGE);
1270 }
1271
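/* Allocate and initialize the software state, DMA memory and buffer
 * table entries for one RX queue.
 */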
1272 static int
1273 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1274 {
1275         struct sfxge_rxq *rxq;
1276         struct sfxge_evq *evq;
1277         efsys_mem_t *esmp;
1278         int rc;
1279
1280         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1281
1282         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1283         rxq->sc = sc;
1284         rxq->index = index;
1285         rxq->entries = sc->rxq_entries;
1286         rxq->ptr_mask = rxq->entries - 1;
1287         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1288
1289         sc->rxq[index] = rxq;
1290         esmp = &rxq->mem;
1291
1292         evq = sc->evq[index];
1293
1294         /* Allocate and zero DMA space. */
1295         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1296                 return (rc);
1297
1298         /* Allocate buffer table entries. */
1299         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1300                                  &rxq->buf_base_id);
1301
1302         /* Allocate the context array and the flow table. */
1303         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1304             M_SFXGE, M_WAITOK | M_ZERO);
1305         sfxge_lro_init(rxq);
1306
1307         callout_init(&rxq->refill_callout, 1);
1308
1309         rxq->init_state = SFXGE_RXQ_INITIALIZED;
1310
1311         return (0);
1312 }
1313
1314 static const struct {
1315         const char *name;
1316         size_t offset;
1317 } sfxge_rx_stats[] = {
1318 #define SFXGE_RX_STAT(name, member) \
1319         { #name, offsetof(struct sfxge_rxq, member) }
1320 #ifdef SFXGE_LRO
1321         SFXGE_RX_STAT(lro_merges, lro.n_merges),
1322         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1323         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1324         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1325         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1326         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1327         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1328         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1329 #endif
1330 };
1331
1332 static int
1333 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1334 {
1335         struct sfxge_softc *sc = arg1;
1336         unsigned int id = arg2;
1337         unsigned int sum, index;
1338
1339         /* Sum across all RX queues */
1340         sum = 0;
1341         for (index = 0; index < sc->rxq_count; index++)
1342                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1343                                          sfxge_rx_stats[id].offset);
1344
1345         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1346 }
1347
1348 static void
1349 sfxge_rx_stat_init(struct sfxge_softc *sc)
1350 {
1351         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1352         struct sysctl_oid_list *stat_list;
1353         unsigned int id;
1354
1355         stat_list = SYSCTL_CHILDREN(sc->stats_node);
1356
1357         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1358                 SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1359                     sfxge_rx_stats[id].name,
1360                     CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1361                     sc, id, sfxge_rx_stat_handler, "IU", "");
1362         }
1363 }
1364
1365 void
1366 sfxge_rx_fini(struct sfxge_softc *sc)
1367 {
1368         int index;
1369
1370         index = sc->rxq_count;
1371         while (--index >= 0)
1372                 sfxge_rx_qfini(sc, index);
1373
1374         sc->rxq_count = 0;
1375 }
1376
1377 int
1378 sfxge_rx_init(struct sfxge_softc *sc)
1379 {
1380         struct sfxge_intr *intr;
1381         int index;
1382         int rc;
1383
1384 #ifdef SFXGE_LRO
1385         if (!ISP2(lro_table_size)) {
1386                 log(LOG_ERR, "%s=%u must be power of 2",
1387                     SFXGE_LRO_PARAM(table_size), lro_table_size);
1388                 rc = EINVAL;
1389                 goto fail_lro_table_size;
1390         }
1391
1392         if (lro_idle_ticks == 0)
1393                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1394 #endif
1395
1396         intr = &sc->intr;
1397
1398         sc->rxq_count = intr->n_alloc;
1399
1400         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1401             ("intr->state != SFXGE_INTR_INITIALIZED"));
1402
1403         /* Initialize the receive queue(s) - one per interrupt. */
1404         for (index = 0; index < sc->rxq_count; index++) {
1405                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1406                         goto fail;
1407         }
1408
1409         sfxge_rx_stat_init(sc);
1410
1411         return (0);
1412
1413 fail:
1414         /* Tear down the receive queue(s). */
1415         while (--index >= 0)
1416                 sfxge_rx_qfini(sc, index);
1417
1418         sc->rxq_count = 0;
1419
1420 #ifdef SFXGE_LRO
1421 fail_lro_table_size:
1422 #endif
1423         return (rc);
1424 }