/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"
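/* Refill the queue whenever its fill level drops below 90% of the limit. */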
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");
/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");
/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN	0x4000
#define	SFXGE_LRO_L2_ID_IPV6	0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
                                   const struct in6_addr *right)
{
#if LONG_BIT == 64
        const uint64_t *left64 = (const uint64_t *)left;
        const uint64_t *right64 = (const uint64_t *)right;

        return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
        return (left->s6_addr32[0] - right->s6_addr32[0]) |
               (left->s6_addr32[1] - right->s6_addr32[1]) |
               (left->s6_addr32[2] - right->s6_addr32[2]) |
               (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */
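/*
 * RX queue flush acknowledgements, called from event queue processing
 * when the hardware reports that a queue flush has completed or failed.
 */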
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_FAILED;
}
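/*
 * Toeplitz hash key for receive-side scaling.  With the RSS option the
 * kernel's key is fetched in sfxge_rx_start(); otherwise a fixed
 * default key is used.
 */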
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
        0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
        0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
        0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
        0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
        0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
static void
sfxge_rx_post_refill(void *arg)
{
        struct sfxge_rxq *rxq = arg;
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        uint16_t magic;

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];
        magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

        /* This is guaranteed due to the start/stop order of rx and ev */
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq not started"));
        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));
        efx_ev_qpost(evq->common, magic);
}
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
        /* Initially retry after 100 ms, but back off in case of
         * repeated failures as we probably have to wait for the
         * administrator to raise the pool limit. */
        if (retrying)
                rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
        else
                rxq->refill_delay = hz / 10;

        callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
            sfxge_rx_post_refill, rxq);
}
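/* Number of mbuf DMA addresses posted to the hardware per efx_rx_qpost() */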
#define	SFXGE_REFILL_BATCH	64
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];

        prefetch_read_many(sc->enp);
        prefetch_read_many(rxq->common);

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        rxfill = rxq->added - rxq->completed;
        KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
            ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
        ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
        KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
            ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

        if (ntodo == 0)
                return;

        batch = 0;
        mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
        while (ntodo-- > 0) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;
                bus_dma_segment_t seg;
                struct mbuf *m;

                id = (rxq->added + batch) & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

                rx_desc->flags = EFX_DISCARD;
                m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
                    sc->rx_cluster_size);
                if (m == NULL)
                        break;

                /* m_len specifies length of area to be mapped for DMA */
                m->m_len = mblksize;
                m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
                    CACHE_LINE_SIZE);
                m->m_data += sc->rx_buffer_align;

                sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
                addr[batch++] = seg.ds_addr;

                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(rxq->common, addr, mblksize, batch,
                            rxq->completed, rxq->added);
                        rxq->added += batch;
                        batch = 0;
                }
        }

        if (ntodo != 0)
                sfxge_rx_schedule_refill(rxq, retrying);

        if (batch != 0) {
                efx_rx_qpost(rxq->common, addr, mblksize, batch,
                    rxq->completed, rxq->added);
                rxq->added += batch;
        }

        /* Make the descriptors visible to the hardware */
        bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
            BUS_DMASYNC_PREWRITE);

        efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

        /* The queue could still be empty if no descriptors were actually
         * pushed, in which case there will be no event to cause the next
         * refill, so we must schedule a refill ourselves.
         */
        if (rxq->pushed == rxq->completed) {
                sfxge_rx_schedule_refill(rxq, retrying);
        }
}
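/* Refill the queue to its limit; invoked via the refill software event. */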
static void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        /* Make sure the queue is full */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
        struct ifnet *ifp = sc->ifnet;

        m->m_pkthdr.rcvif = ifp;
        m->m_pkthdr.csum_data = 0xffff;
        ifp->if_input(ifp, m);
}
static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_desc->mbuf;
        int flags = rx_desc->flags;
        int csum_flags;

        /* Convert checksum flags */
        csum_flags = (flags & EFX_CKSUM_IPV4) ?
            (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
        if (flags & EFX_CKSUM_TCPUDP)
                csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

        if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                m->m_pkthdr.flowid =
                    efx_pseudo_hdr_hash_get(rxq->common,
                        EFX_RX_HASHALG_TOEPLITZ,
                        mtod(m, uint8_t *));
                /* The hash covers a 4-tuple for TCP only */
                M_HASHTYPE_SET(m,
                    (flags & EFX_PKT_IPV4) ?
                    ((flags & EFX_PKT_TCP) ?
                    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
                    ((flags & EFX_PKT_TCP) ?
                    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
        }
        m->m_data += sc->rx_prefix_size;
        m->m_len = rx_desc->size - sc->rx_prefix_size;
        m->m_pkthdr.len = m->m_len;
        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, rx_desc->mbuf);

        rx_desc->flags = EFX_DISCARD;
        rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO
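/*
 * Deliver a coalesced LRO packet: rewrite the synthesized IP and TCP
 * headers, set the checksum and RSS hash metadata, and pass the mbuf
 * chain to the interface input routine.
 */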
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
        struct sfxge_softc *sc = st->sc;
        struct mbuf *m = c->mbuf;
        struct tcphdr *c_th;
        int csum_flags;

        KASSERT(m, ("no mbuf to deliver"));

        ++st->n_bursts;

        /* Finish off packet munging and recalculate IP header checksum. */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len = htons(iph->ip_len);
                iph->ip_sum = 0;
                iph->ip_sum = in_cksum_hdr(iph);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                    CSUM_IP_CHECKED | CSUM_IP_VALID);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen = htons(iph->ip6_plen);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        }

        c_th->th_win = c->th_last->th_win;
        c_th->th_ack = c->th_last->th_ack;
        if (c_th->th_off == c->th_last->th_off) {
                /* Copy TCP options (take care to avoid going negative). */
                int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
                memcpy(c_th + 1, c->th_last + 1, optlen);
        }

        m->m_pkthdr.flowid = c->conn_hash;
        M_HASHTYPE_SET(m,
            SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
            M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, m);

        c->mbuf = NULL;
        c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        unsigned bucket;

        KASSERT(!c->mbuf, ("found orphaned mbuf"));

        if (c->next_buf.mbuf != NULL) {
                sfxge_rx_deliver(rxq, &c->next_buf);
                LIST_REMOVE(c, active_link);
        }

        bucket = c->conn_hash & rxq->lro.conns_mask;
        KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
        --rxq->lro.conns_n[bucket];
        TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
        TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
        struct sfxge_lro_conn *c;
        unsigned i;

        KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
            ("found active connections"));

        rxq->lro.last_purge_ticks = now;
        for (i = 0; i <= rxq->lro.conns_mask; ++i) {
                if (TAILQ_EMPTY(&rxq->lro.conns[i]))
                        continue;

                c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
                if (now - c->last_pkt_ticks > lro_idle_ticks) {
                        ++rxq->lro.n_drop_idle;
                        sfxge_lro_drop(rxq, c);
                }
        }
}
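/*
 * Merge a new in-order segment into an existing LRO chain, updating
 * the coalesced IP and TCP headers to cover the added payload.
 */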
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
{
        struct tcphdr *c_th;

        /* Tack the new mbuf onto the chain. */
        KASSERT(!mbuf->m_next, ("mbuf already chained"));
        c->mbuf_tail->m_next = mbuf;
        c->mbuf_tail = mbuf;

        /* Increase length appropriately */
        c->mbuf->m_pkthdr.len += mbuf->m_len;

        /* Update the connection state flags */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        }
        c_th->th_flags |= (th->th_flags & TH_PUSH);
        c->th_last = th;

        ++st->n_merges;

        /* Pass packet up now if another segment could overflow the IP
         * length.
         */
        if (c->mbuf->m_pkthdr.len > 65536 - 9200)
                sfxge_lro_deliver(st, c);
}
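/* Begin a new coalesced chain with this packet as its head. */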
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
        /* Start the chain */
        c->mbuf = mbuf;
        c->mbuf_tail = c->mbuf;
        c->nh = nh;
        c->th_last = th;

        mbuf->m_pkthdr.len = mbuf->m_len;

        /* Mangle header fields for later processing */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = nh;
                iph->ip_len = ntohs(iph->ip_len);
        } else {
                struct ip6_hdr *iph = nh;
                iph->ip6_plen = ntohs(iph->ip6_plen);
        }
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
        char *eh = c->next_eh;
        int data_length, hdr_length, dont_merge;
        unsigned th_seq, pkt_length;
        struct tcphdr *th;
        unsigned now;

        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
        } else {
                struct ip6_hdr *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
        }

        hdr_length = (char *) th + th->th_off * 4 - eh;
        data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
            hdr_length);
        th_seq = ntohl(th->th_seq);
        dont_merge = ((data_length <= 0)
            | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

        /* Check for options other than aligned timestamp. */
        if (th->th_off != 5) {
                const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
                if (th->th_off == 8 &&
                    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
                        (TCPOPT_NOP << 16) |
                        (TCPOPT_TIMESTAMP << 8) |
                        TCPOLEN_TIMESTAMP)) {
                        /* timestamp option -- okay */
                } else {
                        dont_merge = 1;
                }
        }

        if (__predict_false(th_seq != c->next_seq)) {
                /* Out-of-order, so start counting again. */
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                c->n_in_order_pkts -= lro_loss_packets;
                c->next_seq = th_seq + data_length;
                ++rxq->lro.n_misorder;
                goto deliver_buf_out;
        }
        c->next_seq = th_seq + data_length;

        now = ticks;
        if (now - c->last_pkt_ticks > lro_idle_ticks) {
                ++rxq->lro.n_drop_idle;
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                sfxge_lro_drop(rxq, c);
                return (0);
        }
        c->last_pkt_ticks = ticks;

        if (c->n_in_order_pkts < lro_slow_start_packets) {
                /* May be in slow-start, so don't merge. */
                ++rxq->lro.n_slow_start;
                ++c->n_in_order_pkts;
                goto deliver_buf_out;
        }

        if (__predict_false(dont_merge)) {
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                if (th->th_flags & (TH_FIN | TH_RST)) {
                        ++rxq->lro.n_drop_closed;
                        sfxge_lro_drop(rxq, c);
                        return (0);
                }
                goto deliver_buf_out;
        }

        rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

        if (__predict_true(c->mbuf != NULL)) {
                /* Remove headers and any padding */
                rx_buf->mbuf->m_data += hdr_length;
                rx_buf->mbuf->m_len = data_length;

                sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
        } else {
                /* Remove any padding */
                rx_buf->mbuf->m_len = pkt_length;

                sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
        }

        rx_buf->mbuf = NULL;
        return (1);

 deliver_buf_out:
        sfxge_rx_deliver(rxq, rx_buf);
        return (1);
}
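/*
 * Start tracking a new connection for LRO, reusing a descriptor from
 * the free list when one is available.
 */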
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
                               uint16_t l2_id, void *nh, struct tcphdr *th)
{
        unsigned bucket = conn_hash & st->conns_mask;
        struct sfxge_lro_conn *c;

        if (st->conns_n[bucket] >= lro_chain_max) {
                ++st->n_too_many;
                return;
        }

        if (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
        } else {
                c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
                if (c == NULL)
                        return;
                c->mbuf = NULL;
                c->next_buf.mbuf = NULL;
        }

        /* Create the connection tracking data */
        ++st->conns_n[bucket];
        TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
        c->l2_id = l2_id;
        c->conn_hash = conn_hash;
        c->source = th->th_sport;
        c->dest = th->th_dport;
        c->n_in_order_pkts = 0;
        c->last_pkt_ticks = *(volatile int *)&ticks;
        c->delivered = 0;
        ++st->n_new_stream;
        /* NB. We don't initialise c->next_seq, and it doesn't matter what
         * value it has.  Most likely the next packet received for this
         * connection will not match -- no harm done.
         */
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_buf->mbuf;
        struct ether_header *eh;
        struct sfxge_lro_conn *c;
        uint16_t l2_id;
        uint16_t l3_proto;
        void *nh;
        struct tcphdr *th;
        uint32_t conn_hash;
        unsigned bucket;

        /* Get the hardware hash */
        conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
            EFX_RX_HASHALG_TOEPLITZ,
            mtod(m, uint8_t *));

        eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
                l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
                    SFXGE_LRO_L2_ID_VLAN;
                l3_proto = veh->evl_proto;
                nh = veh + 1;
        } else {
                l2_id = 0;
                l3_proto = eh->ether_type;
                nh = eh + 1;
        }

        /* Check whether this is a suitable packet (unfragmented
         * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
         * length, and compute a hash if necessary.  If not, return.
         */
        if (l3_proto == htons(ETHERTYPE_IP)) {
                struct ip *iph = nh;

                KASSERT(iph->ip_p == IPPROTO_TCP,
                    ("IPv4 protocol is not TCP, but packet marker is set"));
                if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
                    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
                        goto deliver_now;
                th = (struct tcphdr *)(iph + 1);
        } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
                struct ip6_hdr *iph = nh;

                KASSERT(iph->ip6_nxt == IPPROTO_TCP,
                    ("IPv6 next header is not TCP, but packet marker is set"));
                l2_id |= SFXGE_LRO_L2_ID_IPV6;
                th = (struct tcphdr *)(iph + 1);
        } else {
                goto deliver_now;
        }

        bucket = conn_hash & rxq->lro.conns_mask;

        TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
                if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
                        continue;
                if ((c->source - th->th_sport) | (c->dest - th->th_dport))
                        continue;
                if (c->mbuf != NULL) {
                        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                                struct ip *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
                                    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
                                        continue;
                        } else {
                                struct ip6_hdr *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
                                    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
                                        continue;
                        }
                }

                /* Re-insert at head of list to reduce lookup time. */
                TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
                TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

                if (c->next_buf.mbuf != NULL) {
                        if (!sfxge_lro_try_merge(rxq, c))
                                goto deliver_now;
                } else {
                        LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
                            active_link);
                }
                c->next_buf = *rx_buf;
                c->next_eh = eh;
                c->next_nh = nh;

                rx_buf->mbuf = NULL;
                rx_buf->flags = EFX_DISCARD;
                return;
        }

        sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
        sfxge_rx_deliver(rxq, rx_buf);
}
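/*
 * At the end of an event batch, deliver or re-buffer the packets held
 * on active connections and periodically purge idle connections.
 */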
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned t;

        while (!LIST_EMPTY(&st->active_conns)) {
                c = LIST_FIRST(&st->active_conns);
                if (!c->delivered && c->mbuf != NULL)
                        sfxge_lro_deliver(st, c);
                if (sfxge_lro_try_merge(rxq, c)) {
                        if (c->mbuf != NULL)
                                sfxge_lro_deliver(st, c);
                        LIST_REMOVE(c, active_link);
                }
                c->delivered = 0;
        }

        t = *(volatile int *)&ticks;
        if (__predict_false(t != st->last_purge_ticks))
                sfxge_lro_purge_idle(rxq, t);
}
#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
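/*
 * Process receive completions: fix up checksum flags and packet
 * lengths, then pass each packet into LRO or straight up the stack.
 */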
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
        struct sfxge_softc *sc = rxq->sc;
        int if_capenable = sc->ifnet->if_capenable;
        int lro_enabled = if_capenable & IFCAP_LRO;
        unsigned int index;
        struct sfxge_evq *evq;
        unsigned int completed;
        unsigned int level;
        struct mbuf *m;
        struct sfxge_rx_sw_desc *prev = NULL;

        index = rxq->index;
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        completed = rxq->completed;
        while (completed != rxq->pending) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;

                id = completed++ & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                m = rx_desc->mbuf;

                if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                        goto discard;

                if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
                        goto discard;

                /* Read the length from the pseudo header if required */
                if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
                        uint16_t tmp_size;
                        int rc;

                        rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
                            mtod(m, uint8_t *), &tmp_size);
                        KASSERT(rc == 0, ("cannot get packet length: %d", rc));
                        rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
                }

                prefetch_read_many(mtod(m, caddr_t));

                switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                case EFX_PKT_IPV4:
                        if (~if_capenable & IFCAP_RXCSUM)
                                rx_desc->flags &=
                                    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
                        break;
                case EFX_PKT_IPV6:
                        if (~if_capenable & IFCAP_RXCSUM_IPV6)
                                rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
                        break;
                case 0:
                        /* Check for loopback packets */
                        {
                                struct ether_header *etherhp;

                                /*LINTED*/
                                etherhp = mtod(m, struct ether_header *);

                                if (etherhp->ether_type ==
                                    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                                        EFSYS_PROBE(loopback);

                                        rxq->loopback++;
                                        goto discard;
                                }
                        }
                        break;
                default:
                        KASSERT(B_FALSE,
                            ("Rx descriptor with both IPv4 and IPv6 flags"));
                        goto discard;
                }

                /* Pass packet up the stack or into LRO (pipelined) */
                if (prev != NULL) {
                        if (lro_enabled &&
                            ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                             (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                                sfxge_lro(rxq, prev);
                        else
                                sfxge_rx_deliver(rxq, prev);
                }
                prev = rx_desc;
                continue;

discard:
                /* Return the packet to the pool */
                m_free(m);
                rx_desc->mbuf = NULL;
        }
        rxq->completed = completed;

        level = rxq->added - rxq->completed;

        /* Pass last packet up the stack or into LRO */
        if (prev != NULL) {
                if (lro_enabled &&
                    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                        sfxge_lro(rxq, prev);
                else
                        sfxge_rx_deliver(rxq, prev);
        }

        /*
         * If there are any pending flows and this is the end of the
         * poll then they must be completed.
         */
        if (eop)
                sfxge_lro_end_of_burst(rxq);

        /* Top up the queue if necessary */
        if (level < rxq->refill_threshold)
                sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
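/*
 * Stop a started RX queue: flush it (retrying on failure or timeout),
 * drain outstanding descriptors and destroy the common code queue.
 */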
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        unsigned int count;
        unsigned int retry = 3;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK(evq);

        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        callout_stop(&rxq->refill_callout);

        while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
                rxq->flush_state = SFXGE_FLUSH_PENDING;

                SFXGE_EVQ_UNLOCK(evq);

                /* Flush the receive queue */
                if (efx_rx_qflush(rxq->common) != 0) {
                        SFXGE_EVQ_LOCK(evq);
                        rxq->flush_state = SFXGE_FLUSH_FAILED;
                        break;
                }

                count = 0;
                do {
                        /* Spin for 100 ms */
                        DELAY(100000);

                        if (rxq->flush_state != SFXGE_FLUSH_PENDING)
                                break;

                } while (++count < 20);

                SFXGE_EVQ_LOCK(evq);

                if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
                        /* Flush timeout - neither done nor failed */
                        log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
                            device_get_nameunit(sc->dev), index);
                        rxq->flush_state = SFXGE_FLUSH_DONE;
                }
                retry--;
        }
        if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
                log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
                    device_get_nameunit(sc->dev), index);
                rxq->flush_state = SFXGE_FLUSH_DONE;
        }

        rxq->pending = rxq->added;
        sfxge_rx_qcomplete(rxq, B_TRUE);

        KASSERT(rxq->completed == rxq->pending,
            ("rxq->completed != rxq->pending"));

        rxq->added = 0;
        rxq->pushed = 0;
        rxq->pending = 0;
        rxq->completed = 0;

        /* Destroy the common code receive queue. */
        efx_rx_qdestroy(rxq->common);

        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));

        SFXGE_EVQ_UNLOCK(evq);
}
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        efsys_mem_t *esmp;
        struct sfxge_evq *evq;
        int rc;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        esmp = &rxq->mem;
        evq = sc->evq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq->init_state != SFXGE_EVQ_STARTED"));

        /* Program the buffer table. */
        if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
            EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
                return (rc);

        /* Create the common code receive queue. */
        if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
            esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
            evq->common, &rxq->common)) != 0)
                goto fail;

        SFXGE_EVQ_LOCK(evq);

        /* Enable the receive queue. */
        efx_rx_qenable(rxq->common);

        rxq->init_state = SFXGE_RXQ_STARTED;
        rxq->flush_state = SFXGE_FLUSH_REQUIRED;

        /* Try to fill the queue from the pool. */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

        SFXGE_EVQ_UNLOCK(evq);

        return (0);

fail:
        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));
        return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
        int index;

        efx_mac_filter_default_rxq_clear(sc->enp);

        /* Stop the receive queue(s) */
        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

        sc->rx_prefix_size = 0;
        sc->rx_buffer_size = 0;

        efx_rx_fini(sc->enp);
}
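/*
 * Bring up the common RX module, size the packet buffers, program the
 * RSS indirection table and hash key, and start all receive queues.
 */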
int
sfxge_rx_start(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        const efx_nic_cfg_t *encp;
        size_t hdrlen, align, reserved;
        int index;
        int rc;

        intr = &sc->intr;

        /* Initialize the common code receive module. */
        if ((rc = efx_rx_init(sc->enp)) != 0)
                return (rc);

        encp = efx_nic_cfg_get(sc->enp);
        sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

        /* Calculate the receive packet buffer size. */
        sc->rx_prefix_size = encp->enc_rx_prefix_size;

        /* Ensure IP headers are 32bit aligned */
        hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
        sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

        sc->rx_buffer_size += sc->rx_buffer_align;

        /* Align end of packet buffer for RX DMA end padding */
        align = MAX(1, encp->enc_rx_buf_align_end);
        EFSYS_ASSERT(ISP2(align));
        sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

        /*
         * Standard mbuf zones only guarantee pointer-size alignment;
         * we need extra space to align to the cache line
         */
        reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

        /* Select zone for packet buffers */
        if (reserved <= MCLBYTES)
                sc->rx_cluster_size = MCLBYTES;
        else if (reserved <= MJUMPAGESIZE)
                sc->rx_cluster_size = MJUMPAGESIZE;
        else if (reserved <= MJUM9BYTES)
                sc->rx_cluster_size = MJUM9BYTES;
        else
                sc->rx_cluster_size = MJUM16BYTES;

        /*
         * Set up the scale table.  Enable all hash types and hash insertion.
         */
        for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
                sc->rx_indir_table[index] =
                    rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
                sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
        if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            sc->rx_indir_table, nitems(sc->rx_indir_table))) != 0)
                goto fail;
        (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            EFX_RX_HASHALG_TOEPLITZ,
            EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
            EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
        rss_getkey(toep_key);
#endif
        if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            toep_key, sizeof(toep_key))) != 0)
                goto fail;

        /* Start the receive queue(s). */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qstart(sc, index)) != 0)
                        goto fail2;
        }

        rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
            sc->intr.n_alloc > 1);
        if (rc != 0)
                goto fail2;

        return (0);

fail2:
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

fail:
        efx_rx_fini(sc->enp);

        return (rc);
}
#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        unsigned i;

        st->conns_mask = lro_table_size - 1;
        KASSERT(!((st->conns_mask + 1) & st->conns_mask),
            ("lro_table_size must be a power of 2"));
        st->sc = rxq->sc;
        st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
            M_SFXGE, M_WAITOK);
        st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
            M_SFXGE, M_ZERO | M_WAITOK);
        for (i = 0; i <= st->conns_mask; ++i) {
                TAILQ_INIT(&st->conns[i]);
                st->conns_n[i] = 0;
        }
        LIST_INIT(&st->active_conns);
        TAILQ_INIT(&st->free_conns);
}
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned i;

        /* Return cleanly if sfxge_lro_init() has not been called. */
        if (st->conns == NULL)
                return;

        KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

        for (i = 0; i <= st->conns_mask; ++i) {
                while (!TAILQ_EMPTY(&st->conns[i])) {
                        c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
                        sfxge_lro_drop(rxq, c);
                }
        }

        while (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
                KASSERT(!c->mbuf, ("found orphaned mbuf"));
                free(c, M_SFXGE);
        }

        free(st->conns_n, M_SFXGE);
        free(st->conns, M_SFXGE);
        st->conns = NULL;
}
#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;

        rxq = sc->rxq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

        /* Free the context array and the flow table. */
        free(rxq->queue, M_SFXGE);
        sfxge_lro_fini(rxq);

        /* Release DMA memory. */
        sfxge_dma_free(&rxq->mem);

        sc->rxq[index] = NULL;

        free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        efsys_mem_t *esmp;
        int rc;

        KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

        rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
        rxq->sc = sc;
        rxq->index = index;
        rxq->entries = sc->rxq_entries;
        rxq->ptr_mask = rxq->entries - 1;
        rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

        sc->rxq[index] = rxq;
        esmp = &rxq->mem;

        evq = sc->evq[index];

        /* Allocate and zero DMA space. */
        if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
                return (rc);

        /* Allocate buffer table entries. */
        sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
            &rxq->buf_base_id);

        /* Allocate the context array and the flow table. */
        rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
            M_SFXGE, M_WAITOK | M_ZERO);
        sfxge_lro_init(rxq);

        callout_init(&rxq->refill_callout, 1);

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        return (0);
}
static const struct {
        const char *name;
        size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
        SFXGE_RX_STAT(lro_merges, lro.n_merges),
        SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
        SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
        SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
        SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
        SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
        SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
        SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
        struct sfxge_softc *sc = arg1;
        unsigned int id = arg2;
        unsigned int sum, index;

        /* Sum across all RX queues */
        sum = 0;
        for (index = 0; index < sc->rxq_count; index++)
                sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
                    sfxge_rx_stats[id].offset);

        return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
        struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
        struct sysctl_oid_list *stat_list;
        unsigned int id;

        stat_list = SYSCTL_CHILDREN(sc->stats_node);

        for (id = 0; id < nitems(sfxge_rx_stats); id++) {
                SYSCTL_ADD_PROC(ctx, stat_list,
                    OID_AUTO, sfxge_rx_stats[id].name,
                    CTLTYPE_UINT|CTLFLAG_RD,
                    sc, id, sfxge_rx_stat_handler, "IU",
                    "");
        }
}
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
        int index;

        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;
}
int
sfxge_rx_init(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        int index;
        int rc;

#ifdef SFXGE_LRO
        if (!ISP2(lro_table_size)) {
                log(LOG_ERR, "%s=%u must be power of 2",
                    SFXGE_LRO_PARAM(table_size), lro_table_size);
                rc = EINVAL;
                goto fail_lro_table_size;
        }

        if (lro_idle_ticks == 0)
                lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

        intr = &sc->intr;

        sc->rxq_count = intr->n_alloc;

        KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
            ("intr->state != SFXGE_INTR_INITIALIZED"));

        /* Initialize the receive queue(s) - one per interrupt. */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qinit(sc, index)) != 0)
                        goto fail;
        }

        sfxge_rx_stat_init(sc);

        return (0);

fail:
        /* Tear down the receive queue(s). */
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
        return (rc);
}