/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;

	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */
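
/*
 * Callbacks from event processing: record the outcome of an RX queue flush
 * request so that sfxge_rx_qstop() can observe it.
 */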
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
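
/*
 * Toeplitz key used for receive-side scaling: taken from the kernel RSS
 * subsystem when the RSS option is enabled, otherwise a fixed default key.
 */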
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
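
/*
 * Callout handler: post a software event to the owning event queue so that
 * the deferred refill is retried in event-processing context.
 */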
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}
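
/*
 * Bring the RX queue up towards the requested fill level: allocate mbuf
 * clusters, map them for DMA and post them to the hardware in batches of up
 * to SFXGE_REFILL_BATCH descriptors.  If allocation fails, a retry is
 * scheduled via the refill callout.
 */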
#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;
		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}
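
/* Refill the RX queue to its limit; used when a deferred refill was scheduled. */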
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
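
/* Hand a completed packet to the network stack via the interface input routine. */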
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}
static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_pseudo_hdr_hash_get(rxq->common,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO
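
/*
 * Deliver a coalesced LRO packet: finish rewriting the IP and TCP headers
 * of the merged chain and pass it up to the stack.
 */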
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
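
/*
 * Append a new in-order segment to an existing LRO chain, updating the IP
 * length and TCP flags of the coalesced packet.
 */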
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}
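
/* Begin a new LRO chain with this mbuf as the first (header-bearing) segment. */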
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}
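
/*
 * Allocate (or recycle from the free list) a connection-tracking entry and
 * insert it at the head of its hash bucket.
 */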
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}
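
/*
 * At the end of an event-queue poll, push out any held packets and, if the
 * tick has changed, purge idle connection state.
 */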
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
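
/*
 * Process received packets up to the current pending index: convert hardware
 * checksum and hash results, filter out discards and loopback packets, and
 * pass each packet into LRO or directly to the stack.
 */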
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
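
/*
 * Stop an RX queue: request a hardware flush, wait briefly for it to
 * complete or fail, then reclaim any buffers still on the ring and destroy
 * the common-code queue.
 */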
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;
		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
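
/*
 * Bring up the receive path: size the packet buffers, program the RSS
 * indirection table and hash key, start every RX queue and make queue 0 the
 * default filter target.
 */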
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}
#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
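
/* Tear down the software state for one RX queue: context array, LRO state
 * and DMA memory. */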
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}
int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}