From 7b1a84eef6c7b18234b5ad485c815c46e5bc7df1 Mon Sep 17 00:00:00 2001
From: np
Date: Wed, 7 May 2014 18:15:20 +0000
Subject: [PATCH] MFC r259527, r260210 (by adrian@), r261533, r261536, r261537, r261558 (by scottl@), r263317, r263412, r263457, and r264621 (by emax@).

r259527: Do not create a hardware IPv6 server if the listen address is not
in6addr_any and is not in the CLIP table either. This fixes a reported
TOE+IPv6 NULL-dereference panic in do_pass_open_rpl(). While here, stop
creating hardware servers for any loopback address. It's just a waste of
server tids.

r260210: Add an option to enable or disable the small RX packet copying that
is done to improve performance of small frames. When doing RX packing, the
RX copying isn't necessarily required.

r261533: cxgbe(4): Use the port's tx channel to identify it to
t4_clr_port_stats.

r261536: cxgbe(4): The T5 allows for a different freelist starvation
threshold for queues with buffer packing. Use the correct value to calculate
a freelist's low water mark.

r261537: cxgbe(4): Use the rx channel map (instead of the tx channel map) as
the congestion channel map.

r261558: Add a new sysctl, dev.cxgbe.N.rsrv_noflowq, and a companion
tunable, hw.cxgbe.rsrv_noflowq. When set, queue 0 of the port is reserved
for TX packets without a flowid. The hash value of packets with a flowid is
bumped up by 1. The intent is to provide a private queue for link-level
packets like LACP that is unlikely to overflow or suffer deep queue latency.

r263317: cxgbe(4): significant rx rework.

- More flexible cluster size selection, including the ability to fall back
  to a safe cluster size (PAGE_SIZE from zone_jumbop by default) in case an
  allocation of a larger size fails.
- A single get_fl_payload() function that assembles the payload into an mbuf
  chain for any kind of freelist. This replaces two variants: one for
  freelists with buffer packing enabled and another for those without.
- Buffer packing with any sized cluster. It was limited to 4K clusters only
  before this change.
- Enable buffer packing for TOE rx queues as well.
- Statistics and tunables to go with all these changes.

The driver's man page will be updated separately.

r263412: cxgbe(4): if_iqdrops statistic should include tunnel congestion
drops.

r263457: cxgbe(4): Recognize the "spider" configuration where a T5 card's
40G QSFP port is presented as 4 distinct 10G SFP+ ports to the driver.
r264621: use correct (integer) type for the temperature sysctl git-svn-id: svn://svn.freebsd.org/base/stable/9@265582 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- sys/dev/cxgbe/adapter.h | 79 ++- sys/dev/cxgbe/common/t4_hw.c | 1 + sys/dev/cxgbe/common/t4_hw.h | 1 + sys/dev/cxgbe/t4_main.c | 65 +- sys/dev/cxgbe/t4_sge.c | 1142 +++++++++++++++++---------------- sys/dev/cxgbe/tom/t4_listen.c | 20 + sys/dev/cxgbe/tom/t4_tom.h | 1 + 7 files changed, 726 insertions(+), 583 deletions(-) diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 670f195a1..e7b2b16d3 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -128,10 +128,11 @@ enum { RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */ #if MJUMPAGESIZE != MCLBYTES - FL_BUF_SIZES_MAX = 5, /* cluster, jumbop, jumbo9k, jumbo16k, extra */ + SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ #else - FL_BUF_SIZES_MAX = 4, /* cluster, jumbo9k, jumbo16k, extra */ + SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */ #endif + CL_METADATA_SIZE = CACHE_LINE_SIZE, CTRL_EQ_QSIZE = 128, @@ -203,10 +204,12 @@ struct port_info { uint8_t mod_type; uint8_t port_id; uint8_t tx_chan; + uint8_t rx_chan_map; /* rx MPS channel bitmap */ /* These need to be int as they are used in sysctl */ int ntxq; /* # of tx queues */ int first_txq; /* index of first tx queue */ + int rsrv_noflowq; /* Reserve queue 0 for non-flowid packets */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ #ifdef TCP_OFFLOAD @@ -232,15 +235,28 @@ struct port_info { uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */ }; -struct fl_sdesc { - bus_dmamap_t map; - caddr_t cl; - uint8_t tag_idx; /* the fl->tag entry this map comes from */ +/* Where the cluster came from, how it has been carved up. */ +struct cluster_layout { + int8_t zidx; + int8_t hwidx; + uint16_t region1; /* mbufs laid out within this region */ + /* region2 is the DMA region */ + uint16_t region3; /* cluster_metadata within this region */ +}; + +struct cluster_metadata { + u_int refcount; #ifdef INVARIANTS - __be64 ba_hwtag; + struct fl_sdesc *sd; /* For debug only. Could easily be stale */ #endif }; +struct fl_sdesc { + caddr_t cl; + uint8_t nmbuf; + struct cluster_layout cll; +}; + struct tx_desc { __be64 flit[8]; }; @@ -359,17 +375,19 @@ struct sge_eq { uint32_t unstalled; /* recovered from stall */ }; -struct fl_buf_info { - u_int size; - int type; - int hwtag:4; /* tag in low 4 bits of the pa. */ - uma_zone_t zone; +struct sw_zone_info { + uma_zone_t zone; /* zone that this cluster comes from */ + int size; /* size of cluster: 2K, 4K, 9K, 16K, etc. 
*/ + int type; /* EXT_xxx type of the cluster */ + int8_t head_hwidx; + int8_t tail_hwidx; +}; + +struct hw_buf_info { + int8_t zidx; /* backpointer to zone; -ve means unused */ + int8_t next; /* next hwidx for this zone; -1 means no more */ + int size; }; -#define FL_BUF_SIZES(sc) (sc->sge.fl_buf_sizes) -#define FL_BUF_SIZE(sc, x) (sc->sge.fl_buf_info[x].size) -#define FL_BUF_TYPE(sc, x) (sc->sge.fl_buf_info[x].type) -#define FL_BUF_HWTAG(sc, x) (sc->sge.fl_buf_info[x].hwtag) -#define FL_BUF_ZONE(sc, x) (sc->sge.fl_buf_info[x].zone) enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ @@ -383,9 +401,8 @@ enum { struct sge_fl { bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; - bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are - valid */ - uint8_t tag_idx; + struct cluster_layout cll_def; /* default refill zone, layout */ + struct cluster_layout cll_alt; /* alternate refill zone, layout */ struct mtx fl_lock; char lockname[16]; int flags; @@ -402,9 +419,17 @@ struct sge_fl { uint32_t needed; /* # of buffers needed to fill up fl. */ uint32_t lowat; /* # of buffers <= this means fl needs help */ uint32_t pending; /* # of bufs allocated since last doorbell */ - u_int dmamap_failed; - struct mbuf *mstash[8]; TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ + + struct mbuf *m0; + struct mbuf **pnext; + u_int remaining; + + uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */ + uint64_t mbuf_inlined; /* # of mbuf created within clusters */ + uint64_t cl_allocated; /* # of clusters allocated */ + uint64_t cl_recycled; /* # of clusters recycled */ + uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ }; /* txq: SGE egress queue + what's needed for Ethernet NIC */ @@ -510,6 +535,7 @@ struct sge { int timer_val[SGE_NTIMERS]; int counter_val[SGE_NCOUNTERS]; int fl_starve_threshold; + int fl_starve_threshold2; int eq_s_qpp; int iq_s_qpp; @@ -537,8 +563,11 @@ struct sge { struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ - u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE); - struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX]; + int pack_boundary; + int8_t safe_hwidx1; /* may not have room for metadata */ + int8_t safe_hwidx2; /* with room for metadata and maybe more */ + struct sw_zone_info sw_zone_info[SW_ZONE_SIZES]; + struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES]; }; struct rss_header; @@ -629,6 +658,8 @@ struct adapter { const char *last_op; const void *last_op_thr; #endif + + int sc_do_rxcopy; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index ff6eea61a..ffe606314 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -5645,6 +5645,7 @@ int __devinit t4_port_init(struct port_info *p, int mbox, int pf, int vf) p->viid = ret; p->tx_chan = j; + p->rx_chan_map = get_mps_bg_map(adap, j); p->lport = j; p->rss_size = rss_size; t4_os_set_hw_addr(adap, p->port_id, addr); diff --git a/sys/dev/cxgbe/common/t4_hw.h b/sys/dev/cxgbe/common/t4_hw.h index 3bc209633..f5ac42128 100644 --- a/sys/dev/cxgbe/common/t4_hw.h +++ b/sys/dev/cxgbe/common/t4_hw.h @@ -86,6 +86,7 @@ enum { SGE_NTIMERS = 6, /* # of interrupt holdoff timer values */ SGE_NCOUNTERS = 4, /* # of interrupt packet counter values */ SGE_MAX_IQ_SIZE = 65520, + SGE_FLBUF_SIZES = 16, }; struct sge_qstat { /* data written to SGE queue status entries */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 
7e4e1d9ce..052b3fe75 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -197,6 +197,9 @@ TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); +static int t4_rsrv_noflowq = 0; +TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq); + #ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; @@ -299,6 +302,7 @@ struct intrs_and_queues { int nrxq10g; /* # of NIC rxq's for each 10G port */ int ntxq1g; /* # of NIC txq's for each 1G port */ int nrxq1g; /* # of NIC rxq's for each 1G port */ + int rsrv_noflowq; /* Flag whether to reserve queue 0 */ #ifdef TCP_OFFLOAD int nofldtxq10g; /* # of TOE txq's for each 10G port */ int nofldrxq10g; /* # of TOE rxq's for each 10G port */ @@ -375,6 +379,7 @@ static int cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); +static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); @@ -489,6 +494,8 @@ CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS); CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES); +CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); + static int t4_probe(device_t dev) { @@ -774,6 +781,11 @@ t4_attach(device_t dev) pi->ntxq = iaq.ntxq1g; } + if (pi->ntxq > 1) + pi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0; + else + pi->rsrv_noflowq = 0; + rqidx += pi->nrxq; tqidx += pi->ntxq; @@ -1252,7 +1264,8 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) } if (m->m_flags & M_FLOWID) - txq += (m->m_pkthdr.flowid % pi->ntxq); + txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) + + pi->rsrv_noflowq); br = txq->br; if (TXQ_TRYLOCK(txq) == 0) { @@ -1704,6 +1717,7 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, iaq->ntxq1g = t4_ntxq1g; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; + iaq->rsrv_noflowq = t4_rsrv_noflowq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldtxq10g = t4_nofldtxq10g; @@ -2624,6 +2638,7 @@ build_medialist(struct port_info *pi) ifmedia_set(media, m | IFM_10G_CX4); break; + case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: @@ -4029,6 +4044,7 @@ static void cxgbe_tick(void *arg) { struct port_info *pi = arg; + struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; struct sge_txq *txq; int i, drops; @@ -4040,7 +4056,7 @@ cxgbe_tick(void *arg) return; /* without scheduling another callout */ } - t4_get_port_stats(pi->adapter, pi->tx_chan, s); + t4_get_port_stats(sc, pi->tx_chan, s); ifp->if_opackets = s->tx_frames - s->tx_pause; ifp->if_ipackets = s->rx_frames - s->rx_pause; @@ -4051,6 +4067,19 @@ cxgbe_tick(void *arg) ifp->if_iqdrops = s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3; + for (i = 0; i < 4; i++) { + if (pi->rx_chan_map & (1 << i)) { + uint32_t v; + + /* + * XXX: indirect reads from the same ADDR/DATA pair can + * race with each other. 
+ */ + t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, + 1, A_TP_MIB_TNL_CNG_DROP_0 + i); + ifp->if_iqdrops += v; + } + } drops = s->tx_drop; for_each_txq(pi, i, txq) @@ -4197,6 +4226,10 @@ t4_sysctls(struct adapter *sc) oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); + sc->sc_do_rxcopy = 1; + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, + &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); @@ -4257,7 +4290,7 @@ t4_sysctls(struct adapter *sc) NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | - CTLFLAG_RD, sc, 0, sysctl_temperature, "A", + CTLFLAG_RD, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); t4_sge_sysctls(sc, ctx, children); @@ -4498,6 +4531,9 @@ cxgbe_sysctls(struct port_info *pi) &pi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &pi->first_txq, 0, "index of first tx queue"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | + CTLFLAG_RW, pi, 0, sysctl_noflowq, "IU", + "Reserve queue 0 for non-flowid packets"); #ifdef TCP_OFFLOAD if (is_offload(sc)) { @@ -4751,6 +4787,25 @@ sysctl_btphy(SYSCTL_HANDLER_ARGS) return (rc); } +static int +sysctl_noflowq(SYSCTL_HANDLER_ARGS) +{ + struct port_info *pi = arg1; + int rc, val; + + val = pi->rsrv_noflowq; + rc = sysctl_handle_int(oidp, &val, 0, req); + if (rc != 0 || req->newptr == NULL) + return (rc); + + if ((val >= 1) && (pi->ntxq > 1)) + pi->rsrv_noflowq = 1; + else + pi->rsrv_noflowq = 0; + + return (rc); +} + static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { @@ -7728,11 +7783,11 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, if (port_id >= sc->params.nports) return (EINVAL); + pi = sc->port[port_id]; /* MAC stats */ - t4_clr_port_stats(sc, port_id); + t4_clr_port_stats(sc, pi->tx_chan); - pi = sc->port[port_id]; if (pi->flags & PORT_INIT_DONE) { struct sge_rxq *rxq; struct sge_txq *txq; diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index befcbcb19..f28f53796 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include "common/common.h" #include "common/t4_regs.h" @@ -122,6 +125,27 @@ static int t4_fl_pack; static int t5_fl_pack; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); +/* + * Allow the driver to create mbuf(s) in a cluster allocated for rx. + * 0: never; always allocate mbufs from the zone_mbuf UMA zone. + * 1: ok to create mbuf(s) within a cluster if there is room. + */ +static int allow_mbufs_in_cluster = 1; +TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster); + +/* + * Largest rx cluster size that the driver is allowed to allocate. + */ +static int largest_rx_cluster = MJUM16BYTES; +TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); + +/* + * Size of cluster allocation that's most likely to succeed. The driver will + * fall back to this size if it fails to allocate clusters larger than this. 
+ */ +static int safest_rx_cluster = PAGE_SIZE; +TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); + /* Used to track coalesced tx work request */ struct txpkts { uint64_t *flitp; /* ptr to flit where next pkt should start */ @@ -138,9 +162,7 @@ struct sgl { }; static int service_iq(struct sge_iq *, int); -static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, uint32_t, - int *); -static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, uint32_t, +static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t, int *); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int, @@ -156,6 +178,8 @@ static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *); +static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, + struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_mgmtq(struct adapter *); @@ -189,7 +213,8 @@ static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); -static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int); +static void find_best_refill_source(struct adapter *, struct sge_fl *, int); +static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int); @@ -214,6 +239,7 @@ static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); +static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. @@ -262,7 +288,7 @@ t4_sge_modload(void) /* T5's pack boundary is independent of the pad boundary. 
*/ if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) - t5_fl_pack = max(pad, 64); + t5_fl_pack = max(pad, CACHE_LINE_SIZE); else t5_fl_pack = fl_pack; @@ -312,14 +338,18 @@ t4_tweak_chip_settings(struct adapter *sc) int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); - int sw_flbuf_sizes[] = { + static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, + MJUMPAGESIZE - CL_METADATA_SIZE, + MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, - MJUMPAGESIZE - MSIZE + MCLBYTES - MSIZE - CL_METADATA_SIZE, + MJUM9BYTES - CL_METADATA_SIZE, + MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, @@ -357,9 +387,11 @@ t4_tweak_chip_settings(struct adapter *sc) V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); - for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) { + KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, + ("%s: hw buffer size table too big", __func__)); + for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i), - sw_flbuf_sizes[i]); + sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | @@ -413,6 +445,18 @@ t4_tweak_chip_settings(struct adapter *sc) t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } +/* + * SGE wants the buffer to be at least 64B and then a multiple of the pad + * boundary or 16, whichever is greater. + */ +static inline int +hwsz_ok(int hwsz) +{ + int mask = max(fl_pad, 16) - 1; + + return (hwsz >= 64 && (hwsz & mask) == 0); +} + /* * XXX: driver really should be able to deal with unexpected settings. */ @@ -423,7 +467,7 @@ t4_read_chip_settings(struct adapter *sc) int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); - uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = { + static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, @@ -431,6 +475,8 @@ t4_read_chip_settings(struct adapter *sc) MJUM9BYTES, MJUM16BYTES }; + struct sw_zone_info *swz, *safe_swz; + struct hw_buf_info *hwb; m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | @@ -461,6 +507,7 @@ t4_read_chip_settings(struct adapter *sc) rc = EINVAL; } } + s->pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack; v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | @@ -476,45 +523,93 @@ t4_read_chip_settings(struct adapter *sc) rc = EINVAL; } - /* - * Make a list of SGE FL buffer sizes programmed in the chip and tally - * it with the FL buffer sizes that we'd like to use. - */ - n = 0; - for (i = 0; i < nitems(sge_flbuf_sizes); i++) { + /* Filter out unusable hw buffer sizes entirely (mark with -2). */ + hwb = &s->hw_buf_info[0]; + for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i)); - sge_flbuf_sizes[i] = r; - if (r == MJUMPAGESIZE - MSIZE && - (sc->flags & BUF_PACKING_OK) == 0) { - sc->flags |= BUF_PACKING_OK; - FL_BUF_HWTAG(sc, n) = i; - FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE; - FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE); - FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE); - n++; - } + hwb->size = r; + hwb->zidx = hwsz_ok(r) ? 
-1 : -2; + hwb->next = -1; } - for (i = 0; i < nitems(sw_flbuf_sizes); i++) { - for (j = 0; j < nitems(sge_flbuf_sizes); j++) { - if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j]) + + /* + * Create a sorted list in decreasing order of hw buffer sizes (and so + * increasing order of spare area) for each software zone. + */ + n = 0; /* no usable buffer size to begin with */ + swz = &s->sw_zone_info[0]; + safe_swz = NULL; + for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { + int8_t head = -1, tail = -1; + + swz->size = sw_buf_sizes[i]; + swz->zone = m_getzone(swz->size); + swz->type = m_gettype(swz->size); + + if (swz->size == safest_rx_cluster) + safe_swz = swz; + + hwb = &s->hw_buf_info[0]; + for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { + if (hwb->zidx != -1 || hwb->size > swz->size) continue; - FL_BUF_HWTAG(sc, n) = j; - FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i]; - FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]); - FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]); + hwb->zidx = i; + if (head == -1) + head = tail = j; + else if (hwb->size < s->hw_buf_info[tail].size) { + s->hw_buf_info[tail].next = j; + tail = j; + } else { + int8_t *cur; + struct hw_buf_info *t; + + for (cur = &head; *cur != -1; cur = &t->next) { + t = &s->hw_buf_info[*cur]; + if (hwb->size == t->size) { + hwb->zidx = -2; + break; + } + if (hwb->size > t->size) { + hwb->next = *cur; + *cur = j; + break; + } + } + } + } + swz->head_hwidx = head; + swz->tail_hwidx = tail; + + if (tail != -1) { n++; - break; + if (swz->size - s->hw_buf_info[tail].size >= + CL_METADATA_SIZE) + sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; - } else if (n == 1 && (sc->flags & BUF_PACKING_OK)) { - device_printf(sc->dev, - "no usable SGE FL buffer size when not packing buffers.\n"); - rc = EINVAL; } - FL_BUF_SIZES(sc) = n; + + s->safe_hwidx1 = -1; + s->safe_hwidx2 = -1; + if (safe_swz != NULL) { + s->safe_hwidx1 = safe_swz->head_hwidx; + for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { + int spare; + + hwb = &s->hw_buf_info[i]; + spare = safe_swz->size - hwb->size; + if (spare < CL_METADATA_SIZE) + continue; + if (s->safe_hwidx2 == -1 || + spare == CL_METADATA_SIZE + MSIZE) + s->safe_hwidx2 = i; + if (spare >= CL_METADATA_SIZE + MSIZE) + break; + } + } r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD); s->counter_val[0] = G_THRESHOLD_0(r); @@ -568,6 +663,10 @@ t4_read_chip_settings(struct adapter *sc) r = t4_read_reg(sc, A_SGE_CONM_CTRL); s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1; + if (is_t4(sc)) + s->fl_starve_threshold2 = s->fl_starve_threshold; + else + s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1; /* egress queues: log2 of # of doorbells per BAR2 page */ r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF); @@ -622,6 +721,10 @@ t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", + CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", + "freelist buffer sizes"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)"); @@ -639,8 +742,7 @@ t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, "pack multiple frames in one fl buffer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, - NULL, is_t5(sc) ? 
t5_fl_pack : t4_fl_pack, - "payload pack boundary (bytes)"); + NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)"); } int @@ -760,7 +862,7 @@ port_intr_iq(struct port_info *pi, int idx) #ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) { idx %= pi->nrxq + pi->nofldrxq; - + if (idx >= pi->nrxq) { idx -= pi->nrxq; iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq; @@ -791,29 +893,28 @@ port_intr_iq(struct port_info *pi, int idx) return (iq); } +/* Maximum payload that can be delivered with a single iq descriptor */ static inline int -mtu_to_bufsize(int mtu) +mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { - int bufsize; - - /* large enough for a frame even when VLAN extraction is disabled */ - bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; - bufsize = roundup2(bufsize + fl_pktshift, fl_pad); - - return (bufsize); -} + int payload; #ifdef TCP_OFFLOAD -static inline int -mtu_to_bufsize_toe(struct adapter *sc, int mtu) -{ - - if (sc->tt.rx_coalesce) - return (G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2))); + if (toe) { + payload = sc->tt.rx_coalesce ? + G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; + } else { +#endif + /* large enough even when hw VLAN extraction is disabled */ + payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + + mtu; +#ifdef TCP_OFFLOAD + } +#endif + payload = roundup2(payload, fl_pad); - return (mtu); + return (payload); } -#endif int t4_setup_port_queues(struct port_info *pi) @@ -832,7 +933,7 @@ t4_setup_port_queues(struct port_info *pi) struct ifnet *ifp = pi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); - int bufsize, pack; + int maxp, pack, mtu = ifp->if_mtu; oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); @@ -853,7 +954,7 @@ t4_setup_port_queues(struct port_info *pi) * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. */ - bufsize = mtu_to_bufsize(ifp->if_mtu); + maxp = mtu_to_max_payload(sc, mtu, 0); pack = enable_buffer_packing(sc); for_each_rxq(pi, i, rxq) { @@ -862,7 +963,7 @@ t4_setup_port_queues(struct port_info *pi) snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(pi->dev), i); - init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name); + init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); if (sc->flags & INTR_DIRECT #ifdef TCP_OFFLOAD @@ -878,8 +979,7 @@ t4_setup_port_queues(struct port_info *pi) } #ifdef TCP_OFFLOAD - bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu); - pack = 0; /* XXX: think about this some more */ + maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(pi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, @@ -887,8 +987,7 @@ t4_setup_port_queues(struct port_info *pi) snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(pi->dev), i); - init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack, - name); + init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); if (sc->flags & INTR_DIRECT || (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) { @@ -1162,10 +1261,7 @@ service_iq(struct sge_iq *iq, int budget) ("%s: data for an iq (%p) with no freelist", __func__, iq)); - m0 = fl->flags & FL_BUF_PACKING ? 
- get_fl_payload1(sc, fl, lq, &fl_bufs_used) : - get_fl_payload2(sc, fl, lq, &fl_bufs_used); - + m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used); if (__predict_false(m0 == NULL)) goto process_iql; #ifdef T4_PKT_TIMESTAMP @@ -1222,6 +1318,14 @@ service_iq(struct sge_iq *iq, int budget) break; } + if (fl_bufs_used >= 16) { + FL_LOCK(fl); + fl->needed += fl_bufs_used; + refill_fl(sc, fl, 32); + FL_UNLOCK(fl); + fl_bufs_used = 0; + } + iq_next(iq); if (++ndescs == limit) { t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), @@ -1230,14 +1334,6 @@ service_iq(struct sge_iq *iq, int budget) V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; - if (fl_bufs_used > 0) { - FL_LOCK(fl); - fl->needed += fl_bufs_used; - refill_fl(sc, fl, fl->cap / 8); - FL_UNLOCK(fl); - fl_bufs_used = 0; - } - if (budget) return (EINPROGRESS); } @@ -1280,7 +1376,7 @@ process_iql: FL_LOCK(fl); fl->needed += fl_bufs_used; - starved = refill_fl(sc, fl, fl->cap / 4); + starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); @@ -1289,74 +1385,28 @@ process_iql: return (0); } -static int -fill_mbuf_stash(struct sge_fl *fl) -{ - int i; - - for (i = 0; i < nitems(fl->mstash); i++) { - if (fl->mstash[i] == NULL) { - struct mbuf *m; - if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL) - return (ENOBUFS); - fl->mstash[i] = m; - } - } - return (0); -} - -static struct mbuf * -get_mbuf_from_stash(struct sge_fl *fl) +static inline int +cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { - int i; + int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; - for (i = 0; i < nitems(fl->mstash); i++) { - if (fl->mstash[i] != NULL) { - struct mbuf *m; - - m = fl->mstash[i]; - fl->mstash[i] = NULL; - return (m); - } else - fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT); - } + if (rc) + MPASS(cll->region3 >= CL_METADATA_SIZE); - return (m_get(M_NOWAIT, MT_NOINIT)); + return (rc); } -static void -return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m) +static inline struct cluster_metadata * +cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, + caddr_t cl) { - int i; - if (m == NULL) - return; + if (cl_has_metadata(fl, cll)) { + struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; - for (i = 0; i < nitems(fl->mstash); i++) { - if (fl->mstash[i] == NULL) { - fl->mstash[i] = m; - return; - } + return ((struct cluster_metadata *)(cl + swz->size) - 1); } - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0); - m_free(m); -} - -/* buf can be any address within the buffer */ -static inline u_int * -find_buf_refcnt(caddr_t buf) -{ - uintptr_t ptr = (uintptr_t)buf; - - return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int))); -} - -static inline struct mbuf * -find_buf_mbuf(caddr_t buf) -{ - uintptr_t ptr = (uintptr_t)buf; - - return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1))); + return (NULL); } static void @@ -1364,177 +1414,115 @@ rxb_free(void *arg1, void *arg2) { uma_zone_t zone = arg1; caddr_t cl = arg2; -#ifdef notyet - u_int refcount; - refcount = *find_buf_refcnt(cl); - KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__, - cl - MSIZE, refcount)); -#endif - cl -= MSIZE; uma_zfree(zone, cl); } +/* + * The mbuf returned by this function could be allocated from zone_mbuf or + * constructed in spare room in the cluster. 
+ * + * The mbuf carries the payload in one of these ways + * a) frame inside the mbuf (mbuf from zone_mbuf) + * b) m_cljset (for clusters without metadata) zone_mbuf + * c) m_extaddref (cluster with metadata) inline mbuf + * d) m_extaddref (cluster with metadata) zone_mbuf + */ static struct mbuf * -get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, - int *fl_bufs_used) +get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags) { - struct mbuf *m0, *m; + struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; - unsigned int nbuf, len; - int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack; - - /* - * No assertion for the fl lock because we don't need it. This routine - * is called only from the rx interrupt handler and it only updates - * fl->cidx. (Contrast that with fl->pidx/fl->needed which could be - * updated in the rx interrupt handler or the starvation helper routine. - * That's why code that manipulates fl->pidx/fl->needed needs the fl - * lock but this routine does not). - */ - - KASSERT(fl->flags & FL_BUF_PACKING, - ("%s: buffer packing disabled for fl %p", __func__, fl)); + struct cluster_layout *cll = &sd->cll; + struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; + struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; + struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); + int len, padded_len; + caddr_t payload; - len = G_RSPD_LEN(len_newbuf); + len = min(total, hwb->size - fl->rx_offset); + padded_len = roundup2(len, fl_pad); + payload = sd->cl + cll->region1 + fl->rx_offset; - if ((len_newbuf & F_RSPD_NEWBUF) == 0) { - KASSERT(fl->rx_offset > 0, - ("%s: packed frame but driver at offset=0", __func__)); + if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { - /* A packed frame is guaranteed to fit entirely in this buf. */ - KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len, - ("%s: packing error. bufsz=%u, offset=%u, len=%u", - __func__, FL_BUF_SIZE(sc, sd->tag_idx), fl->rx_offset, - len)); + /* + * Copy payload into a freshly allocated mbuf. + */ - m0 = get_mbuf_from_stash(fl); - if (m0 == NULL || - m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) { - return_mbuf_to_stash(fl, m0); + m = flags & M_PKTHDR ? + m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); + if (m == NULL) return (NULL); - } - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, - BUS_DMASYNC_POSTREAD); - if (len < RX_COPY_THRESHOLD) { + fl->mbuf_allocated++; #ifdef T4_PKT_TIMESTAMP - /* Leave room for a timestamp */ - m0->m_data += 8; + /* Leave room for a timestamp */ + m->m_data += 8; #endif - bcopy(sd->cl + fl->rx_offset, mtod(m0, caddr_t), len); - m0->m_pkthdr.len = len; - m0->m_len = len; - } else { - m0->m_pkthdr.len = len; - m0->m_len = len; - m_extaddref(m0, sd->cl + fl->rx_offset, - roundup2(m0->m_len, fl_pad), - find_buf_refcnt(sd->cl), rxb_free, - FL_BUF_ZONE(sc, sd->tag_idx), sd->cl); - } - fl->rx_offset += len; - fl->rx_offset = roundup2(fl->rx_offset, fl_pad); - fl->rx_offset = roundup2(fl->rx_offset, pack_boundary); - if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) { - fl->rx_offset = 0; - (*fl_bufs_used) += 1; - if (__predict_false(++fl->cidx == fl->cap)) - fl->cidx = 0; - } + /* copy data to mbuf */ + bcopy(payload, mtod(m, caddr_t), len); - return (m0); - } + } else if (sd->nmbuf * MSIZE < cll->region1) { - KASSERT(len_newbuf & F_RSPD_NEWBUF, - ("%s: only new buffer handled here", __func__)); + /* + * There's spare room in the cluster for an mbuf. 
Create one + * and associate it with the payload that's in the cluster too. + */ - nbuf = 0; + MPASS(clm != NULL); + m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); + /* No bzero required */ + if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA, flags | M_NOFREE)) + return (NULL); + fl->mbuf_inlined++; + m_extaddref(m, payload, padded_len, &clm->refcount, rxb_free, + swz->zone, sd->cl); + sd->nmbuf++; - /* - * Move to the start of the next buffer if we are still in the middle of - * some buffer. This is the case where there was some room left in the - * previous buffer but not enough to fit this frame in its entirety. - */ - if (fl->rx_offset > 0) { - KASSERT(roundup2(len, fl_pad) > FL_BUF_SIZE(sc, sd->tag_idx) - - fl->rx_offset, ("%s: frame (%u bytes) should have fit at " - "cidx %u offset %u bufsize %u", __func__, len, fl->cidx, - fl->rx_offset, FL_BUF_SIZE(sc, sd->tag_idx))); - nbuf++; - fl->rx_offset = 0; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; - } - } + } else { - m0 = find_buf_mbuf(sd->cl); - if (m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE)) - goto done; - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD); - m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - m_extaddref(m0, sd->cl, roundup2(m0->m_len, fl_pad), - find_buf_refcnt(sd->cl), rxb_free, FL_BUF_ZONE(sc, sd->tag_idx), - sd->cl); - m0->m_pkthdr.len = len; - - fl->rx_offset = roundup2(m0->m_len, fl_pad); - fl->rx_offset = roundup2(fl->rx_offset, pack_boundary); - if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) { - fl->rx_offset = 0; - nbuf++; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; + /* + * Grab an mbuf from zone_mbuf and associate it with the + * payload in the cluster. + */ + + m = flags & M_PKTHDR ? 
+ m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); + if (m == NULL) + return (NULL); + fl->mbuf_allocated++; + if (clm != NULL) + m_extaddref(m, payload, padded_len, &clm->refcount, + rxb_free, swz->zone, sd->cl); + else { + m_cljset(m, sd->cl, swz->type); + sd->cl = NULL; /* consumed, not a recycle candidate */ } } + if (flags & M_PKTHDR) + m->m_pkthdr.len = total; + m->m_len = len; - m = m0; - len -= m->m_len; + if (fl->flags & FL_BUF_PACKING) { + fl->rx_offset += roundup2(padded_len, sc->sge.pack_boundary); + MPASS(fl->rx_offset <= hwb->size); + if (fl->rx_offset < hwb->size) + return (m); /* without advancing the cidx */ + } - while (len > 0) { - m->m_next = find_buf_mbuf(sd->cl); - m = m->m_next; - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, - BUS_DMASYNC_POSTREAD); - - /* m_init for !M_PKTHDR can't fail so don't bother */ - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, M_NOFREE); - m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - m_extaddref(m, sd->cl, roundup2(m->m_len, fl_pad), - find_buf_refcnt(sd->cl), rxb_free, - FL_BUF_ZONE(sc, sd->tag_idx), sd->cl); - - fl->rx_offset = roundup2(m->m_len, fl_pad); - fl->rx_offset = roundup2(fl->rx_offset, pack_boundary); - if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) { - fl->rx_offset = 0; - nbuf++; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; - } - } + if (__predict_false(++fl->cidx == fl->cap)) + fl->cidx = 0; + fl->rx_offset = 0; - len -= m->m_len; - } -done: - (*fl_bufs_used) += nbuf; - return (m0); + return (m); } static struct mbuf * -get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, +get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, int *fl_bufs_used) { - struct mbuf *m0, *m; - struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; - unsigned int nbuf, len; + struct mbuf *m0, *m, **pnext; + u_int nbuf, len; /* * No assertion for the fl lock because we don't need it. This routine @@ -1545,87 +1533,54 @@ get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, * lock but this routine does not). */ - KASSERT((fl->flags & FL_BUF_PACKING) == 0, - ("%s: buffer packing enabled for fl %p", __func__, fl)); - if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0)) - panic("%s: cannot handle packed frames", __func__); + nbuf = 0; len = G_RSPD_LEN(len_newbuf); - - /* - * We never want to run out of mbufs in between a frame when a frame - * spans multiple fl buffers. If the fl's mbuf stash isn't full and - * can't be filled up to the brim then fail early. 
- */ - if (len > FL_BUF_SIZE(sc, sd->tag_idx) && fill_mbuf_stash(fl) != 0) - return (NULL); - - m0 = get_mbuf_from_stash(fl); - if (m0 == NULL || - m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) { - return_mbuf_to_stash(fl, m0); - return (NULL); - } - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD); - - if (len < RX_COPY_THRESHOLD) { -#ifdef T4_PKT_TIMESTAMP - /* Leave room for a timestamp */ - m0->m_data += 8; -#endif - /* copy data to mbuf, buffer will be recycled */ - bcopy(sd->cl, mtod(m0, caddr_t), len); - m0->m_len = len; - } else { - bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map); - m_cljset(m0, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx)); - sd->cl = NULL; /* consumed */ - m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); + if (__predict_false(fl->m0 != NULL)) { + MPASS(len == fl->m0->m_pkthdr.len); + MPASS(fl->remaining < len); + + m0 = fl->m0; + pnext = fl->pnext; + len = fl->remaining; + fl->m0 = NULL; + goto get_segment; } - m0->m_pkthdr.len = len; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; + if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { + nbuf++; + fl->rx_offset = 0; + if (__predict_false(++fl->cidx == fl->cap)) + fl->cidx = 0; } - m = m0; - len -= m->m_len; - nbuf = 1; /* # of fl buffers used */ + /* + * Payload starts at rx_offset in the current hw buffer. Its length is + * 'len' and it may span multiple hw buffers. + */ + m0 = get_scatter_segment(sc, fl, len, M_PKTHDR); + len -= m0->m_len; + pnext = &m0->m_next; while (len > 0) { - /* Can't fail, we checked earlier that the stash was full. */ - m->m_next = get_mbuf_from_stash(fl); - m = m->m_next; - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, - BUS_DMASYNC_POSTREAD); - - /* m_init for !M_PKTHDR can't fail so don't bother */ - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0); - if (len <= MLEN) { - bcopy(sd->cl, mtod(m, caddr_t), len); - m->m_len = len; - } else { - bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map); - m_cljset(m, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx)); - sd->cl = NULL; /* consumed */ - m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - } - - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; + nbuf++; +get_segment: + MPASS(fl->rx_offset == 0); + m = get_scatter_segment(sc, fl, len, 0); + if (m == NULL) { + fl->m0 = m0; + fl->pnext = pnext; + fl->remaining = len; + return (NULL); } - + *pnext = m; + pnext = &m->m_next; len -= m->m_len; - nbuf++; } + *pnext = NULL; + if (fl->rx_offset == 0) + nbuf++; (*fl_bufs_used) += nbuf; - return (m0); } @@ -1982,23 +1937,23 @@ t4_update_fl_bufsize(struct ifnet *ifp) struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; - int i, bufsize; + int i, maxp, mtu = ifp->if_mtu; - bufsize = mtu_to_bufsize(ifp->if_mtu); + maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(pi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); - set_fl_tag_idx(sc, fl, bufsize); + find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD - bufsize = mtu_to_bufsize_toe(pi->adapter, ifp->if_mtu); + maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(pi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); - set_fl_tag_idx(sc, fl, bufsize); + find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif @@ -2032,7 +1987,7 @@ init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, } static inline void -init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int bufsize, int pack, +init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, int pack, 
char *name) { @@ -2040,7 +1995,8 @@ init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int bufsize, int pack, strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (pack) fl->flags |= FL_BUF_PACKING; - set_fl_tag_idx(sc, fl, bufsize); + find_best_refill_source(sc, fl, maxp); + find_safe_refill_source(sc, fl); } static inline void @@ -2169,24 +2125,6 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); - for (i = 0; i < FL_BUF_SIZES(sc); i++) { - - /* - * A freelist buffer must be 16 byte aligned as the SGE - * uses the low 4 bits of the bus addr to figure out the - * buffer size. - */ - rc = bus_dma_tag_create(sc->dmat, 16, 0, - BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, - FL_BUF_SIZE(sc, i), 1, FL_BUF_SIZE(sc, i), - BUS_DMA_ALLOCNOW, NULL, NULL, &fl->tag[i]); - if (rc != 0) { - device_printf(sc->dev, - "failed to create fl DMA tag[%d]: %d\n", - i, rc); - return (rc); - } - } len = fl->qsize * RX_FL_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); @@ -2203,7 +2141,9 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, return (rc); } fl->needed = fl->cap; - fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8); + fl->lowat = fl->flags & FL_BUF_PACKING ? + roundup2(sc->sge.fl_starve_threshold2, 8) : + roundup2(sc->sge.fl_starve_threshold, 8); c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | @@ -2301,7 +2241,7 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, static int free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl) { - int i, rc; + int rc; struct adapter *sc = iq->adapter; device_t dev; @@ -2333,29 +2273,48 @@ free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl) if (fl->sdesc) free_fl_sdesc(sc, fl); - for (i = 0; i < nitems(fl->mstash); i++) { - struct mbuf *m = fl->mstash[i]; - - if (m != NULL) { - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0); - m_free(m); - } - } - if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); - for (i = 0; i < FL_BUF_SIZES(sc); i++) { - if (fl->tag[i]) - bus_dma_tag_destroy(fl->tag[i]); - } - bzero(fl, sizeof(*fl)); } return (0); } +static void +add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, + struct sge_fl *fl) +{ + struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); + + oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, + "freelist"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", + CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", + "SGE context id of the freelist"); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, + 0, "consumer index"); + if (fl->flags & FL_BUF_PACKING) { + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", + CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); + } + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, + 0, "producer index"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", + CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", + CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", + CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", + CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); + 
SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", + CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); +} + static int alloc_fwq(struct adapter *sc) { @@ -2438,7 +2397,7 @@ tnl_cong(struct port_info *pi) else if (cong_drop == 1) return (0); else - return (1 << pi->tx_chan); + return (pi->rx_chan_map); } static int @@ -2496,22 +2455,7 @@ alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx, CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); - children = SYSCTL_CHILDREN(oid); - oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "fl", CTLFLAG_RD, - NULL, "freelist"); - children = SYSCTL_CHILDREN(oid); - - SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id", - CTLTYPE_INT | CTLFLAG_RD, &rxq->fl.cntxt_id, 0, sysctl_uint16, "I", - "SGE context id of the queue"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, - &rxq->fl.cidx, 0, "consumer index"); - if (rxq->fl.flags & FL_BUF_PACKING) { - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "rx_offset", - CTLFLAG_RD, &rxq->fl.rx_offset, 0, "packing rx offset"); - } - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, - &rxq->fl.pidx, 0, "producer index"); + add_fl_sysctls(&pi->ctx, oid, &rxq->fl); return (rc); } @@ -2545,7 +2489,7 @@ alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, char name[16]; rc = alloc_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, - 1 << pi->tx_chan); + pi->rx_chan_map); if (rc != 0) return (rc); @@ -2566,18 +2510,7 @@ alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); - children = SYSCTL_CHILDREN(oid); - oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "fl", CTLFLAG_RD, - NULL, "freelist"); - children = SYSCTL_CHILDREN(oid); - - SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id", - CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->fl.cntxt_id, 0, sysctl_uint16, - "I", "SGE context id of the queue"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, - &ofld_rxq->fl.cidx, 0, "consumer index"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, - &ofld_rxq->fl.pidx, 0, "producer index"); + add_fl_sysctls(&pi->ctx, oid, &ofld_rxq->fl); return (rc); } @@ -3064,103 +2997,83 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) { __be64 *d = &fl->desc[fl->pidx]; struct fl_sdesc *sd = &fl->sdesc[fl->pidx]; - bus_dma_tag_t tag; - bus_addr_t pa; + uintptr_t pa; caddr_t cl; - int rc; + struct cluster_layout *cll = &fl->cll_def; /* default layout */ + struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; + struct cluster_metadata *clm; FL_LOCK_ASSERT_OWNED(fl); -#ifdef INVARIANTS - if (fl->flags & FL_BUF_PACKING) - KASSERT(sd->tag_idx == 0, - ("%s: expected tag 0 but found tag %d at pidx %u instead", - __func__, sd->tag_idx, fl->pidx)); -#endif if (nbufs > fl->needed) nbufs = fl->needed; + nbufs -= (fl->pidx + nbufs) % 8; while (nbufs--) { if (sd->cl != NULL) { - KASSERT(*d == sd->ba_hwtag, - ("%s: recyling problem at pidx %d", - __func__, fl->pidx)); - - if (fl->flags & FL_BUF_PACKING) { - u_int *refcount = find_buf_refcnt(sd->cl); - - if (atomic_fetchadd_int(refcount, -1) == 1) { - *refcount = 1; /* reinstate */ - d++; - goto recycled; - } - sd->cl = NULL; /* gave up my reference */ - } else { + if (sd->nmbuf == 0) { /* - * This happens when a frame small enough to fit - * entirely in an mbuf was received in cl last - * time. 
We'd held on to cl and can reuse it - * now. Note that we reuse a cluster of the old - * size if fl->tag_idx is no longer the same as - * sd->tag_idx. + * Fast recycle without involving any atomics on + * the cluster's metadata (if the cluster has + * metadata). This happens when all frames + * received in the cluster were small enough to + * fit within a single mbuf each. */ - d++; - goto recycled; + fl->cl_fast_recycled++; + goto recycled_fast; } - } - - if (__predict_false(fl->tag_idx != sd->tag_idx)) { - bus_dmamap_t map; - bus_dma_tag_t newtag = fl->tag[fl->tag_idx]; - bus_dma_tag_t oldtag = fl->tag[sd->tag_idx]; /* - * An MTU change can get us here. Discard the old map - * which was created with the old tag, but only if - * we're able to get a new one. + * Cluster is guaranteed to have metadata. Clusters + * without metadata always take the fast recycle path + * when they're recycled. */ - rc = bus_dmamap_create(newtag, 0, &map); - if (rc == 0) { - bus_dmamap_destroy(oldtag, sd->map); - sd->map = map; - sd->tag_idx = fl->tag_idx; - } - } + clm = cl_metadata(sc, fl, &sd->cll, sd->cl); + MPASS(clm != NULL); - tag = fl->tag[sd->tag_idx]; - - cl = uma_zalloc(FL_BUF_ZONE(sc, sd->tag_idx), M_NOWAIT); - if (cl == NULL) - break; - if (fl->flags & FL_BUF_PACKING) { - *find_buf_refcnt(cl) = 1; - cl += MSIZE; + if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { + fl->cl_recycled++; + goto recycled; + } + sd->cl = NULL; /* gave up my reference */ } + MPASS(sd->cl == NULL); +alloc: + cl = uma_zalloc(swz->zone, M_NOWAIT); + if (__predict_false(cl == NULL)) { + if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || + fl->cll_def.zidx == fl->cll_alt.zidx) + break; - rc = bus_dmamap_load(tag, sd->map, cl, - FL_BUF_SIZE(sc, sd->tag_idx), oneseg_dma_callback, &pa, 0); - if (rc != 0 || pa == 0) { - fl->dmamap_failed++; - if (fl->flags & FL_BUF_PACKING) - cl -= MSIZE; - uma_zfree(FL_BUF_ZONE(sc, sd->tag_idx), cl); - break; + /* fall back to the safe zone */ + cll = &fl->cll_alt; + swz = &sc->sge.sw_zone_info[cll->zidx]; + goto alloc; } + fl->cl_allocated++; + pa = pmap_kextract((vm_offset_t)cl); + pa += cll->region1; sd->cl = cl; - *d++ = htobe64(pa | FL_BUF_HWTAG(sc, sd->tag_idx)); - + sd->cll = *cll; + *d = htobe64(pa | cll->hwidx); + clm = cl_metadata(sc, fl, cll, cl); + if (clm != NULL) { +recycled: #ifdef INVARIANTS - sd->ba_hwtag = htobe64(pa | FL_BUF_HWTAG(sc, sd->tag_idx)); + clm->sd = sd; #endif - -recycled: + clm->refcount = 1; + } + sd->nmbuf = 0; +recycled_fast: fl->pending++; fl->needed--; + d++; sd++; - if (++fl->pidx == fl->cap) { + if (__predict_false(++fl->pidx == fl->cap)) { fl->pidx = 0; sd = fl->sdesc; d = fl->desc; @@ -3201,53 +3114,33 @@ refill_sfl(void *arg) static int alloc_fl_sdesc(struct sge_fl *fl) { - struct fl_sdesc *sd; - bus_dma_tag_t tag; - int i, rc; fl->sdesc = malloc(fl->cap * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); - tag = fl->tag[fl->tag_idx]; - sd = fl->sdesc; - for (i = 0; i < fl->cap; i++, sd++) { - - sd->tag_idx = fl->tag_idx; - rc = bus_dmamap_create(tag, 0, &sd->map); - if (rc != 0) - goto failed; - } - return (0); -failed: - while (--i >= 0) { - sd--; - bus_dmamap_destroy(tag, sd->map); - } - KASSERT(sd == fl->sdesc, ("%s: EDOOFUS", __func__)); - - free(fl->sdesc, M_CXGBE); - fl->sdesc = NULL; - - return (rc); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; + struct cluster_metadata *clm; + struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->cap; i++, sd++) { + if (sd->cl == NULL) + 
continue; - if (sd->cl) { - bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map); - uma_zfree(FL_BUF_ZONE(sc, sd->tag_idx), sd->cl); - sd->cl = NULL; + cll = &sd->cll; + clm = cl_metadata(sc, fl, cll, sd->cl); + if (sd->nmbuf == 0 || + (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1)) { + uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); } - - bus_dmamap_destroy(fl->tag[sd->tag_idx], sd->map); + sd->cl = NULL; } free(fl->sdesc, M_CXGBE); @@ -4071,7 +3964,7 @@ write_eqflush_wr(struct sge_eq *eq) eq->pending++; eq->avail--; if (++eq->pidx == eq->cap) - eq->pidx = 0; + eq->pidx = 0; } static __be64 @@ -4098,52 +3991,167 @@ get_flit(bus_dma_segment_t *sgl, int nsegs, int idx) return (0); } -/* - * Find an SGE FL buffer size to use for the given bufsize. Look for the the - * smallest size that is large enough to hold bufsize or pick the largest size - * if all sizes are less than bufsize. - */ static void -set_fl_tag_idx(struct adapter *sc, struct sge_fl *fl, int bufsize) +find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { - int i, largest, best, delta, start; + int8_t zidx, hwidx, idx; + uint16_t region1, region3; + int spare, spare_needed, n; + struct sw_zone_info *swz; + struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; - if (fl->flags & FL_BUF_PACKING) { - fl->tag_idx = 0; /* first tag is the one for packing */ - return; - } + /* + * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize + * large enough for the max payload and cluster metadata. Otherwise + * settle for the largest bufsize that leaves enough room in the cluster + * for metadata. + * + * Without buffer packing: Look for the smallest zone which has a + * bufsize large enough for the max payload. Settle for the largest + * bufsize available if there's nothing big enough for max payload. + */ + spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; + swz = &sc->sge.sw_zone_info[0]; + hwidx = -1; + for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { + if (swz->size > largest_rx_cluster) { + if (__predict_true(hwidx != -1)) + break; - start = sc->flags & BUF_PACKING_OK ? 1 : 0; - delta = FL_BUF_SIZE(sc, start) - bufsize; - if (delta == 0) { - fl->tag_idx = start; /* ideal fit, look no further */ - return; - } - best = start; - largest = start; + /* + * This is a misconfiguration. largest_rx_cluster is + * preventing us from finding a refill source. See + * dev.t5nex..buffer_sizes to figure out why. + */ + device_printf(sc->dev, "largest_rx_cluster=%u leaves no" + " refill source for fl %p (dma %u). Ignored.\n", + largest_rx_cluster, fl, maxp); + } + for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { + hwb = &hwb_list[idx]; + spare = swz->size - hwb->size; + if (spare < spare_needed) + continue; - for (i = start + 1; i < FL_BUF_SIZES(sc); i++) { - int d, fl_buf_size; + hwidx = idx; /* best option so far */ + if (hwb->size >= maxp) { - fl_buf_size = FL_BUF_SIZE(sc, i); - d = fl_buf_size - bufsize; + if ((fl->flags & FL_BUF_PACKING) == 0) + goto done; /* stop looking (not packing) */ - if (d == 0) { - fl->tag_idx = i; /* ideal fit, look no further */ - return; + if (swz->size >= safest_rx_cluster) + goto done; /* stop looking (packing) */ + } + break; /* keep looking, next zone */ } - if (fl_buf_size > FL_BUF_SIZE(sc, largest)) - largest = i; - if (d > 0 && (delta < 0 || delta > d)) { - delta = d; - best = i; + } +done: + /* A usable hwidx has been located. 
*/ + MPASS(hwidx != -1); + hwb = &hwb_list[hwidx]; + zidx = hwb->zidx; + swz = &sc->sge.sw_zone_info[zidx]; + region1 = 0; + region3 = swz->size - hwb->size; + + /* + * Stay within this zone and see if there is a better match when mbuf + * inlining is allowed. Remember that the hwidx's are sorted in + * decreasing order of size (so in increasing order of spare area). + */ + for (idx = hwidx; idx != -1; idx = hwb->next) { + hwb = &hwb_list[idx]; + spare = swz->size - hwb->size; + + if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) + break; + if (spare < CL_METADATA_SIZE + MSIZE) + continue; + n = (spare - CL_METADATA_SIZE) / MSIZE; + if (n > howmany(hwb->size, maxp)) + break; + + hwidx = idx; + if (fl->flags & FL_BUF_PACKING) { + region1 = n * MSIZE; + region3 = spare - region1; + } else { + region1 = MSIZE; + region3 = spare - region1; + break; } } - if (delta > 0) - fl->tag_idx = best; /* Found a buf bigger than bufsize */ + KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, + ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); + KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, + ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); + KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == + sc->sge.sw_zone_info[zidx].size, + ("%s: bad buffer layout for fl %p, maxp %d. " + "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, + sc->sge.sw_zone_info[zidx].size, region1, + sc->sge.hw_buf_info[hwidx].size, region3)); + if (fl->flags & FL_BUF_PACKING || region1 > 0) { + KASSERT(region3 >= CL_METADATA_SIZE, + ("%s: no room for metadata. fl %p, maxp %d; " + "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, + sc->sge.sw_zone_info[zidx].size, region1, + sc->sge.hw_buf_info[hwidx].size, region3)); + KASSERT(region1 % MSIZE == 0, + ("%s: bad mbuf region for fl %p, maxp %d. 
" + "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, + sc->sge.sw_zone_info[zidx].size, region1, + sc->sge.hw_buf_info[hwidx].size, region3)); + } + + fl->cll_def.zidx = zidx; + fl->cll_def.hwidx = hwidx; + fl->cll_def.region1 = region1; + fl->cll_def.region3 = region3; +} + +static void +find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) +{ + struct sge *s = &sc->sge; + struct hw_buf_info *hwb; + struct sw_zone_info *swz; + int spare; + int8_t hwidx; + + if (fl->flags & FL_BUF_PACKING) + hwidx = s->safe_hwidx2; /* with room for metadata */ + else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { + hwidx = s->safe_hwidx2; + hwb = &s->hw_buf_info[hwidx]; + swz = &s->sw_zone_info[hwb->zidx]; + spare = swz->size - hwb->size; + + /* no good if there isn't room for an mbuf as well */ + if (spare < CL_METADATA_SIZE + MSIZE) + hwidx = s->safe_hwidx1; + } else + hwidx = s->safe_hwidx1; + + if (hwidx == -1) { + /* No fallback source */ + fl->cll_alt.hwidx = -1; + fl->cll_alt.zidx = -1; + + return; + } + + hwb = &s->hw_buf_info[hwidx]; + swz = &s->sw_zone_info[hwb->zidx]; + spare = swz->size - hwb->size; + fl->cll_alt.hwidx = hwidx; + fl->cll_alt.zidx = hwb->zidx; + if (allow_mbufs_in_cluster) + fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else - fl->tag_idx = largest; /* No buf large enough for bufsize */ + fl->cll_alt.region1 = 0; + fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void @@ -4220,3 +4228,29 @@ sysctl_uint16(SYSCTL_HANDLER_ARGS) return sysctl_handle_int(oidp, &i, 0, req); } + +static int +sysctl_bufsizes(SYSCTL_HANDLER_ARGS) +{ + struct sge *s = arg1; + struct hw_buf_info *hwb = &s->hw_buf_info[0]; + struct sw_zone_info *swz = &s->sw_zone_info[0]; + int i, rc; + struct sbuf sb; + char c; + + sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); + for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { + if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) + c = '*'; + else + c = '\0'; + + sbuf_printf(&sb, "%u%c ", hwb->size, c); + } + sbuf_trim(&sb); + sbuf_finish(&sb); + rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); + sbuf_delete(&sb); + return (rc); +} diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 17f4adb65..0dc02e3a3 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -203,6 +203,17 @@ alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi) return (NULL); } + if (inp->inp_vflag & INP_IPV6 && + !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { + struct tom_data *td = sc->tom_softc; + + lctx->ce = hold_lip(td, &inp->in6p_laddr); + if (lctx->ce == NULL) { + free(lctx, M_CXGBE); + return (NULL); + } + } + lctx->ctrlq = &sc->sge.ctrlq[pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); @@ -219,6 +230,7 @@ static int free_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; + struct tom_data *td = sc->tom_softc; INP_WLOCK_ASSERT(inp); KASSERT(lctx->refcount == 0, @@ -230,6 +242,8 @@ free_lctx(struct adapter *sc, struct listen_ctx *lctx) CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", __func__, lctx->stid, lctx, lctx->inp); + if (lctx->ce) + release_lip(td, lctx->ce); free_stid(sc, lctx); free(lctx, M_CXGBE); @@ -495,6 +509,12 @@ t4_listen_start(struct toedev *tod, struct tcpcb *tp) INP_WLOCK_ASSERT(inp); + /* Don't start a hardware listener for any loopback address. 
*/ + if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) + return (0); + if (!(inp->inp_vflag & INP_IPV6) && + IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) + return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 927bc94e1..6328240cd 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -176,6 +176,7 @@ struct listen_ctx { struct inpcb *inp; /* listening socket's inp */ struct sge_wrq *ctrlq; struct sge_ofld_rxq *ofld_rxq; + struct clip_entry *ce; TAILQ_HEAD(, synq_entry) synq; }; -- 2.45.0
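
Editor's note (not part of the patch): a minimal sketch of the tx queue
selection that r261558 adds to cxgbe_transmit(). The standalone helper name
pick_txq() and its signature are assumptions for illustration only; in the
driver the logic is inline in cxgbe_transmit() and uses pi->ntxq,
pi->rsrv_noflowq and m->m_pkthdr.flowid.

#include <stdint.h>

/*
 * Illustrative only. With rsrv_noflowq set, txq 0 is private to packets
 * without a flowid and flowid traffic is hashed over txq 1..ntxq-1; with
 * rsrv_noflowq clear, flowid traffic is hashed over all ntxq queues.
 */
static int
pick_txq(int ntxq, int rsrv_noflowq, int has_flowid, uint32_t flowid)
{
	if (!has_flowid)
		return (0);	/* non-flowid packets stay on queue 0 */
	return ((flowid % (ntxq - rsrv_noflowq)) + rsrv_noflowq);
}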