From c4a44591ef74a8356b4f7d7a7d4d6e170c65ba2e Mon Sep 17 00:00:00 2001
From: np
Date: Thu, 21 Aug 2014 19:54:02 +0000
Subject: [PATCH] MFC r266571, r266757, r268536, r269076, r269364, r269366, r269411, r269413, r269428, r269440, r269537, r269644, r269731, and the cxgbe portion of r270063.

r266571:
cxgbe(4): Remove stray if_up from the code that creates the tracing ifnet.

r266757:
cxgbe(4): netmap support for Terminator 5 (T5) based 10G/40G cards.

Netmap gets its own hardware-assisted virtual interface and won't take
over or disrupt the "normal" interface in any way.  You can use both
simultaneously.

For kernels with DEV_NETMAP, cxgbe(4) carves out an ncxl interface
(note the 'n' prefix) in the hardware to accompany each cxl interface.
These two ifnets per port share the same wire but really are separate
interfaces in the hardware and software.  Each gets its own L2 MAC
addresses (unicast and multicast), MTU, checksum caps, etc.  You should
run netmap on the 'n' interfaces only; that's what they are for.

With this, pkt-gen is able to transmit > 45Mpps out of a single 40G port
of a T580 card.  2 port tx is at ~56Mpps total (28M + 28M) as of now.
Single port receive is at 33Mpps but this is very much a work in
progress.  I expect it to be closer to 40Mpps once done.  In any case
the current effort can already saturate multiple 10G ports of a T5 card
at the smallest legal packet size.  T4 gear is totally untested.

trantor:~# ./pkt-gen -i ncxl0 -f tx -D 00:07:43:ab:cd:ef
881.952141 main [1621] interface is ncxl0
881.952250 extract_ip_range [275] range is 10.0.0.1:0 to 10.0.0.1:0
881.952253 extract_ip_range [275] range is 10.1.0.1:0 to 10.1.0.1:0
881.962540 main [1804] mapped 334980KB at 0x801dff000
Sending on netmap:ncxl0: 4 queues, 1 threads and 1 cpus.
10.0.0.1 -> 10.1.0.1 (00:00:00:00:00:00 -> 00:07:43:ab:cd:ef)
881.962562 main [1882] Sending 512 packets every 0.000000000 s
881.962563 main [1884] Wait 2 secs for phy reset
884.088516 main [1886] Ready...
884.088535 nm_open [457] overriding ifname ncxl0 ringid 0x0 flags 0x1
884.088607 sender_body [996] start
884.093246 sender_body [1064] drop copy
885.090435 main_thread [1418] 45206353 pps (45289533 pkts in 1001840 usec)
886.091600 main_thread [1418] 45322792 pps (45375593 pkts in 1001165 usec)
887.092435 main_thread [1418] 45313992 pps (45351784 pkts in 1000834 usec)
888.094434 main_thread [1418] 45315765 pps (45406397 pkts in 1002000 usec)
889.095434 main_thread [1418] 45333218 pps (45378551 pkts in 1001000 usec)
890.097434 main_thread [1418] 45315247 pps (45405877 pkts in 1002000 usec)
891.099434 main_thread [1418] 45326515 pps (45417168 pkts in 1002000 usec)
892.101434 main_thread [1418] 45333039 pps (45423705 pkts in 1002000 usec)
893.103434 main_thread [1418] 45324105 pps (45414708 pkts in 1001999 usec)
894.105434 main_thread [1418] 45318042 pps (45408723 pkts in 1002001 usec)
895.106434 main_thread [1418] 45332430 pps (45377762 pkts in 1001000 usec)
896.107434 main_thread [1418] 45338072 pps (45383410 pkts in 1001000 usec)
...

r268536:
cxgbe(4): Add an iSCSI softc to the adapter structure.

r269076:
Some hooks in cxgbe(4) for the offloaded iSCSI driver.

r269364:
Improve compliance with style.Makefile(5).

r269366:
List one file per line in the Makefiles.  This makes it easier to read
diffs when a file is added or removed.

r269411:
cxgbe(4): minor optimizations in ingress queue processing.

Reorganize struct sge_iq.  Make the iq entry size a compile-time
constant.  While here, eliminate RX_FL_ESIZE and use EQ_ESIZE directly.
r269413:
cxgbe(4): Fix an off by one error when looking for the BAR2 doorbell
address of an egress queue.

r269428:
cxgbe(4): some optimizations in freelist handling.

r269440:
cxgbe(4): Remove an unused version of t4_enable_vi.

r269537:
cxgbe(4): Do not run any sleepable code in the SIOCSIFFLAGS handler when
IFF_PROMISC or IFF_ALLMULTI is being flipped.  bpf(4) holds its global
mutex around ifpromisc in at least the bpf_dtor path.

r269644:
cxgbe(4): Let caller specify whether it's ok to sleep in t4_sched_config
and t4_sched_params.

r269731:
cxgbe(4): Do not poke T4-only registers on a T5 (and vice versa).

Relnotes: Yes (native netmap support for Chelsio T4/T5 cards)

git-svn-id: svn://svn.freebsd.org/base/stable/10@270297 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f
---
 sys/conf/files                         |    2 +
 sys/dev/cxgbe/adapter.h                |  235 ++++-
 sys/dev/cxgbe/common/common.h          |   10 +-
 sys/dev/cxgbe/common/t4_hw.c           |   59 +-
 sys/dev/cxgbe/offload.h                |    2 +-
 sys/dev/cxgbe/t4_main.c                |  500 +++++++----
 sys/dev/cxgbe/t4_netmap.c              | 1138 ++++++++++++++++++++++++
 sys/dev/cxgbe/t4_sge.c                 |  704 ++++++++++-----
 sys/dev/cxgbe/t4_tracer.c              |    1 -
 sys/dev/cxgbe/tom/t4_cpl_io.c          |  343 ++++++-
 sys/dev/cxgbe/tom/t4_ddp.c             |   11 +
 sys/dev/cxgbe/tom/t4_tom.h             |   17 +
 sys/modules/cxgbe/Makefile             |   14 +-
 sys/modules/cxgbe/if_cxgbe/Makefile    |   25 +-
 sys/modules/cxgbe/iw_cxgbe/Makefile    |   26 +-
 sys/modules/cxgbe/t4_firmware/Makefile |   14 +-
 sys/modules/cxgbe/t5_firmware/Makefile |   14 +-
 sys/modules/cxgbe/tom/Makefile         |   18 +-
 18 files changed, 2610 insertions(+), 523 deletions(-)
 create mode 100644 sys/dev/cxgbe/t4_netmap.c

diff --git a/sys/conf/files b/sys/conf/files index e3932eba3..3846ed7a2 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1128,6 +1128,8 @@ dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw \ compile-with "${NORMAL_C} -I$S/dev/cxgb" dev/cxgbe/t4_main.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" +dev/cxgbe/t4_netmap.c optional cxgbe pci \ + compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_sge.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_l2t.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 0a986ec24..69e87a815 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -48,6 +48,7 @@ #include #include "offload.h" +#include "common/t4_msg.h" #include "firmware/t4fw_interface.h" MALLOC_DECLARE(M_CXGBE); @@ -118,15 +119,24 @@ struct adapter; typedef struct adapter adapter_t; enum { + /* + * All ingress queues use this entry size. Note that the firmware event + * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to + * be at least 64.
+ */ + IQ_ESIZE = 64, + + /* Default queue sizes for all kinds of ingress queues */ FW_IQ_QSIZE = 256, - FW_IQ_ESIZE = 64, /* At least 64 mandated by the firmware spec */ - RX_IQ_QSIZE = 1024, - RX_IQ_ESIZE = 64, /* At least 64 so CPL_RX_PKT will fit */ - EQ_ESIZE = 64, /* All egress queues use this entry size */ + /* All egress queues use this entry size */ + EQ_ESIZE = 64, + + /* Default queue sizes for all kinds of egress queues */ + CTRL_EQ_QSIZE = 128, + TX_EQ_QSIZE = 1024, - RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */ #if MJUMPAGESIZE != MCLBYTES SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ #else @@ -134,9 +144,7 @@ enum { #endif CL_METADATA_SIZE = CACHE_LINE_SIZE, - CTRL_EQ_QSIZE = 128, - - TX_EQ_QSIZE = 1024, + SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 36, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; @@ -148,6 +156,17 @@ enum { INTR_MSIX = (1 << 2) }; +enum { + XGMAC_MTU = (1 << 0), + XGMAC_PROMISC = (1 << 1), + XGMAC_ALLMULTI = (1 << 2), + XGMAC_VLANEX = (1 << 3), + XGMAC_UCADDR = (1 << 4), + XGMAC_MCADDRS = (1 << 5), + + XGMAC_ALL = 0xffff +}; + enum { /* flags understood by begin_synchronized_op */ HOLD_LOCK = (1 << 0), @@ -162,7 +181,7 @@ enum { /* adapter flags */ FULL_INIT_DONE = (1 << 0), FW_OK = (1 << 1), - INTR_DIRECT = (1 << 2), /* direct interrupts for everything */ + /* INTR_DIRECT = (1 << 2), No longer used. */ MASTER_PF = (1 << 3), ADAP_SYSCTL_CTX = (1 << 4), TOM_INIT_DONE = (1 << 5), @@ -175,6 +194,10 @@ enum { PORT_INIT_DONE = (1 << 1), PORT_SYSCTL_CTX = (1 << 2), HAS_TRACEQ = (1 << 3), + INTR_RXQ = (1 << 4), /* All NIC rxq's take interrupts */ + INTR_OFLD_RXQ = (1 << 5), /* All TOE rxq's take interrupts */ + INTR_NM_RXQ = (1 << 6), /* All netmap rxq's take interrupts */ + INTR_ALL = (INTR_RXQ | INTR_OFLD_RXQ | INTR_NM_RXQ), }; #define IS_DOOMED(pi) ((pi)->flags & DOOMED) @@ -218,6 +241,19 @@ struct port_info { int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ int first_ofld_rxq; /* index of first offload rx queue */ +#endif +#ifdef DEV_NETMAP + int nnmtxq; /* # of netmap tx queues */ + int first_nm_txq; /* index of first netmap tx queue */ + int nnmrxq; /* # of netmap rx queues */ + int first_nm_rxq; /* index of first netmap rx queue */ + + struct ifnet *nm_ifp; + struct ifmedia nm_media; + int nmif_flags; + uint16_t nm_viid; + int16_t nm_xact_addr_filt; + uint16_t nm_rss_size; /* size of netmap VI's RSS table slice */ #endif int tmr_idx; int pktc_idx; @@ -281,6 +317,16 @@ struct tx_sdesc { uint8_t credits; /* NIC txq: # of frames sent out in the WR */ }; + +#define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header)) +struct iq_desc { + struct rss_header rss; + uint8_t cpl[IQ_PAD]; + struct rsp_ctrl rsp; +}; +#undef IQ_PAD +CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE); + enum { /* iq flags */ IQ_ALLOCATED = (1 << 0), /* firmware resources allocated */ @@ -298,27 +344,25 @@ enum { * Ingress Queue: T4 is producer, driver is consumer. 
*/ struct sge_iq { - bus_dma_tag_t desc_tag; - bus_dmamap_t desc_map; - bus_addr_t ba; /* bus address of descriptor ring */ uint32_t flags; - uint16_t abs_id; /* absolute SGE id for the iq */ - int8_t intr_pktc_idx; /* packet count threshold index */ - int8_t pad0; - __be64 *desc; /* KVA of descriptor ring */ - volatile int state; struct adapter *adapter; - const __be64 *cdesc; /* current descriptor */ + struct iq_desc *desc; /* KVA of descriptor ring */ + int8_t intr_pktc_idx; /* packet count threshold index */ uint8_t gen; /* generation bit */ uint8_t intr_params; /* interrupt holdoff parameters */ uint8_t intr_next; /* XXX: holdoff for next interrupt */ - uint8_t esize; /* size (bytes) of each entry in the queue */ uint16_t qsize; /* size (# of entries) of the queue */ + uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer index */ uint16_t cntxt_id; /* SGE context id for the iq */ + uint16_t abs_id; /* absolute SGE id for the iq */ STAILQ_ENTRY(sge_iq) link; + + bus_dma_tag_t desc_tag; + bus_dmamap_t desc_map; + bus_addr_t ba; /* bus address of descriptor ring */ }; enum { @@ -356,7 +400,7 @@ struct sge_eq { struct tx_desc *desc; /* KVA of descriptor ring */ bus_addr_t ba; /* bus address of descriptor ring */ struct sge_qstat *spg; /* status page, for convenience */ - int doorbells; + uint16_t doorbells; volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ uint16_t cap; /* max # of desc, for convenience */ @@ -394,43 +438,55 @@ enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ FL_DOOMED = (1 << 1), /* about to be destroyed */ FL_BUF_PACKING = (1 << 2), /* buffer packing enabled */ + FL_BUF_RESUME = (1 << 3), /* resume from the middle of the frame */ }; -#define FL_RUNNING_LOW(fl) (fl->cap - fl->needed <= fl->lowat) -#define FL_NOT_RUNNING_LOW(fl) (fl->cap - fl->needed >= 2 * fl->lowat) +#define FL_RUNNING_LOW(fl) \ + (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) <= fl->lowat) +#define FL_NOT_RUNNING_LOW(fl) \ + (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) >= 2 * fl->lowat) struct sge_fl { - bus_dma_tag_t desc_tag; - bus_dmamap_t desc_map; - struct cluster_layout cll_def; /* default refill zone, layout */ - struct cluster_layout cll_alt; /* alternate refill zone, layout */ struct mtx fl_lock; - char lockname[16]; - int flags; - __be64 *desc; /* KVA of descriptor ring, ptr to addresses */ - bus_addr_t ba; /* bus address of descriptor ring */ struct fl_sdesc *sdesc; /* KVA of software descriptor ring */ - uint32_t cap; /* max # of buffers, for convenience */ - uint16_t qsize; /* size (# of entries) of the queue */ - uint16_t cntxt_id; /* SGE context id for the freelist */ - uint32_t cidx; /* consumer idx (buffer idx, NOT hw desc idx) */ - uint32_t rx_offset; /* offset in fl buf (when buffer packing) */ - uint32_t pidx; /* producer idx (buffer idx, NOT hw desc idx) */ - uint32_t needed; /* # of buffers needed to fill up fl. 
*/ - uint32_t lowat; /* # of buffers <= this means fl needs help */ - uint32_t pending; /* # of bufs allocated since last doorbell */ - TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ + struct cluster_layout cll_def; /* default refill zone, layout */ + uint16_t lowat; /* # of buffers <= this means fl needs help */ + int flags; + uint16_t buf_boundary; - struct mbuf *m0; - struct mbuf **pnext; - u_int remaining; + /* The 16b idx all deal with hw descriptors */ + uint16_t dbidx; /* hw pidx after last doorbell */ + uint16_t sidx; /* index of status page */ + volatile uint16_t hw_cidx; + + /* The 32b idx are all buffer idx, not hardware descriptor idx */ + uint32_t cidx; /* consumer index */ + uint32_t pidx; /* producer index */ + + uint32_t dbval; + u_int rx_offset; /* offset in fl buf (when buffer packing) */ + volatile uint32_t *udb; uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */ uint64_t mbuf_inlined; /* # of mbuf created within clusters */ uint64_t cl_allocated; /* # of clusters allocated */ uint64_t cl_recycled; /* # of clusters recycled */ uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ + + /* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. */ + struct mbuf *m0; + struct mbuf **pnext; + u_int remaining; + + uint16_t qsize; /* # of hw descriptors (status page included) */ + uint16_t cntxt_id; /* SGE context id for the freelist */ + TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ + bus_dma_tag_t desc_tag; + bus_dmamap_t desc_map; + char lockname[16]; + bus_addr_t ba; /* bus address of descriptor ring */ + struct cluster_layout cll_alt; /* alternate refill zone, layout */ }; /* txq: SGE egress queue + what's needed for Ethernet NIC */ @@ -532,6 +588,64 @@ struct sge_wrq { uint32_t no_desc; /* out of hardware descriptors */ } __aligned(CACHE_LINE_SIZE); + +#ifdef DEV_NETMAP +struct sge_nm_rxq { + struct port_info *pi; + + struct iq_desc *iq_desc; + uint16_t iq_abs_id; + uint16_t iq_cntxt_id; + uint16_t iq_cidx; + uint16_t iq_sidx; + uint8_t iq_gen; + + __be64 *fl_desc; + uint16_t fl_cntxt_id; + uint32_t fl_cidx; + uint32_t fl_pidx; + uint32_t fl_sidx; + uint32_t fl_db_val; + u_int fl_hwidx:4; + + u_int nid; /* netmap ring # for this queue */ + + /* infrequently used items after this */ + + bus_dma_tag_t iq_desc_tag; + bus_dmamap_t iq_desc_map; + bus_addr_t iq_ba; + int intr_idx; + + bus_dma_tag_t fl_desc_tag; + bus_dmamap_t fl_desc_map; + bus_addr_t fl_ba; +} __aligned(CACHE_LINE_SIZE); + +struct sge_nm_txq { + struct tx_desc *desc; + uint16_t cidx; + uint16_t pidx; + uint16_t sidx; + uint16_t equiqidx; /* EQUIQ last requested at this pidx */ + uint16_t equeqidx; /* EQUEQ last requested at this pidx */ + uint16_t dbidx; /* pidx of the most recent doorbell */ + uint16_t doorbells; + volatile uint32_t *udb; + u_int udb_qid; + u_int cntxt_id; + __be32 cpl_ctrl0; /* for convenience */ + u_int nid; /* netmap ring # for this queue */ + + /* infrequently used items after this */ + + bus_dma_tag_t desc_tag; + bus_dmamap_t desc_map; + bus_addr_t ba; + int iqidx; +} __aligned(CACHE_LINE_SIZE); +#endif + struct sge { int timer_val[SGE_NTIMERS]; int counter_val[SGE_NCOUNTERS]; @@ -545,6 +659,10 @@ struct sge { #ifdef TCP_OFFLOAD int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ +#endif +#ifdef DEV_NETMAP + int nnmrxq; /* total # of netmap rx queues */ + int nnmtxq; /* total # of netmap tx queues */ #endif int niq; /* total # of ingress queues */ int neq; /* total # of egress queues */ @@ -558,6 +676,10 
@@ struct sge { struct sge_wrq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ #endif +#ifdef DEV_NETMAP + struct sge_nm_txq *nm_txq; /* netmap tx queues */ + struct sge_nm_rxq *nm_rxq; /* netmap rx queues */ +#endif uint16_t iq_start; int eq_start; @@ -619,11 +741,12 @@ struct adapter { void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; void *iwarp_softc; /* (struct c4iw_dev *) */ + void *iscsi_softc; #endif struct l2t_data *l2t; /* L2 table */ struct tid_info tids; - int doorbells; + uint16_t doorbells; int open_device_map; #ifdef TCP_OFFLOAD int offload_map; @@ -724,6 +847,18 @@ struct adapter { #define for_each_ofld_rxq(pi, iter, q) \ for (q = &pi->adapter->sge.ofld_rxq[pi->first_ofld_rxq], iter = 0; \ iter < pi->nofldrxq; ++iter, ++q) +#define for_each_nm_txq(pi, iter, q) \ + for (q = &pi->adapter->sge.nm_txq[pi->first_nm_txq], iter = 0; \ + iter < pi->nnmtxq; ++iter, ++q) +#define for_each_nm_rxq(pi, iter, q) \ + for (q = &pi->adapter->sge.nm_rxq[pi->first_nm_rxq], iter = 0; \ + iter < pi->nnmrxq; ++iter, ++q) + +#define IDXINCR(idx, incr, wrap) do { \ + idx = wrap - idx > incr ? idx + incr : incr - (wrap - idx); \ +} while (0) +#define IDXDIFF(head, tail, wrap) \ + ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) /* One for errors, one for firmware events */ #define T4_EXTRA_INTR 2 @@ -848,6 +983,18 @@ int t4_register_fw_msg_handler(struct adapter *, int, fw_msg_handler_t); int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int begin_synchronized_op(struct adapter *, struct port_info *, int, char *); void end_synchronized_op(struct adapter *, int); +int update_mac_settings(struct ifnet *, int); +int adapter_full_init(struct adapter *); +int adapter_full_uninit(struct adapter *); +int port_full_init(struct port_info *); +int port_full_uninit(struct port_info *); + +#ifdef DEV_NETMAP +/* t4_netmap.c */ +int create_netmap_ifnet(struct port_info *); +int destroy_netmap_ifnet(struct port_info *); +void t4_nm_intr(void *); +#endif /* t4_sge.c */ void t4_sge_modload(void); diff --git a/sys/dev/cxgbe/common/common.h b/sys/dev/cxgbe/common/common.h index 84f31c7bf..7a88462a2 100644 --- a/sys/dev/cxgbe/common/common.h +++ b/sys/dev/cxgbe/common/common.h @@ -561,11 +561,11 @@ int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int exactf, unsigned int rcaps, unsigned int wxcaps); int t4_alloc_vi_func(struct adapter *adap, unsigned int mbox, unsigned int port, unsigned int pf, unsigned int vf, - unsigned int nmac, u8 *mac, unsigned int *rss_size, + unsigned int nmac, u8 *mac, u16 *rss_size, unsigned int portfunc, unsigned int idstype); int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port, unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, - unsigned int *rss_size); + u16 *rss_size); int t4_free_vi(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int viid); @@ -614,8 +614,10 @@ int t4_sge_ctxt_rd_bd(struct adapter *adap, unsigned int cid, enum ctxt_type cty int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox); int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl); int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox, u32 addr, u32 val); -int t4_sched_config(struct adapter *adapter, int type, int minmaxen); +int t4_sched_config(struct adapter *adapter, int type, int minmaxen, + int sleep_ok); int t4_sched_params(struct adapter *adapter, int type, int level, int mode, int 
rateunit, int ratemode, int channel, int cl, - int minrate, int maxrate, int weight, int pktsize); + int minrate, int maxrate, int weight, int pktsize, + int sleep_ok); #endif /* __CHELSIO_COMMON_H */ diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index e4badf2c2..4287a8188 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -2074,15 +2074,18 @@ static void pcie_intr_handler(struct adapter *adapter) int fat; - fat = t4_handle_intr_status(adapter, - A_PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS, - sysbus_intr_info) + - t4_handle_intr_status(adapter, - A_PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS, - pcie_port_intr_info) + - t4_handle_intr_status(adapter, A_PCIE_INT_CAUSE, - is_t4(adapter) ? - pcie_intr_info : t5_pcie_intr_info); + if (is_t4(adapter)) + fat = t4_handle_intr_status(adapter, + A_PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS, + sysbus_intr_info) + + t4_handle_intr_status(adapter, + A_PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS, + pcie_port_intr_info) + + t4_handle_intr_status(adapter, A_PCIE_INT_CAUSE, + pcie_intr_info); + else + fat = t4_handle_intr_status(adapter, A_PCIE_INT_CAUSE, + t5_pcie_intr_info); if (fat) t4_fatal_err(adapter); } @@ -2463,9 +2466,15 @@ static void ma_intr_handler(struct adapter *adapter) { u32 v, status = t4_read_reg(adapter, A_MA_INT_CAUSE); - if (status & F_MEM_PERR_INT_CAUSE) + if (status & F_MEM_PERR_INT_CAUSE) { CH_ALERT(adapter, "MA parity error, parity status %#x\n", - t4_read_reg(adapter, A_MA_PARITY_ERROR_STATUS)); + t4_read_reg(adapter, A_MA_PARITY_ERROR_STATUS1)); + if (is_t5(adapter)) + CH_ALERT(adapter, + "MA parity error, parity status %#x\n", + t4_read_reg(adapter, + A_MA_PARITY_ERROR_STATUS2)); + } if (status & F_MEM_WRAP_INT_CAUSE) { v = t4_read_reg(adapter, A_MA_INT_WRAP_STATUS); CH_ALERT(adapter, "MA address wrap-around error by client %u to" @@ -2682,10 +2691,8 @@ void t4_intr_clear(struct adapter *adapter) { static const unsigned int cause_reg[] = { A_SGE_INT_CAUSE1, A_SGE_INT_CAUSE2, A_SGE_INT_CAUSE3, - A_PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS, - A_PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS, A_PCIE_NONFAT_ERR, A_PCIE_INT_CAUSE, - A_MA_INT_WRAP_STATUS, A_MA_PARITY_ERROR_STATUS, A_MA_INT_CAUSE, + A_MA_INT_WRAP_STATUS, A_MA_PARITY_ERROR_STATUS1, A_MA_INT_CAUSE, A_EDC_INT_CAUSE, EDC_REG(A_EDC_INT_CAUSE, 1), A_CIM_HOST_INT_CAUSE, A_CIM_HOST_UPACC_INT_CAUSE, MYPF_REG(A_CIM_PF_HOST_INT_CAUSE), @@ -2707,6 +2714,14 @@ void t4_intr_clear(struct adapter *adapter) t4_write_reg(adapter, is_t4(adapter) ? 
A_MC_INT_CAUSE : A_MC_P_INT_CAUSE, 0xffffffff); + if (is_t4(adapter)) { + t4_write_reg(adapter, A_PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS, + 0xffffffff); + t4_write_reg(adapter, A_PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS, + 0xffffffff); + } else + t4_write_reg(adapter, A_MA_PARITY_ERROR_STATUS2, 0xffffffff); + t4_write_reg(adapter, A_PL_INT_CAUSE, GLBL_INTR_MASK); (void) t4_read_reg(adapter, A_PL_INT_CAUSE); /* flush */ } @@ -4874,7 +4889,7 @@ int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, */ int t4_alloc_vi_func(struct adapter *adap, unsigned int mbox, unsigned int port, unsigned int pf, unsigned int vf, - unsigned int nmac, u8 *mac, unsigned int *rss_size, + unsigned int nmac, u8 *mac, u16 *rss_size, unsigned int portfunc, unsigned int idstype) { int ret; @@ -4929,7 +4944,7 @@ int t4_alloc_vi_func(struct adapter *adap, unsigned int mbox, */ int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port, unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, - unsigned int *rss_size) + u16 *rss_size) { return t4_alloc_vi_func(adap, mbox, port, pf, vf, nmac, mac, rss_size, FW_VI_FUNC_ETH, 0); @@ -5671,7 +5686,7 @@ int __devinit t4_port_init(struct port_info *p, int mbox, int pf, int vf) u8 addr[6]; int ret, i, j; struct fw_port_cmd c; - unsigned int rss_size; + u16 rss_size; adapter_t *adap = p->adapter; memset(&c, 0, sizeof(c)); @@ -5714,7 +5729,8 @@ int __devinit t4_port_init(struct port_info *p, int mbox, int pf, int vf) return 0; } -int t4_sched_config(struct adapter *adapter, int type, int minmaxen) +int t4_sched_config(struct adapter *adapter, int type, int minmaxen, + int sleep_ok) { struct fw_sched_cmd cmd; @@ -5729,12 +5745,13 @@ int t4_sched_config(struct adapter *adapter, int type, int minmaxen) cmd.u.config.minmaxen = minmaxen; return t4_wr_mbox_meat(adapter,adapter->mbox, &cmd, sizeof(cmd), - NULL, 1); + NULL, sleep_ok); } int t4_sched_params(struct adapter *adapter, int type, int level, int mode, int rateunit, int ratemode, int channel, int cl, - int minrate, int maxrate, int weight, int pktsize) + int minrate, int maxrate, int weight, int pktsize, + int sleep_ok) { struct fw_sched_cmd cmd; @@ -5758,5 +5775,5 @@ int t4_sched_params(struct adapter *adapter, int type, int level, int mode, cmd.u.params.pktsize = cpu_to_be16(pktsize); return t4_wr_mbox_meat(adapter,adapter->mbox, &cmd, sizeof(cmd), - NULL, 1); + NULL, sleep_ok); } diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index a03d114a1..e5f3ec2ea 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -153,6 +153,6 @@ int t4_register_uld(struct uld_info *); int t4_unregister_uld(struct uld_info *); int t4_activate_uld(struct adapter *, int); int t4_deactivate_uld(struct adapter *, int); +void t4_iscsi_init(struct ifnet *, unsigned int, const unsigned int *); #endif - #endif diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 8e8fdc572..cc51b7629 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -218,6 +218,24 @@ static int t4_nofldrxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq1g", &t4_nofldrxq1g); #endif +#ifdef DEV_NETMAP +#define NNMTXQ_10G 2 +static int t4_nnmtxq10g = -1; +TUNABLE_INT("hw.cxgbe.nnmtxq10g", &t4_nnmtxq10g); + +#define NNMRXQ_10G 2 +static int t4_nnmrxq10g = -1; +TUNABLE_INT("hw.cxgbe.nnmrxq10g", &t4_nnmrxq10g); + +#define NNMTXQ_1G 1 +static int t4_nnmtxq1g = -1; +TUNABLE_INT("hw.cxgbe.nnmtxq1g", &t4_nnmtxq1g); + +#define NNMRXQ_1G 1 +static int t4_nnmrxq1g = -1; +TUNABLE_INT("hw.cxgbe.nnmrxq1g", 
&t4_nnmrxq1g); +#endif + /* * Holdoff parameters for 10G and 1G ports. */ @@ -295,19 +313,26 @@ static int t5_write_combine = 0; TUNABLE_INT("hw.cxl.write_combine", &t5_write_combine); struct intrs_and_queues { - int intr_type; /* INTx, MSI, or MSI-X */ - int nirq; /* Number of vectors */ - int intr_flags; - int ntxq10g; /* # of NIC txq's for each 10G port */ - int nrxq10g; /* # of NIC rxq's for each 10G port */ - int ntxq1g; /* # of NIC txq's for each 1G port */ - int nrxq1g; /* # of NIC rxq's for each 1G port */ - int rsrv_noflowq; /* Flag whether to reserve queue 0 */ + uint16_t intr_type; /* INTx, MSI, or MSI-X */ + uint16_t nirq; /* Total # of vectors */ + uint16_t intr_flags_10g;/* Interrupt flags for each 10G port */ + uint16_t intr_flags_1g; /* Interrupt flags for each 1G port */ + uint16_t ntxq10g; /* # of NIC txq's for each 10G port */ + uint16_t nrxq10g; /* # of NIC rxq's for each 10G port */ + uint16_t ntxq1g; /* # of NIC txq's for each 1G port */ + uint16_t nrxq1g; /* # of NIC rxq's for each 1G port */ + uint16_t rsrv_noflowq; /* Flag whether to reserve queue 0 */ #ifdef TCP_OFFLOAD - int nofldtxq10g; /* # of TOE txq's for each 10G port */ - int nofldrxq10g; /* # of TOE rxq's for each 10G port */ - int nofldtxq1g; /* # of TOE txq's for each 1G port */ - int nofldrxq1g; /* # of TOE rxq's for each 1G port */ + uint16_t nofldtxq10g; /* # of TOE txq's for each 10G port */ + uint16_t nofldrxq10g; /* # of TOE rxq's for each 10G port */ + uint16_t nofldtxq1g; /* # of TOE txq's for each 1G port */ + uint16_t nofldrxq1g; /* # of TOE rxq's for each 1G port */ +#endif +#ifdef DEV_NETMAP + uint16_t nnmtxq10g; /* # of netmap txq's for each 10G port */ + uint16_t nnmrxq10g; /* # of netmap rxq's for each 10G port */ + uint16_t nnmtxq1g; /* # of netmap txq's for each 1G port */ + uint16_t nnmrxq1g; /* # of netmap rxq's for each 1G port */ #endif }; @@ -321,17 +346,6 @@ struct filter_entry { struct t4_filter_specification fs; }; -enum { - XGMAC_MTU = (1 << 0), - XGMAC_PROMISC = (1 << 1), - XGMAC_ALLMULTI = (1 << 2), - XGMAC_VLANEX = (1 << 3), - XGMAC_UCADDR = (1 << 4), - XGMAC_MCADDRS = (1 << 5), - - XGMAC_ALL = 0xffff -}; - static int map_bars_0_and_4(struct adapter *); static int map_bar_2(struct adapter *); static void setup_memwin(struct adapter *); @@ -350,15 +364,10 @@ static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); -static void build_medialist(struct port_info *); -static int update_mac_settings(struct port_info *, int); +static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct port_info *); static int cxgbe_uninit_synchronized(struct port_info *); static int setup_intr_handlers(struct adapter *); -static int adapter_full_init(struct adapter *); -static int adapter_full_uninit(struct adapter *); -static int port_full_init(struct port_info *); -static int port_full_uninit(struct port_info *); static void quiesce_eq(struct adapter *, struct sge_eq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); @@ -556,6 +565,9 @@ t4_attach(device_t dev) #ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif +#ifdef DEV_NETMAP + int nm_rqidx, nm_tqidx; +#endif sc = device_get_softc(dev); sc->dev = dev; @@ -684,6 +696,13 @@ t4_attach(device_t dev) sc->port[i] = NULL; goto done; } + rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, 
&pi->link_cfg); + if (rc != 0) { + device_printf(dev, "port %d l1cfg failed: %d\n", i, rc); + free(pi, M_CXGBE); + sc->port[i] = NULL; + goto done; + } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); @@ -725,7 +744,6 @@ t4_attach(device_t dev) sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; - sc->flags |= iaq.intr_flags; s = &sc->sge; s->nrxq = n10g * iaq.nrxq10g + n1g * iaq.nrxq1g; @@ -733,10 +751,8 @@ t4_attach(device_t dev) s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ - #ifdef TCP_OFFLOAD if (is_offload(sc)) { - s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; s->nofldtxq = n10g * iaq.nofldtxq10g + n1g * iaq.nofldtxq1g; s->neq += s->nofldtxq + s->nofldrxq; @@ -748,6 +764,17 @@ t4_attach(device_t dev) M_CXGBE, M_ZERO | M_WAITOK); } #endif +#ifdef DEV_NETMAP + s->nnmrxq = n10g * iaq.nnmrxq10g + n1g * iaq.nnmrxq1g; + s->nnmtxq = n10g * iaq.nnmtxq10g + n1g * iaq.nnmtxq1g; + s->neq += s->nnmtxq + s->nnmrxq; + s->niq += s->nnmrxq; + + s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), + M_CXGBE, M_ZERO | M_WAITOK); + s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), + M_CXGBE, M_ZERO | M_WAITOK); +#endif s->ctrlq = malloc(sc->params.nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); @@ -772,6 +799,9 @@ t4_attach(device_t dev) rqidx = tqidx = 0; #ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; +#endif +#ifdef DEV_NETMAP + nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; @@ -782,9 +812,11 @@ t4_attach(device_t dev) pi->first_rxq = rqidx; pi->first_txq = tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { + pi->flags |= iaq.intr_flags_10g; pi->nrxq = iaq.nrxq10g; pi->ntxq = iaq.ntxq10g; } else { + pi->flags |= iaq.intr_flags_1g; pi->nrxq = iaq.nrxq1g; pi->ntxq = iaq.ntxq1g; } @@ -796,7 +828,6 @@ t4_attach(device_t dev) rqidx += pi->nrxq; tqidx += pi->ntxq; - #ifdef TCP_OFFLOAD if (is_offload(sc)) { pi->first_ofld_rxq = ofld_rqidx; @@ -811,6 +842,19 @@ t4_attach(device_t dev) ofld_rqidx += pi->nofldrxq; ofld_tqidx += pi->nofldtxq; } +#endif +#ifdef DEV_NETMAP + pi->first_nm_rxq = nm_rqidx; + pi->first_nm_txq = nm_tqidx; + if (is_10G_port(pi) || is_40G_port(pi)) { + pi->nnmrxq = iaq.nnmrxq10g; + pi->nnmtxq = iaq.nnmtxq10g; + } else { + pi->nnmrxq = iaq.nnmrxq1g; + pi->nnmtxq = iaq.nnmtxq1g; + } + nm_rqidx += pi->nnmrxq; + nm_tqidx += pi->nnmtxq; #endif } @@ -886,7 +930,7 @@ t4_detach(device_t dev) for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { - t4_free_vi(pi->adapter, sc->mbox, sc->pf, 0, pi->viid); + t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->viid); if (pi->dev) device_delete_child(dev, pi->dev); @@ -922,6 +966,10 @@ t4_detach(device_t dev) #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); +#endif +#ifdef DEV_NETMAP + free(sc->sge.nm_rxq, M_CXGBE); + free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); @@ -950,7 +998,6 @@ t4_detach(device_t dev) return (0); } - static int cxgbe_probe(device_t dev) { @@ -973,6 +1020,8 @@ cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct ifnet *ifp; + char *s; + int n, o; /* Allocate an ifnet and set it up */ ifp = if_alloc(IFT_ETHER); @@ -1005,22 +1054,39 @@ cxgbe_attach(device_t dev) /* Initialize ifmedia for this port */ ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, 
cxgbe_media_status); - build_medialist(pi); + build_medialist(pi, &pi->media); pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, EVENTHANDLER_PRI_ANY); ether_ifattach(ifp, pi->hw_addr); + n = 128; + s = malloc(n, M_CXGBE, M_WAITOK); + o = snprintf(s, n, "%d txq, %d rxq (NIC)", pi->ntxq, pi->nrxq); + MPASS(n > o); #ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { - device_printf(dev, - "%d txq, %d rxq (NIC); %d txq, %d rxq (TOE)\n", - pi->ntxq, pi->nrxq, pi->nofldtxq, pi->nofldrxq); - } else + o += snprintf(s + o, n - o, "; %d txq, %d rxq (TOE)", + pi->nofldtxq, pi->nofldrxq); + MPASS(n > o); + } +#endif +#ifdef DEV_NETMAP + o += snprintf(s + o, n - o, "; %d txq, %d rxq (netmap)", pi->nnmtxq, + pi->nnmrxq); + MPASS(n > o); #endif - device_printf(dev, "%d txq, %d rxq\n", pi->ntxq, pi->nrxq); + device_printf(dev, "%s\n", s); + free(s, M_CXGBE); +#ifdef DEV_NETMAP + /* nm_media handled here to keep implementation private to this file */ + ifmedia_init(&pi->nm_media, IFM_IMASK, cxgbe_media_change, + cxgbe_media_status); + build_medialist(pi, &pi->nm_media); + create_netmap_ifnet(pi); /* logs errors it something fails */ +#endif cxgbe_sysctls(pi); return (0); @@ -1068,6 +1134,11 @@ cxgbe_detach(device_t dev) ether_ifdetach(pi->ifp); if_free(pi->ifp); +#ifdef DEV_NETMAP + /* XXXNM: equivalent of cxgbe_uninit_synchronized to ifdown nm_ifp */ + destroy_netmap_ifnet(pi); +#endif + ADAPTER_LOCK(sc); CLR_BUSY(sc); wakeup(&sc->flags); @@ -1091,7 +1162,7 @@ cxgbe_init(void *arg) static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { - int rc = 0, mtu, flags; + int rc = 0, mtu, flags, can_sleep; struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct ifreq *ifr = (struct ifreq *)data; @@ -1110,13 +1181,16 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) if (pi->flags & PORT_INIT_DONE) { t4_update_fl_bufsize(ifp); if (ifp->if_drv_flags & IFF_DRV_RUNNING) - rc = update_mac_settings(pi, XGMAC_MTU); + rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: - rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4flg"); + can_sleep = 0; +redo_sifflags: + rc = begin_synchronized_op(sc, pi, + can_sleep ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4flg"); if (rc) return (rc); @@ -1125,24 +1199,41 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) flags = pi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { - rc = update_mac_settings(pi, + if (can_sleep == 1) { + end_synchronized_op(sc, 0); + can_sleep = 0; + goto redo_sifflags; + } + rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } - } else + } else { + if (can_sleep == 0) { + end_synchronized_op(sc, LOCK_HELD); + can_sleep = 1; + goto redo_sifflags; + } rc = cxgbe_init_synchronized(pi); + } pi->if_flags = ifp->if_flags; - } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) + } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (can_sleep == 0) { + end_synchronized_op(sc, LOCK_HELD); + can_sleep = 1; + goto redo_sifflags; + } rc = cxgbe_uninit_synchronized(pi); - end_synchronized_op(sc, 0); + } + end_synchronized_op(sc, can_sleep ? 
0 : LOCK_HELD); break; - case SIOCADDMULTI: + case SIOCADDMULTI: case SIOCDELMULTI: /* these two are called with a mutex held :-( */ rc = begin_synchronized_op(sc, pi, HOLD_LOCK, "t4multi"); if (rc) return (rc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) - rc = update_mac_settings(pi, XGMAC_MCADDRS); + rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, LOCK_HELD); break; @@ -1231,7 +1322,7 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) - rc = update_mac_settings(pi, XGMAC_VLANEX); + rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; @@ -1366,13 +1457,23 @@ static void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct port_info *pi = ifp->if_softc; - struct ifmedia_entry *cur = pi->media.ifm_cur; + struct ifmedia *media = NULL; + struct ifmedia_entry *cur; int speed = pi->link_cfg.speed; int data = (pi->port_type << 8) | pi->mod_type; + if (ifp == pi->ifp) + media = &pi->media; +#ifdef DEV_NETMAP + else if (ifp == pi->nm_ifp) + media = &pi->nm_media; +#endif + MPASS(media != NULL); + + cur = media->ifm_cur; if (cur->ifm_data != data) { - build_medialist(pi); - cur = pi->media.ifm_cur; + build_medialist(pi, media); + cur = media->ifm_cur; } ifmr->ifm_status = IFM_AVALID; @@ -1725,6 +1826,7 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, { int rc, itype, navail, nrxq10g, nrxq1g, n; int nofldrxq10g = 0, nofldrxq1g = 0; + int nnmrxq10g = 0, nnmrxq1g = 0; bzero(iaq, sizeof(*iaq)); @@ -1741,6 +1843,12 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, iaq->nofldrxq1g = nofldrxq1g = t4_nofldrxq1g; } #endif +#ifdef DEV_NETMAP + iaq->nnmtxq10g = t4_nnmtxq10g; + iaq->nnmtxq1g = t4_nnmtxq1g; + iaq->nnmrxq10g = nnmrxq10g = t4_nnmrxq10g; + iaq->nnmrxq1g = nnmrxq1g = t4_nnmrxq1g; +#endif for (itype = INTR_MSIX; itype; itype >>= 1) { @@ -1758,30 +1866,60 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, continue; iaq->intr_type = itype; - iaq->intr_flags = 0; + iaq->intr_flags_10g = 0; + iaq->intr_flags_1g = 0; /* * Best option: an interrupt vector for errors, one for the - * firmware event queue, and one each for each rxq (NIC as well - * as offload). + * firmware event queue, and one for every rxq (NIC, TOE, and + * netmap). */ iaq->nirq = T4_EXTRA_INTR; - iaq->nirq += n10g * (nrxq10g + nofldrxq10g); - iaq->nirq += n1g * (nrxq1g + nofldrxq1g); + iaq->nirq += n10g * (nrxq10g + nofldrxq10g + nnmrxq10g); + iaq->nirq += n1g * (nrxq1g + nofldrxq1g + nnmrxq1g); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { - iaq->intr_flags |= INTR_DIRECT; + iaq->intr_flags_10g = INTR_ALL; + iaq->intr_flags_1g = INTR_ALL; goto allocate; } /* - * Second best option: an interrupt vector for errors, one for - * the firmware event queue, and one each for either NIC or - * offload rxq's. + * Second best option: a vector for errors, one for the firmware + * event queue, and vectors for either all the NIC rx queues or + * all the TOE rx queues. The queues that don't get vectors + * will forward their interrupts to those that do. + * + * Note: netmap rx queues cannot be created early and so they + * can't be setup to receive forwarded interrupts for others. 
*/ iaq->nirq = T4_EXTRA_INTR; - iaq->nirq += n10g * max(nrxq10g, nofldrxq10g); - iaq->nirq += n1g * max(nrxq1g, nofldrxq1g); + if (nrxq10g >= nofldrxq10g) { + iaq->intr_flags_10g = INTR_RXQ; + iaq->nirq += n10g * nrxq10g; +#ifdef DEV_NETMAP + iaq->nnmrxq10g = min(nnmrxq10g, nrxq10g); +#endif + } else { + iaq->intr_flags_10g = INTR_OFLD_RXQ; + iaq->nirq += n10g * nofldrxq10g; +#ifdef DEV_NETMAP + iaq->nnmrxq10g = min(nnmrxq10g, nofldrxq10g); +#endif + } + if (nrxq1g >= nofldrxq1g) { + iaq->intr_flags_1g = INTR_RXQ; + iaq->nirq += n1g * nrxq1g; +#ifdef DEV_NETMAP + iaq->nnmrxq1g = min(nnmrxq1g, nrxq1g); +#endif + } else { + iaq->intr_flags_1g = INTR_OFLD_RXQ; + iaq->nirq += n1g * nofldrxq1g; +#ifdef DEV_NETMAP + iaq->nnmrxq1g = min(nnmrxq1g, nofldrxq1g); +#endif + } if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) goto allocate; @@ -1789,8 +1927,8 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, /* * Next best option: an interrupt vector for errors, one for the * firmware event queue, and at least one per port. At this - * point we know we'll have to downsize nrxq or nofldrxq to fit - * what's available to us. + * point we know we'll have to downsize nrxq and/or nofldrxq + * and/or nnmrxq to fit what's available to us. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g + n1g; @@ -1800,6 +1938,9 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, if (n10g > 0) { int target = max(nrxq10g, nofldrxq10g); + iaq->intr_flags_10g = nrxq10g >= nofldrxq10g ? + INTR_RXQ : INTR_OFLD_RXQ; + n = 1; while (n < target && leftover >= n10g) { leftover -= n10g; @@ -1808,14 +1949,19 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, } iaq->nrxq10g = min(n, nrxq10g); #ifdef TCP_OFFLOAD - if (is_offload(sc)) - iaq->nofldrxq10g = min(n, nofldrxq10g); + iaq->nofldrxq10g = min(n, nofldrxq10g); +#endif +#ifdef DEV_NETMAP + iaq->nnmrxq10g = min(n, nnmrxq10g); #endif } if (n1g > 0) { int target = max(nrxq1g, nofldrxq1g); + iaq->intr_flags_1g = nrxq1g >= nofldrxq1g ? + INTR_RXQ : INTR_OFLD_RXQ; + n = 1; while (n < target && leftover >= n1g) { leftover -= n1g; @@ -1824,8 +1970,10 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, } iaq->nrxq1g = min(n, nrxq1g); #ifdef TCP_OFFLOAD - if (is_offload(sc)) - iaq->nofldrxq1g = min(n, nofldrxq1g); + iaq->nofldrxq1g = min(n, nofldrxq1g); +#endif +#ifdef DEV_NETMAP + iaq->nnmrxq1g = min(n, nnmrxq1g); #endif } @@ -1837,10 +1985,14 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; + iaq->intr_flags_10g = iaq->intr_flags_1g = 0; #ifdef TCP_OFFLOAD if (is_offload(sc)) iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif +#ifdef DEV_NETMAP + iaq->nnmrxq10g = iaq->nnmrxq1g = 1; +#endif allocate: navail = iaq->nirq; @@ -2620,9 +2772,8 @@ t4_set_desc(struct adapter *sc) } static void -build_medialist(struct port_info *pi) +build_medialist(struct port_info *pi, struct ifmedia *media) { - struct ifmedia *media = &pi->media; int data, m; PORT_LOCK(pi); @@ -2751,17 +2902,29 @@ build_medialist(struct port_info *pi) * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). 
*/ -static int -update_mac_settings(struct port_info *pi, int flags) +int +update_mac_settings(struct ifnet *ifp, int flags) { - int rc; - struct ifnet *ifp = pi->ifp; + int rc = 0; + struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; + uint16_t viid = 0xffff; + int16_t *xact_addr_filt = NULL; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); + if (ifp == pi->ifp) { + viid = pi->viid; + xact_addr_filt = &pi->xact_addr_filt; + } +#ifdef DEV_NETMAP + else if (ifp == pi->nm_ifp) { + viid = pi->nm_viid; + xact_addr_filt = &pi->nm_xact_addr_filt; + } +#endif if (flags & XGMAC_MTU) mtu = ifp->if_mtu; @@ -2774,25 +2937,28 @@ update_mac_settings(struct port_info *pi, int flags) if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; - rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, mtu, promisc, allmulti, 1, - vlanex, false); - if (rc) { - if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); - return (rc); + if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { + rc = -t4_set_rxmode(sc, sc->mbox, viid, mtu, promisc, allmulti, + 1, vlanex, false); + if (rc) { + if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, + rc); + return (rc); + } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); - rc = t4_change_mac(sc, sc->mbox, pi->viid, pi->xact_addr_filt, - ucaddr, true, true); + rc = t4_change_mac(sc, sc->mbox, viid, *xact_addr_filt, ucaddr, + true, true); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { - pi->xact_addr_filt = rc; + *xact_addr_filt = rc; rc = 0; } } @@ -2812,8 +2978,8 @@ update_mac_settings(struct port_info *pi, int flags) LLADDR((struct sockaddr_dl *)ifma->ifma_addr); if (i == FW_MAC_EXACT_CHUNK) { - rc = t4_alloc_mac_filt(sc, sc->mbox, pi->viid, - del, i, mcaddr, NULL, &hash, 0); + rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, + i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { @@ -2833,8 +2999,8 @@ update_mac_settings(struct port_info *pi, int flags) } } if (i > 0) { - rc = t4_alloc_mac_filt(sc, sc->mbox, pi->viid, - del, i, mcaddr, NULL, &hash, 0); + rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, i, + mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { @@ -2851,7 +3017,7 @@ update_mac_settings(struct port_info *pi, int flags) } } - rc = -t4_set_addr_hash(sc, sc->mbox, pi->viid, 0, hash, 0); + rc = -t4_set_addr_hash(sc, sc->mbox, viid, 0, hash, 0); if (rc != 0) if_printf(ifp, "failed to set mc address hash: %d", rc); mcfail: @@ -2954,16 +3120,10 @@ cxgbe_init_synchronized(struct port_info *pi) ((rc = port_full_init(pi)) != 0)) return (rc); /* error message displayed already */ - rc = update_mac_settings(pi, XGMAC_ALL); + rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ - rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, &pi->link_cfg); - if (rc != 0) { - if_printf(ifp, "start_link failed: %d\n", rc); - goto done; - } - rc = -t4_enable_vi(sc, sc->mbox, pi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); @@ -3048,61 +3208,41 @@ setup_intr_handlers(struct adapter *sc) #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif +#ifdef DEV_NETMAP + struct sge_nm_rxq *nm_rxq; +#endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 
0 : 1; - if (sc->intr_count == 1) { - KASSERT(!(sc->flags & INTR_DIRECT), - ("%s: single interrupt && INTR_DIRECT?", __func__)); + if (sc->intr_count == 1) + return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); - rc = t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all"); - if (rc != 0) - return (rc); - } else { - /* Multiple interrupts. */ - KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, - ("%s: too few intr.", __func__)); + /* Multiple interrupts. */ + KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, + ("%s: too few intr.", __func__)); - /* The first one is always error intr */ - rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); - if (rc != 0) - return (rc); - irq++; - rid++; + /* The first one is always error intr */ + rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); + if (rc != 0) + return (rc); + irq++; + rid++; - /* The second one is always the firmware event queue */ - rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sc->sge.fwq, - "evt"); - if (rc != 0) - return (rc); - irq++; - rid++; + /* The second one is always the firmware event queue */ + rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sc->sge.fwq, "evt"); + if (rc != 0) + return (rc); + irq++; + rid++; - /* - * Note that if INTR_DIRECT is not set then either the NIC rx - * queues or (exclusive or) the TOE rx queueus will be taking - * direct interrupts. - * - * There is no need to check for is_offload(sc) as nofldrxq - * will be 0 if offload is disabled. - */ - for_each_port(sc, p) { - pi = sc->port[p]; + for_each_port(sc, p) { + pi = sc->port[p]; -#ifdef TCP_OFFLOAD - /* - * Skip over the NIC queues if they aren't taking direct - * interrupts. - */ - if (!(sc->flags & INTR_DIRECT) && - pi->nofldrxq > pi->nrxq) - goto ofld_queues; -#endif - rxq = &sc->sge.rxq[pi->first_rxq]; - for (q = 0; q < pi->nrxq; q++, rxq++) { + if (pi->flags & INTR_RXQ) { + for_each_rxq(pi, q, rxq) { snprintf(s, sizeof(s), "%d.%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); @@ -3111,17 +3251,10 @@ setup_intr_handlers(struct adapter *sc) irq++; rid++; } - + } #ifdef TCP_OFFLOAD - /* - * Skip over the offload queues if they aren't taking - * direct interrupts. - */ - if (!(sc->flags & INTR_DIRECT)) - continue; -ofld_queues: - ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq]; - for (q = 0; q < pi->nofldrxq; q++, ofld_rxq++) { + if (pi->flags & INTR_OFLD_RXQ) { + for_each_ofld_rxq(pi, q, ofld_rxq) { snprintf(s, sizeof(s), "%d,%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); @@ -3130,14 +3263,28 @@ setup_intr_handlers(struct adapter *sc) irq++; rid++; } + } #endif +#ifdef DEV_NETMAP + if (pi->flags & INTR_NM_RXQ) { + for_each_nm_rxq(pi, q, nm_rxq) { + snprintf(s, sizeof(s), "%d-%d", p, q); + rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr, + nm_rxq, s); + if (rc != 0) + return (rc); + irq++; + rid++; + } } +#endif } + MPASS(irq == &sc->irq[sc->intr_count]); return (0); } -static int +int adapter_full_init(struct adapter *sc) { int rc, i; @@ -3175,7 +3322,7 @@ adapter_full_init(struct adapter *sc) return (rc); } -static int +int adapter_full_uninit(struct adapter *sc) { int i; @@ -3194,7 +3341,7 @@ adapter_full_uninit(struct adapter *sc) return (0); } -static int +int port_full_init(struct port_info *pi) { struct adapter *sc = pi->adapter; @@ -3248,7 +3395,7 @@ port_full_init(struct port_info *pi) /* * Idempotent. 
*/ -static int +int port_full_uninit(struct port_info *pi) { struct adapter *sc = pi->adapter; @@ -4581,6 +4728,18 @@ cxgbe_sysctls(struct port_info *pi) "index of first TOE tx queue"); } #endif +#ifdef DEV_NETMAP + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, + &pi->nnmrxq, 0, "# of rx queues for netmap"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, + &pi->nnmtxq, 0, "# of tx queues for netmap"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", + CTLFLAG_RD, &pi->first_nm_rxq, 0, + "index of first netmap rx queue"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", + CTLFLAG_RD, &pi->first_nm_txq, 0, + "index of first netmap tx queue"); +#endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_tmr_idx, "I", @@ -7442,7 +7601,7 @@ set_sched_class(struct adapter *sc, struct t4_sched_params *p) } /* And pass the request to the firmware ...*/ - rc = -t4_sched_config(sc, fw_type, p->u.config.minmax); + rc = -t4_sched_config(sc, fw_type, p->u.config.minmax, 1); goto done; } @@ -7540,7 +7699,7 @@ set_sched_class(struct adapter *sc, struct t4_sched_params *p) rc = -t4_sched_params(sc, fw_type, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->u.params.channel, p->u.params.cl, p->u.params.minrate, p->u.params.maxrate, - p->u.params.weight, p->u.params.pktsize); + p->u.params.weight, p->u.params.pktsize, 1); goto done; } @@ -7879,6 +8038,19 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, } #ifdef TCP_OFFLOAD +void +t4_iscsi_init(struct ifnet *ifp, unsigned int tag_mask, + const unsigned int *pgsz_order) +{ + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + + t4_write_reg(sc, A_ULP_RX_ISCSI_TAGMASK, tag_mask); + t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, V_HPZ0(pgsz_order[0]) | + V_HPZ1(pgsz_order[1]) | V_HPZ2(pgsz_order[2]) | + V_HPZ3(pgsz_order[3])); +} + static int toe_capability(struct port_info *pi, int enable) { @@ -8067,6 +8239,20 @@ tweak_tunables(void) t4_toecaps_allowed = 0; #endif +#ifdef DEV_NETMAP + if (t4_nnmtxq10g < 1) + t4_nnmtxq10g = min(nc, NNMTXQ_10G); + + if (t4_nnmtxq1g < 1) + t4_nnmtxq1g = min(nc, NNMTXQ_1G); + + if (t4_nnmrxq10g < 1) + t4_nnmrxq10g = min(nc, NNMRXQ_10G); + + if (t4_nnmrxq1g < 1) + t4_nnmrxq1g = min(nc, NNMRXQ_1G); +#endif + if (t4_tmr_idx_10g < 0 || t4_tmr_idx_10g >= SGE_NTIMERS) t4_tmr_idx_10g = TMR_IDX_10G; diff --git a/sys/dev/cxgbe/t4_netmap.c b/sys/dev/cxgbe/t4_netmap.c new file mode 100644 index 000000000..89aef3240 --- /dev/null +++ b/sys/dev/cxgbe/t4_netmap.c @@ -0,0 +1,1138 @@ +/*- + * Copyright (c) 2014 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#ifdef DEV_NETMAP +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/common.h" +#include "common/t4_regs.h" +#include "common/t4_regs_values.h" + +extern int fl_pad; /* XXXNM */ +extern int spg_len; /* XXXNM */ +extern int fl_pktshift; /* XXXNM */ + +/* netmap ifnet routines */ +static void cxgbe_nm_init(void *); +static int cxgbe_nm_ioctl(struct ifnet *, unsigned long, caddr_t); +static int cxgbe_nm_transmit(struct ifnet *, struct mbuf *); +static void cxgbe_nm_qflush(struct ifnet *); + +static int cxgbe_nm_init_synchronized(struct port_info *); +static int cxgbe_nm_uninit_synchronized(struct port_info *); + +static void +cxgbe_nm_init(void *arg) +{ + struct port_info *pi = arg; + struct adapter *sc = pi->adapter; + + if (begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4nminit") != 0) + return; + cxgbe_nm_init_synchronized(pi); + end_synchronized_op(sc, 0); + + return; +} + +static int +cxgbe_nm_init_synchronized(struct port_info *pi) +{ + struct adapter *sc = pi->adapter; + struct ifnet *ifp = pi->nm_ifp; + int rc = 0; + + ASSERT_SYNCHRONIZED_OP(sc); + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return (0); /* already running */ + + if (!(sc->flags & FULL_INIT_DONE) && + ((rc = adapter_full_init(sc)) != 0)) + return (rc); /* error message displayed already */ + + if (!(pi->flags & PORT_INIT_DONE) && + ((rc = port_full_init(pi)) != 0)) + return (rc); /* error message displayed already */ + + rc = update_mac_settings(ifp, XGMAC_ALL); + if (rc) + return (rc); /* error message displayed already */ + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + + return (rc); +} + +static int +cxgbe_nm_uninit_synchronized(struct port_info *pi) +{ +#ifdef INVARIANTS + struct adapter *sc = pi->adapter; +#endif + struct ifnet *ifp = pi->nm_ifp; + + ASSERT_SYNCHRONIZED_OP(sc); + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + return (0); +} + +static int +cxgbe_nm_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) +{ + int rc = 0, mtu, flags; + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + struct ifreq *ifr = (struct ifreq *)data; + uint32_t mask; + + MPASS(pi->nm_ifp == ifp); + + switch (cmd) { + case SIOCSIFMTU: + mtu = ifr->ifr_mtu; + if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) + return (EINVAL); + + rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4nmtu"); + if (rc) + return (rc); + ifp->if_mtu = mtu; + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + rc = update_mac_settings(ifp, XGMAC_MTU); + end_synchronized_op(sc, 0); + break; + + case SIOCSIFFLAGS: + rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4nflg"); + if (rc) + return (rc); + + if (ifp->if_flags & IFF_UP) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + flags = pi->nmif_flags; + if ((ifp->if_flags ^ flags) & + (IFF_PROMISC | IFF_ALLMULTI)) { + rc 
= update_mac_settings(ifp, + XGMAC_PROMISC | XGMAC_ALLMULTI); + } + } else + rc = cxgbe_nm_init_synchronized(pi); + pi->nmif_flags = ifp->if_flags; + } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) + rc = cxgbe_nm_uninit_synchronized(pi); + end_synchronized_op(sc, 0); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: /* these two are called with a mutex held :-( */ + rc = begin_synchronized_op(sc, pi, HOLD_LOCK, "t4nmulti"); + if (rc) + return (rc); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + rc = update_mac_settings(ifp, XGMAC_MCADDRS); + end_synchronized_op(sc, LOCK_HELD); + break; + + case SIOCSIFCAP: + mask = ifr->ifr_reqcap ^ ifp->if_capenable; + if (mask & IFCAP_TXCSUM) { + ifp->if_capenable ^= IFCAP_TXCSUM; + ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); + } + if (mask & IFCAP_TXCSUM_IPV6) { + ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; + ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + } + if (mask & IFCAP_RXCSUM) + ifp->if_capenable ^= IFCAP_RXCSUM; + if (mask & IFCAP_RXCSUM_IPV6) + ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + break; + + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + ifmedia_ioctl(ifp, ifr, &pi->nm_media, cmd); + break; + + default: + rc = ether_ioctl(ifp, cmd, data); + } + + return (rc); +} + +static int +cxgbe_nm_transmit(struct ifnet *ifp, struct mbuf *m) +{ + + m_freem(m); + return (0); +} + +static void +cxgbe_nm_qflush(struct ifnet *ifp) +{ + + return; +} + +static int +alloc_nm_rxq_hwq(struct port_info *pi, struct sge_nm_rxq *nm_rxq) +{ + int rc, cntxt_id; + __be32 v; + struct adapter *sc = pi->adapter; + struct netmap_adapter *na = NA(pi->nm_ifp); + struct fw_iq_cmd c; + + MPASS(na != NULL); + MPASS(nm_rxq->iq_desc != NULL); + MPASS(nm_rxq->fl_desc != NULL); + + bzero(nm_rxq->iq_desc, pi->qsize_rxq * IQ_ESIZE); + bzero(nm_rxq->fl_desc, na->num_rx_desc * EQ_ESIZE + spg_len); + + bzero(&c, sizeof(c)); + c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | + F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | + V_FW_IQ_CMD_VFN(0)); + c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | + FW_LEN16(c)); + if (pi->flags & INTR_NM_RXQ) { + KASSERT(nm_rxq->intr_idx < sc->intr_count, + ("%s: invalid direct intr_idx %d", __func__, + nm_rxq->intr_idx)); + v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx); + } else { + CXGBE_UNIMPLEMENTED(__func__); /* XXXNM: needs review */ + v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx) | + F_FW_IQ_CMD_IQANDST; + } + c.type_to_iqandstindex = htobe32(v | + V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | + V_FW_IQ_CMD_VIID(pi->nm_viid) | + V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); + c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | + F_FW_IQ_CMD_IQGTSMODE | + V_FW_IQ_CMD_IQINTCNTTHRESH(0) | + V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); + c.iqsize = htobe16(pi->qsize_rxq); + c.iqaddr = htobe64(nm_rxq->iq_ba); + c.iqns_to_fl0congen |= + htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | + F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | + (fl_pad ? 
F_FW_IQ_CMD_FL0PADEN : 0)); + c.fl0dcaen_to_fl0cidxfthresh = + htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) | + V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B)); + c.fl0size = htobe16(na->num_rx_desc + spg_len / EQ_ESIZE); + c.fl0addr = htobe64(nm_rxq->fl_ba); + + rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); + if (rc != 0) { + device_printf(sc->dev, + "failed to create netmap ingress queue: %d\n", rc); + return (rc); + } + + nm_rxq->iq_cidx = 0; + MPASS(nm_rxq->iq_sidx == pi->qsize_rxq - spg_len / IQ_ESIZE); + nm_rxq->iq_gen = F_RSPD_GEN; + nm_rxq->iq_cntxt_id = be16toh(c.iqid); + nm_rxq->iq_abs_id = be16toh(c.physiqid); + cntxt_id = nm_rxq->iq_cntxt_id - sc->sge.iq_start; + if (cntxt_id >= sc->sge.niq) { + panic ("%s: nm_rxq->iq_cntxt_id (%d) more than the max (%d)", + __func__, cntxt_id, sc->sge.niq - 1); + } + sc->sge.iqmap[cntxt_id] = (void *)nm_rxq; + + nm_rxq->fl_cntxt_id = be16toh(c.fl0id); + nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; + MPASS(nm_rxq->fl_sidx == na->num_rx_desc); + cntxt_id = nm_rxq->fl_cntxt_id - sc->sge.eq_start; + if (cntxt_id >= sc->sge.neq) { + panic("%s: nm_rxq->fl_cntxt_id (%d) more than the max (%d)", + __func__, cntxt_id, sc->sge.neq - 1); + } + sc->sge.eqmap[cntxt_id] = (void *)nm_rxq; + + nm_rxq->fl_db_val = F_DBPRIO | V_QID(nm_rxq->fl_cntxt_id) | V_PIDX(0); + if (is_t5(sc)) + nm_rxq->fl_db_val |= F_DBTYPE; + + t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(F_QINTR_CNT_EN) | + V_INGRESSQID(nm_rxq->iq_cntxt_id)); + + return (rc); +} + +static int +free_nm_rxq_hwq(struct port_info *pi, struct sge_nm_rxq *nm_rxq) +{ + struct adapter *sc = pi->adapter; + int rc; + + rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, + nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, 0xffff); + if (rc != 0) + device_printf(sc->dev, "%s: failed for iq %d, fl %d: %d\n", + __func__, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, rc); + return (rc); +} + +static int +alloc_nm_txq_hwq(struct port_info *pi, struct sge_nm_txq *nm_txq) +{ + int rc, cntxt_id; + size_t len; + struct adapter *sc = pi->adapter; + struct netmap_adapter *na = NA(pi->nm_ifp); + struct fw_eq_eth_cmd c; + + MPASS(na != NULL); + MPASS(nm_txq->desc != NULL); + + len = na->num_tx_desc * EQ_ESIZE + spg_len; + bzero(nm_txq->desc, len); + + bzero(&c, sizeof(c)); + c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | + F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | + V_FW_EQ_ETH_CMD_VFN(0)); + c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | + F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); + c.autoequiqe_to_viid = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->nm_viid)); + c.fetchszm_to_iqid = + htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | + V_FW_EQ_ETH_CMD_PCIECHN(pi->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | + V_FW_EQ_ETH_CMD_IQID(sc->sge.nm_rxq[nm_txq->iqidx].iq_cntxt_id)); + c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | + V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | + V_FW_EQ_ETH_CMD_EQSIZE(len / EQ_ESIZE)); + c.eqaddr = htobe64(nm_txq->ba); + + rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); + if (rc != 0) { + device_printf(pi->dev, + "failed to create netmap egress queue: %d\n", rc); + return (rc); + } + + nm_txq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); + cntxt_id = nm_txq->cntxt_id - sc->sge.eq_start; + if (cntxt_id >= sc->sge.neq) + panic("%s: nm_txq->cntxt_id (%d) more than the max (%d)", __func__, + cntxt_id, sc->sge.neq - 1); + sc->sge.eqmap[cntxt_id] = (void *)nm_txq; + + nm_txq->pidx = nm_txq->cidx = 0; + MPASS(nm_txq->sidx == 
na->num_tx_desc); + nm_txq->equiqidx = nm_txq-> equeqidx = nm_txq->dbidx = 0; + + nm_txq->doorbells = sc->doorbells; + if (isset(&nm_txq->doorbells, DOORBELL_UDB) || + isset(&nm_txq->doorbells, DOORBELL_UDBWC) || + isset(&nm_txq->doorbells, DOORBELL_WCWR)) { + uint32_t s_qpp = sc->sge.eq_s_qpp; + uint32_t mask = (1 << s_qpp) - 1; + volatile uint8_t *udb; + + udb = sc->udbs_base + UDBS_DB_OFFSET; + udb += (nm_txq->cntxt_id >> s_qpp) << PAGE_SHIFT; + nm_txq->udb_qid = nm_txq->cntxt_id & mask; + if (nm_txq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) + clrbit(&nm_txq->doorbells, DOORBELL_WCWR); + else { + udb += nm_txq->udb_qid << UDBS_SEG_SHIFT; + nm_txq->udb_qid = 0; + } + nm_txq->udb = (volatile void *)udb; + } + + return (rc); +} + +static int +free_nm_txq_hwq(struct port_info *pi, struct sge_nm_txq *nm_txq) +{ + struct adapter *sc = pi->adapter; + int rc; + + rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, nm_txq->cntxt_id); + if (rc != 0) + device_printf(sc->dev, "%s: failed for eq %d: %d\n", __func__, + nm_txq->cntxt_id, rc); + return (rc); +} + +static int +cxgbe_netmap_on(struct adapter *sc, struct port_info *pi, struct ifnet *ifp, + struct netmap_adapter *na) +{ + struct netmap_slot *slot; + struct sge_nm_rxq *nm_rxq; + struct sge_nm_txq *nm_txq; + int rc, i, j, hwidx; + struct hw_buf_info *hwb; + uint16_t *rss; + + ASSERT_SYNCHRONIZED_OP(sc); + + if ((pi->flags & PORT_INIT_DONE) == 0 || + (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return (EAGAIN); + + hwb = &sc->sge.hw_buf_info[0]; + for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { + if (hwb->size == NETMAP_BUF_SIZE(na)) + break; + } + if (i >= SGE_FLBUF_SIZES) { + if_printf(ifp, "no hwidx for netmap buffer size %d.\n", + NETMAP_BUF_SIZE(na)); + return (ENXIO); + } + hwidx = i; + + /* Must set caps before calling netmap_reset */ + nm_set_native_flags(na); + + for_each_nm_rxq(pi, i, nm_rxq) { + alloc_nm_rxq_hwq(pi, nm_rxq); + nm_rxq->fl_hwidx = hwidx; + slot = netmap_reset(na, NR_RX, i, 0); + MPASS(slot != NULL); /* XXXNM: error check, not assert */ + + /* We deal with 8 bufs at a time */ + MPASS((na->num_rx_desc & 7) == 0); + MPASS(na->num_rx_desc == nm_rxq->fl_sidx); + for (j = 0; j < nm_rxq->fl_sidx - 8; j++) { + uint64_t ba; + + PNMB(na, &slot[j], &ba); + nm_rxq->fl_desc[j] = htobe64(ba | hwidx); + } + nm_rxq->fl_pidx = j; + MPASS((j & 7) == 0); + j /= 8; /* driver pidx to hardware pidx */ + wmb(); + t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + nm_rxq->fl_db_val | V_PIDX(j)); + } + + for_each_nm_txq(pi, i, nm_txq) { + alloc_nm_txq_hwq(pi, nm_txq); + slot = netmap_reset(na, NR_TX, i, 0); + MPASS(slot != NULL); /* XXXNM: error check, not assert */ + } + + rss = malloc(pi->nm_rss_size * sizeof (*rss), M_CXGBE, M_ZERO | + M_WAITOK); + for (i = 0; i < pi->nm_rss_size;) { + for_each_nm_rxq(pi, j, nm_rxq) { + rss[i++] = nm_rxq->iq_abs_id; + if (i == pi->nm_rss_size) + break; + } + } + rc = -t4_config_rss_range(sc, sc->mbox, pi->nm_viid, 0, pi->nm_rss_size, + rss, pi->nm_rss_size); + if (rc != 0) + if_printf(ifp, "netmap rss_config failed: %d\n", rc); + free(rss, M_CXGBE); + + rc = -t4_enable_vi(sc, sc->mbox, pi->nm_viid, true, true); + if (rc != 0) + if_printf(ifp, "netmap enable_vi failed: %d\n", rc); + + return (rc); +} + +static int +cxgbe_netmap_off(struct adapter *sc, struct port_info *pi, struct ifnet *ifp, + struct netmap_adapter *na) +{ + int rc, i; + struct sge_nm_txq *nm_txq; + struct sge_nm_rxq *nm_rxq; + + ASSERT_SYNCHRONIZED_OP(sc); + + rc = -t4_enable_vi(sc, sc->mbox, pi->nm_viid, false, false); + if (rc != 0) + 
if_printf(ifp, "netmap disable_vi failed: %d\n", rc); + nm_clear_native_flags(na); + + /* + * XXXNM: We need to make sure that the tx queues are quiet and won't + * request any more SGE_EGR_UPDATEs. + */ + + for_each_nm_txq(pi, i, nm_txq) { + free_nm_txq_hwq(pi, nm_txq); + } + for_each_nm_rxq(pi, i, nm_rxq) { + free_nm_rxq_hwq(pi, nm_rxq); + } + + return (rc); +} + +static int +cxgbe_netmap_reg(struct netmap_adapter *na, int on) +{ + struct ifnet *ifp = na->ifp; + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + int rc; + + rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4nmreg"); + if (rc != 0) + return (rc); + if (on) + rc = cxgbe_netmap_on(sc, pi, ifp, na); + else + rc = cxgbe_netmap_off(sc, pi, ifp, na); + end_synchronized_op(sc, 0); + + return (rc); +} + +/* How many packets can a single type1 WR carry in n descriptors */ +static inline int +ndesc_to_npkt(const int n) +{ + + MPASS(n > 0 && n <= SGE_MAX_WR_NDESC); + + return (n * 2 - 1); +} +#define MAX_NPKT_IN_TYPE1_WR (ndesc_to_npkt(SGE_MAX_WR_NDESC)) + +/* Space (in descriptors) needed for a type1 WR that carries n packets */ +static inline int +npkt_to_ndesc(const int n) +{ + + MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR); + + return ((n + 2) / 2); +} + +/* Space (in 16B units) needed for a type1 WR that carries n packets */ +static inline int +npkt_to_len16(const int n) +{ + + MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR); + + return (n * 2 + 1); +} + +#define NMIDXDIFF(q, idx) IDXDIFF((q)->pidx, (q)->idx, (q)->sidx) + +static void +ring_nm_txq_db(struct adapter *sc, struct sge_nm_txq *nm_txq) +{ + int n; + u_int db = nm_txq->doorbells; + + MPASS(nm_txq->pidx != nm_txq->dbidx); + + n = NMIDXDIFF(nm_txq, dbidx); + if (n > 1) + clrbit(&db, DOORBELL_WCWR); + wmb(); + + switch (ffs(db) - 1) { + case DOORBELL_UDB: + *nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n)); + break; + + case DOORBELL_WCWR: { + volatile uint64_t *dst, *src; + + /* + * Queues whose 128B doorbell segment fits in the page do not + * use relative qid (udb_qid is always 0). Only queues with + * doorbell segments can do WCWR. + */ + KASSERT(nm_txq->udb_qid == 0 && n == 1, + ("%s: inappropriate doorbell (0x%x, %d, %d) for nm_txq %p", + __func__, nm_txq->doorbells, n, nm_txq->pidx, nm_txq)); + + dst = (volatile void *)((uintptr_t)nm_txq->udb + + UDBS_WR_OFFSET - UDBS_DB_OFFSET); + src = (void *)&nm_txq->desc[nm_txq->dbidx]; + while (src != (void *)&nm_txq->desc[nm_txq->dbidx + 1]) + *dst++ = *src++; + wmb(); + break; + } + + case DOORBELL_UDBWC: + *nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n)); + wmb(); + break; + + case DOORBELL_KDB: + t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + V_QID(nm_txq->cntxt_id) | V_PIDX(n)); + break; + } + nm_txq->dbidx = nm_txq->pidx; +} + +int lazy_tx_credit_flush = 1; + +/* + * Write work requests to send 'npkt' frames and ring the doorbell to send them + * on their way. No need to check for wraparound. 
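+ *
+ * A type1 WR is a 16-byte header followed, per frame, by a 16-byte
+ * CPL_TX_PKT and a 16-byte single-segment SGL, packed into 64-byte
+ * hardware descriptors.  That layout is where the helpers above come
+ * from: ndesc_to_npkt(n) = 2n - 1, npkt_to_ndesc(k) = (k + 2) / 2, and
+ * npkt_to_len16(k) = 2k + 1.  For example, assuming the usual 512-byte
+ * WR limit (SGE_MAX_WR_NDESC = 8), a single WR carries at most 15 frames.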
+ */ +static void +cxgbe_nm_tx(struct adapter *sc, struct sge_nm_txq *nm_txq, + struct netmap_kring *kring, int npkt, int npkt_remaining) +{ + struct netmap_ring *ring = kring->ring; + struct netmap_slot *slot; + const u_int lim = kring->nkr_num_slots - 1; + struct fw_eth_tx_pkts_wr *wr = (void *)&nm_txq->desc[nm_txq->pidx]; + uint16_t len; + uint64_t ba; + struct cpl_tx_pkt_core *cpl; + struct ulptx_sgl *usgl; + int i, n; + + while (npkt) { + n = min(npkt, MAX_NPKT_IN_TYPE1_WR); + len = 0; + + wr = (void *)&nm_txq->desc[nm_txq->pidx]; + wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); + wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(npkt_to_len16(n))); + wr->npkt = n; + wr->r3 = 0; + wr->type = 1; + cpl = (void *)(wr + 1); + + for (i = 0; i < n; i++) { + slot = &ring->slot[kring->nr_hwcur]; + PNMB(kring->na, slot, &ba); + + cpl->ctrl0 = nm_txq->cpl_ctrl0; + cpl->pack = 0; + cpl->len = htobe16(slot->len); + /* + * netmap(4) says "netmap does not use features such as + * checksum offloading, TCP segmentation offloading, + * encryption, VLAN encapsulation/decapsulation, etc." + * + * XXXNM: it makes sense to enable checksum offload. + */ + cpl->ctrl1 = htobe64(F_TXPKT_IPCSUM_DIS | + F_TXPKT_L4CSUM_DIS); + + usgl = (void *)(cpl + 1); + usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | + V_ULPTX_NSGE(1)); + usgl->len0 = htobe32(slot->len); + usgl->addr0 = htobe64(ba); + + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + cpl = (void *)(usgl + 1); + MPASS(slot->len + len <= UINT16_MAX); + len += slot->len; + kring->nr_hwcur = nm_next(kring->nr_hwcur, lim); + } + wr->plen = htobe16(len); + + npkt -= n; + nm_txq->pidx += npkt_to_ndesc(n); + MPASS(nm_txq->pidx <= nm_txq->sidx); + if (__predict_false(nm_txq->pidx == nm_txq->sidx)) { + /* + * This routine doesn't know how to write WRs that wrap + * around. Make sure it wasn't asked to. + */ + MPASS(npkt == 0); + nm_txq->pidx = 0; + } + + if (npkt == 0 && npkt_remaining == 0) { + /* All done. */ + if (lazy_tx_credit_flush == 0) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | + F_FW_WR_EQUIQ); + nm_txq->equeqidx = nm_txq->pidx; + nm_txq->equiqidx = nm_txq->pidx; + } + ring_nm_txq_db(sc, nm_txq); + return; + } + + if (NMIDXDIFF(nm_txq, equiqidx) >= nm_txq->sidx / 2) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | + F_FW_WR_EQUIQ); + nm_txq->equeqidx = nm_txq->pidx; + nm_txq->equiqidx = nm_txq->pidx; + } else if (NMIDXDIFF(nm_txq, equeqidx) >= 64) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + nm_txq->equeqidx = nm_txq->pidx; + } + if (NMIDXDIFF(nm_txq, dbidx) >= 2 * SGE_MAX_WR_NDESC) + ring_nm_txq_db(sc, nm_txq); + } + + /* Will get called again. 
*/ + MPASS(npkt_remaining); +} + +/* How many contiguous free descriptors starting at pidx */ +static inline int +contiguous_ndesc_available(struct sge_nm_txq *nm_txq) +{ + + if (nm_txq->cidx > nm_txq->pidx) + return (nm_txq->cidx - nm_txq->pidx - 1); + else if (nm_txq->cidx > 0) + return (nm_txq->sidx - nm_txq->pidx); + else + return (nm_txq->sidx - nm_txq->pidx - 1); +} + +static int +reclaim_nm_tx_desc(struct sge_nm_txq *nm_txq) +{ + struct sge_qstat *spg = (void *)&nm_txq->desc[nm_txq->sidx]; + uint16_t hw_cidx = spg->cidx; /* snapshot */ + struct fw_eth_tx_pkts_wr *wr; + int n = 0; + + hw_cidx = be16toh(hw_cidx); + + while (nm_txq->cidx != hw_cidx) { + wr = (void *)&nm_txq->desc[nm_txq->cidx]; + + MPASS(wr->op_pkd == htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR))); + MPASS(wr->type == 1); + MPASS(wr->npkt > 0 && wr->npkt <= MAX_NPKT_IN_TYPE1_WR); + + n += wr->npkt; + nm_txq->cidx += npkt_to_ndesc(wr->npkt); + + /* + * We never sent a WR that wrapped around so the credits coming + * back, WR by WR, should never cause the cidx to wrap around + * either. + */ + MPASS(nm_txq->cidx <= nm_txq->sidx); + if (__predict_false(nm_txq->cidx == nm_txq->sidx)) + nm_txq->cidx = 0; + } + + return (n); +} + +static int +cxgbe_netmap_txsync(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + struct ifnet *ifp = na->ifp; + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + struct sge_nm_txq *nm_txq = &sc->sge.nm_txq[pi->first_nm_txq + kring->ring_id]; + const u_int head = kring->rhead; + u_int reclaimed = 0; + int n, d, npkt_remaining, ndesc_remaining; + + /* + * Tx was at kring->nr_hwcur last time around and now we need to advance + * to kring->rhead. Note that the driver's pidx moves independent of + * netmap's kring->nr_hwcur (pidx counts descriptors and the relation + * between descriptors and frames isn't 1:1). + */ + + npkt_remaining = head >= kring->nr_hwcur ? head - kring->nr_hwcur : + kring->nkr_num_slots - kring->nr_hwcur + head; + while (npkt_remaining) { + reclaimed += reclaim_nm_tx_desc(nm_txq); + ndesc_remaining = contiguous_ndesc_available(nm_txq); + /* Can't run out of descriptors with packets still remaining */ + MPASS(ndesc_remaining > 0); + + /* # of desc needed to tx all remaining packets */ + d = (npkt_remaining / MAX_NPKT_IN_TYPE1_WR) * SGE_MAX_WR_NDESC; + if (npkt_remaining % MAX_NPKT_IN_TYPE1_WR) + d += npkt_to_ndesc(npkt_remaining % MAX_NPKT_IN_TYPE1_WR); + + if (d <= ndesc_remaining) + n = npkt_remaining; + else { + /* Can't send all, calculate how many can be sent */ + n = (ndesc_remaining / SGE_MAX_WR_NDESC) * + MAX_NPKT_IN_TYPE1_WR; + if (ndesc_remaining % SGE_MAX_WR_NDESC) + n += ndesc_to_npkt(ndesc_remaining % SGE_MAX_WR_NDESC); + } + + /* Send n packets and update nm_txq->pidx and kring->nr_hwcur */ + npkt_remaining -= n; + cxgbe_nm_tx(sc, nm_txq, kring, n, npkt_remaining); + } + MPASS(npkt_remaining == 0); + MPASS(kring->nr_hwcur == head); + MPASS(nm_txq->dbidx == nm_txq->pidx); + + /* + * Second part: reclaim buffers for completed transmissions. 
+ */ + if (reclaimed || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + reclaimed += reclaim_nm_tx_desc(nm_txq); + kring->nr_hwtail += reclaimed; + if (kring->nr_hwtail >= kring->nkr_num_slots) + kring->nr_hwtail -= kring->nkr_num_slots; + } + + nm_txsync_finalize(kring); + + return (0); +} + +static int +cxgbe_netmap_rxsync(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + struct netmap_ring *ring = kring->ring; + struct ifnet *ifp = na->ifp; + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + struct sge_nm_rxq *nm_rxq = &sc->sge.nm_rxq[pi->first_nm_rxq + kring->ring_id]; + u_int const head = nm_rxsync_prologue(kring); + u_int n; + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + if (netmap_no_pendintr || force_update) { + kring->nr_hwtail = atomic_load_acq_32(&nm_rxq->fl_cidx); + kring->nr_kflags &= ~NKR_PENDINTR; + } + + /* Userspace done with buffers from kring->nr_hwcur to head */ + n = head >= kring->nr_hwcur ? head - kring->nr_hwcur : + kring->nkr_num_slots - kring->nr_hwcur + head; + n &= ~7U; + if (n > 0) { + u_int fl_pidx = nm_rxq->fl_pidx; + struct netmap_slot *slot = &ring->slot[fl_pidx]; + uint64_t ba; + int i, dbinc = 0, hwidx = nm_rxq->fl_hwidx; + + /* + * We always deal with 8 buffers at a time. We must have + * stopped at an 8B boundary (fl_pidx) last time around and we + * must have a multiple of 8B buffers to give to the freelist. + */ + MPASS((fl_pidx & 7) == 0); + MPASS((n & 7) == 0); + + IDXINCR(kring->nr_hwcur, n, kring->nkr_num_slots); + IDXINCR(nm_rxq->fl_pidx, n, nm_rxq->fl_sidx); + + while (n > 0) { + for (i = 0; i < 8; i++, fl_pidx++, slot++) { + PNMB(na, slot, &ba); + nm_rxq->fl_desc[fl_pidx] = htobe64(ba | hwidx); + slot->flags &= ~NS_BUF_CHANGED; + MPASS(fl_pidx <= nm_rxq->fl_sidx); + } + n -= 8; + if (fl_pidx == nm_rxq->fl_sidx) { + fl_pidx = 0; + slot = &ring->slot[0]; + } + if (++dbinc == 8 && n >= 32) { + wmb(); + t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + nm_rxq->fl_db_val | V_PIDX(dbinc)); + dbinc = 0; + } + } + MPASS(nm_rxq->fl_pidx == fl_pidx); + + if (dbinc > 0) { + wmb(); + t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + nm_rxq->fl_db_val | V_PIDX(dbinc)); + } + } + + nm_rxsync_finalize(kring); + + return (0); +} + +/* + * Create an ifnet solely for netmap use and register it with the kernel. + */ +int +create_netmap_ifnet(struct port_info *pi) +{ + struct adapter *sc = pi->adapter; + struct netmap_adapter na; + struct ifnet *ifp; + device_t dev = pi->dev; + uint8_t mac[ETHER_ADDR_LEN]; + int rc; + + if (pi->nnmtxq <= 0 || pi->nnmrxq <= 0) + return (0); + MPASS(pi->nm_ifp == NULL); + + /* + * Allocate a virtual interface exclusively for netmap use. Give it the + * MAC address normally reserved for use by a TOE interface. (The TOE + * driver on FreeBSD doesn't use it). + */ + rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1, &mac[0], + &pi->nm_rss_size, FW_VI_FUNC_OFLD, 0); + if (rc < 0) { + device_printf(dev, "unable to allocate netmap virtual " + "interface for port %d: %d\n", pi->port_id, -rc); + return (-rc); + } + pi->nm_viid = rc; + pi->nm_xact_addr_filt = -1; + + ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + device_printf(dev, "Cannot allocate netmap ifnet\n"); + return (ENOMEM); + } + pi->nm_ifp = ifp; + ifp->if_softc = pi; + + if_initname(ifp, is_t4(pi->adapter) ? 
"ncxgbe" : "ncxl", + device_get_unit(dev)); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + + ifp->if_init = cxgbe_nm_init; + ifp->if_ioctl = cxgbe_nm_ioctl; + ifp->if_transmit = cxgbe_nm_transmit; + ifp->if_qflush = cxgbe_nm_qflush; + + /* + * netmap(4) says "netmap does not use features such as checksum + * offloading, TCP segmentation offloading, encryption, VLAN + * encapsulation/decapsulation, etc." + * + * By default we comply with the statement above. But we do declare the + * ifnet capable of L3/L4 checksumming so that a user can override + * netmap and have the hardware do the L3/L4 checksums. + */ + ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_JUMBO_MTU | + IFCAP_HWCSUM_IPV6; + ifp->if_capenable = 0; + ifp->if_hwassist = 0; + + /* nm_media has already been setup by the caller */ + + ether_ifattach(ifp, mac); + + /* + * Register with netmap in the kernel. + */ + bzero(&na, sizeof(na)); + + na.ifp = pi->nm_ifp; + na.na_flags = NAF_BDG_MAYSLEEP; + + /* Netmap doesn't know about the space reserved for the status page. */ + na.num_tx_desc = pi->qsize_txq - spg_len / EQ_ESIZE; + + /* + * The freelist's cidx/pidx drives netmap's rx cidx/pidx. So + * num_rx_desc is based on the number of buffers that can be held in the + * freelist, and not the number of entries in the iq. (These two are + * not exactly the same due to the space taken up by the status page). + */ + na.num_rx_desc = (pi->qsize_rxq / 8) * 8; + na.nm_txsync = cxgbe_netmap_txsync; + na.nm_rxsync = cxgbe_netmap_rxsync; + na.nm_register = cxgbe_netmap_reg; + na.num_tx_rings = pi->nnmtxq; + na.num_rx_rings = pi->nnmrxq; + netmap_attach(&na); /* This adds IFCAP_NETMAP to if_capabilities */ + + return (0); +} + +int +destroy_netmap_ifnet(struct port_info *pi) +{ + struct adapter *sc = pi->adapter; + + if (pi->nm_ifp == NULL) + return (0); + + netmap_detach(pi->nm_ifp); + ifmedia_removeall(&pi->nm_media); + ether_ifdetach(pi->nm_ifp); + if_free(pi->nm_ifp); + t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->nm_viid); + + return (0); +} + +static void +handle_nm_fw6_msg(struct adapter *sc, struct ifnet *ifp, + const struct cpl_fw6_msg *cpl) +{ + const struct cpl_sge_egr_update *egr; + uint32_t oq; + struct sge_nm_txq *nm_txq; + + if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL) + panic("%s: FW_TYPE 0x%x on nm_rxq.", __func__, cpl->type); + + /* data[0] is RSS header */ + egr = (const void *)&cpl->data[1]; + oq = be32toh(egr->opcode_qid); + MPASS(G_CPL_OPCODE(oq) == CPL_SGE_EGR_UPDATE); + nm_txq = (void *)sc->sge.eqmap[G_EGR_QID(oq) - sc->sge.eq_start]; + + netmap_tx_irq(ifp, nm_txq->nid); +} + +void +t4_nm_intr(void *arg) +{ + struct sge_nm_rxq *nm_rxq = arg; + struct port_info *pi = nm_rxq->pi; + struct adapter *sc = pi->adapter; + struct ifnet *ifp = pi->nm_ifp; + struct netmap_adapter *na = NA(ifp); + struct netmap_kring *kring = &na->rx_rings[nm_rxq->nid]; + struct netmap_ring *ring = kring->ring; + struct iq_desc *d = &nm_rxq->iq_desc[nm_rxq->iq_cidx]; + uint32_t lq; + u_int n = 0; + int processed = 0; + uint8_t opcode; + uint32_t fl_cidx = atomic_load_acq_32(&nm_rxq->fl_cidx); + + while ((d->rsp.u.type_gen & F_RSPD_GEN) == nm_rxq->iq_gen) { + + rmb(); + + lq = be32toh(d->rsp.pldbuflen_qid); + opcode = d->rss.opcode; + + switch (G_RSPD_TYPE(d->rsp.u.type_gen)) { + case X_RSPD_TYPE_FLBUF: + /* No buffer packing so new buf every time */ + MPASS(lq & F_RSPD_NEWBUF); + + /* fall through */ + + case X_RSPD_TYPE_CPL: + MPASS(opcode < NUM_CPL_CMDS); + + switch (opcode) { + case CPL_FW4_MSG: + case CPL_FW6_MSG: 
+ handle_nm_fw6_msg(sc, ifp, + (const void *)&d->cpl[0]); + break; + case CPL_RX_PKT: + ring->slot[fl_cidx].len = G_RSPD_LEN(lq) - fl_pktshift; + ring->slot[fl_cidx].flags = kring->nkr_slot_flags; + if (__predict_false(++fl_cidx == nm_rxq->fl_sidx)) + fl_cidx = 0; + break; + default: + panic("%s: unexpected opcode 0x%x on nm_rxq %p", + __func__, opcode, nm_rxq); + } + break; + + case X_RSPD_TYPE_INTR: + /* Not equipped to handle forwarded interrupts. */ + panic("%s: netmap queue received interrupt for iq %u\n", + __func__, lq); + + default: + panic("%s: illegal response type %d on nm_rxq %p", + __func__, G_RSPD_TYPE(d->rsp.u.type_gen), nm_rxq); + } + + d++; + if (__predict_false(++nm_rxq->iq_cidx == nm_rxq->iq_sidx)) { + nm_rxq->iq_cidx = 0; + d = &nm_rxq->iq_desc[0]; + nm_rxq->iq_gen ^= F_RSPD_GEN; + } + + if (__predict_false(++n == 64)) { /* XXXNM: tune */ + t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), + V_CIDXINC(n) | V_INGRESSQID(nm_rxq->iq_cntxt_id) | + V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); + n = 0; + } + } + if (fl_cidx != nm_rxq->fl_cidx) { + atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx); + netmap_rx_irq(ifp, nm_rxq->nid, &processed); + } + t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(n) | + V_INGRESSQID((u32)nm_rxq->iq_cntxt_id) | V_SEINTARM(F_QINTR_CNT_EN)); +} +#endif diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 9b47ff031..3df1492b3 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -55,6 +55,13 @@ __FBSDID("$FreeBSD$"); #include #include #include +#ifdef DEV_NETMAP +#include +#include +#include +#include +#include +#endif #include "common/common.h" #include "common/t4_regs.h" @@ -71,7 +78,7 @@ __FBSDID("$FreeBSD$"); * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ -static int fl_pktshift = 2; +int fl_pktshift = 2; TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift); /* @@ -80,7 +87,7 @@ TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift); * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ -static int fl_pad = -1; +int fl_pad = -1; TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad); /* @@ -88,7 +95,7 @@ TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad); * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. 
*/ -static int spg_len = -1; +int spg_len = -1; TUNABLE_INT("hw.cxgbe.spg_len", &spg_len); /* @@ -164,11 +171,9 @@ struct sgl { }; static int service_iq(struct sge_iq *, int); -static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t, - int *); +static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); -static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int, - int); +static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, int, char *); static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t, @@ -194,6 +199,14 @@ static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *); #endif +#ifdef DEV_NETMAP +static int alloc_nm_rxq(struct port_info *, struct sge_nm_rxq *, int, int, + struct sysctl_oid *); +static int free_nm_rxq(struct port_info *, struct sge_nm_rxq *); +static int alloc_nm_txq(struct port_info *, struct sge_nm_txq *, int, int, + struct sysctl_oid *); +static int free_nm_txq(struct port_info *, struct sge_nm_txq *); +#endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); #ifdef TCP_OFFLOAD @@ -208,8 +221,6 @@ static int alloc_txq(struct port_info *, struct sge_txq *, int, struct sysctl_oid *); static int free_txq(struct port_info *, struct sge_txq *); static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); -static inline bool is_new_response(const struct sge_iq *, struct rsp_ctrl **); -static inline void iq_next(struct sge_iq *); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); @@ -835,6 +846,24 @@ t4_teardown_adapter_queues(struct adapter *sc) return (0); } +static inline int +port_intr_count(struct port_info *pi) +{ + int rc = 0; + + if (pi->flags & INTR_RXQ) + rc += pi->nrxq; +#ifdef TCP_OFFLOAD + if (pi->flags & INTR_OFLD_RXQ) + rc += pi->nofldrxq; +#endif +#ifdef DEV_NETMAP + if (pi->flags & INTR_NM_RXQ) + rc += pi->nnmrxq; +#endif + return (rc); +} + static inline int first_vector(struct port_info *pi) { @@ -845,28 +874,10 @@ first_vector(struct port_info *pi) return (0); for_each_port(sc, i) { - struct port_info *p = sc->port[i]; - if (i == pi->port_id) break; -#ifdef TCP_OFFLOAD - if (sc->flags & INTR_DIRECT) - rc += p->nrxq + p->nofldrxq; - else - rc += max(p->nrxq, p->nofldrxq); -#else - /* - * Not compiled with offload support and intr_count > 1. Only - * NIC queues exist and they'd better be taking direct - * interrupts. 
- */ - KASSERT(sc->flags & INTR_DIRECT, - ("%s: intr_count %d, !INTR_DIRECT", __func__, - sc->intr_count)); - - rc += p->nrxq; -#endif + rc += port_intr_count(sc->port[i]); } return (rc); @@ -883,41 +894,48 @@ port_intr_iq(struct port_info *pi, int idx) struct adapter *sc = pi->adapter; struct sge *s = &sc->sge; struct sge_iq *iq = NULL; + int nintr, i; if (sc->intr_count == 1) return (&sc->sge.fwq); -#ifdef TCP_OFFLOAD - if (sc->flags & INTR_DIRECT) { - idx %= pi->nrxq + pi->nofldrxq; - - if (idx >= pi->nrxq) { - idx -= pi->nrxq; - iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq; - } else - iq = &s->rxq[pi->first_rxq + idx].iq; - - } else { - idx %= max(pi->nrxq, pi->nofldrxq); + nintr = port_intr_count(pi); + KASSERT(nintr != 0, + ("%s: pi %p has no exclusive interrupts, total interrupts = %d", + __func__, pi, sc->intr_count)); +#ifdef DEV_NETMAP + /* Exclude netmap queues as they can't take anyone else's interrupts */ + if (pi->flags & INTR_NM_RXQ) + nintr -= pi->nnmrxq; + KASSERT(nintr > 0, + ("%s: pi %p has nintr %d after netmap adjustment of %d", __func__, + pi, nintr, pi->nnmrxq)); +#endif + i = idx % nintr; - if (pi->nrxq >= pi->nofldrxq) - iq = &s->rxq[pi->first_rxq + idx].iq; - else - iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq; + if (pi->flags & INTR_RXQ) { + if (i < pi->nrxq) { + iq = &s->rxq[pi->first_rxq + i].iq; + goto done; + } + i -= pi->nrxq; + } +#ifdef TCP_OFFLOAD + if (pi->flags & INTR_OFLD_RXQ) { + if (i < pi->nofldrxq) { + iq = &s->ofld_rxq[pi->first_ofld_rxq + i].iq; + goto done; + } + i -= pi->nofldrxq; } -#else - /* - * Not compiled with offload support and intr_count > 1. Only NIC - * queues exist and they'd better be taking direct interrupts. - */ - KASSERT(sc->flags & INTR_DIRECT, - ("%s: intr_count %d, !INTR_DIRECT", __func__, sc->intr_count)); - - idx %= pi->nrxq; - iq = &s->rxq[pi->first_rxq + idx].iq; #endif - - KASSERT(iq->flags & IQ_INTR, ("%s: EDOOFUS", __func__)); + panic("%s: pi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__, + pi, pi->flags & INTR_ALL, idx, nintr); +done: + MPASS(iq != NULL); + KASSERT(iq->flags & IQ_INTR, + ("%s: iq %p (port %p, intr_flags 0x%lx, idx %d)", __func__, iq, pi, + pi->flags & INTR_ALL, idx)); return (iq); } @@ -954,7 +972,10 @@ t4_setup_port_queues(struct port_info *pi) #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; - struct sysctl_oid *oid2 = NULL; +#endif +#ifdef DEV_NETMAP + struct sge_nm_rxq *nm_rxq; + struct sge_nm_txq *nm_txq; #endif char name[16]; struct adapter *sc = pi->adapter; @@ -963,41 +984,29 @@ t4_setup_port_queues(struct port_info *pi) struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); int maxp, pack, mtu = ifp->if_mtu; - oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, - NULL, "rx queues"); - -#ifdef TCP_OFFLOAD - if (is_offload(sc)) { - oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", - CTLFLAG_RD, NULL, - "rx queues for offloaded TCP connections"); - } -#endif - /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = first_vector(pi); /* - * First pass over all rx queues (NIC and TOE): + * First pass over all NIC and TOE rx queues: * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. 
*/ maxp = mtu_to_max_payload(sc, mtu, 0); pack = enable_buffer_packing(sc); + if (pi->flags & INTR_RXQ) { + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", + CTLFLAG_RD, NULL, "rx queues"); + } for_each_rxq(pi, i, rxq) { - init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq, - RX_IQ_ESIZE); + init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq); snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(pi->dev), i); init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); - if (sc->flags & INTR_DIRECT -#ifdef TCP_OFFLOAD - || (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq) -#endif - ) { + if (pi->flags & INTR_RXQ) { rxq->iq.flags |= IQ_INTR; rc = alloc_rxq(pi, rxq, intr_idx, i, oid); if (rc != 0) @@ -1005,22 +1014,42 @@ t4_setup_port_queues(struct port_info *pi) intr_idx++; } } - #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); + if (is_offload(sc) && pi->flags & INTR_OFLD_RXQ) { + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", + CTLFLAG_RD, NULL, + "rx queues for offloaded TCP connections"); + } for_each_ofld_rxq(pi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, - pi->qsize_rxq, RX_IQ_ESIZE); + pi->qsize_rxq); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(pi->dev), i); init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); - if (sc->flags & INTR_DIRECT || - (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) { + if (pi->flags & INTR_OFLD_RXQ) { ofld_rxq->iq.flags |= IQ_INTR; - rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid2); + rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid); + if (rc != 0) + goto done; + intr_idx++; + } + } +#endif +#ifdef DEV_NETMAP + /* + * We don't have buffers to back the netmap rx queues right now so we + * create the queues in a way that doesn't set off any congestion signal + * in the chip. + */ + if (pi->flags & INTR_NM_RXQ) { + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_rxq", + CTLFLAG_RD, NULL, "rx queues for netmap"); + for_each_nm_rxq(pi, i, nm_rxq) { + rc = alloc_nm_rxq(pi, nm_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; @@ -1029,35 +1058,45 @@ t4_setup_port_queues(struct port_info *pi) #endif /* - * Second pass over all rx queues (NIC and TOE). The queues forwarding + * Second pass over all NIC and TOE rx queues. The queues forwarding * their interrupts are allocated now. 
*/ j = 0; - for_each_rxq(pi, i, rxq) { - if (rxq->iq.flags & IQ_INTR) - continue; + if (!(pi->flags & INTR_RXQ)) { + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", + CTLFLAG_RD, NULL, "rx queues"); + for_each_rxq(pi, i, rxq) { + MPASS(!(rxq->iq.flags & IQ_INTR)); - intr_idx = port_intr_iq(pi, j)->abs_id; + intr_idx = port_intr_iq(pi, j)->abs_id; - rc = alloc_rxq(pi, rxq, intr_idx, i, oid); - if (rc != 0) - goto done; - j++; + rc = alloc_rxq(pi, rxq, intr_idx, i, oid); + if (rc != 0) + goto done; + j++; + } } - #ifdef TCP_OFFLOAD - for_each_ofld_rxq(pi, i, ofld_rxq) { - if (ofld_rxq->iq.flags & IQ_INTR) - continue; + if (is_offload(sc) && !(pi->flags & INTR_OFLD_RXQ)) { + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", + CTLFLAG_RD, NULL, + "rx queues for offloaded TCP connections"); + for_each_ofld_rxq(pi, i, ofld_rxq) { + MPASS(!(ofld_rxq->iq.flags & IQ_INTR)); - intr_idx = port_intr_iq(pi, j)->abs_id; + intr_idx = port_intr_iq(pi, j)->abs_id; - rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid2); - if (rc != 0) - goto done; - j++; + rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid); + if (rc != 0) + goto done; + j++; + } } #endif +#ifdef DEV_NETMAP + if (!(pi->flags & INTR_NM_RXQ)) + CXGBE_UNIMPLEMENTED(__func__); +#endif /* * Now the tx queues. Only one pass needed. @@ -1066,10 +1105,7 @@ t4_setup_port_queues(struct port_info *pi) NULL, "tx queues"); j = 0; for_each_txq(pi, i, txq) { - uint16_t iqid; - iqid = port_intr_iq(pi, j)->cntxt_id; - snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(pi->dev), i); init_eq(&txq->eq, EQ_ETH, pi->qsize_txq, pi->tx_chan, iqid, @@ -1080,15 +1116,13 @@ t4_setup_port_queues(struct port_info *pi) goto done; j++; } - #ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(pi, i, ofld_txq) { - uint16_t iqid; + struct sysctl_oid *oid2; iqid = port_intr_iq(pi, j)->cntxt_id; - snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(pi->dev), i); init_eq(&ofld_txq->eq, EQ_OFLD, pi->qsize_txq, pi->tx_chan, @@ -1104,6 +1138,17 @@ t4_setup_port_queues(struct port_info *pi) j++; } #endif +#ifdef DEV_NETMAP + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_txq", + CTLFLAG_RD, NULL, "tx queues for netmap use"); + for_each_nm_txq(pi, i, nm_txq) { + iqid = pi->first_nm_rxq + (j % pi->nnmrxq); + rc = alloc_nm_txq(pi, nm_txq, iqid, i, oid); + if (rc != 0) + goto done; + j++; + } +#endif /* * Finally, the control queue. 
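The netmap tx queues created just above are paired with the port's netmap rx queues in round-robin fashion, so each egress queue posts its CPL_SGE_EGR_UPDATE notifications to an ingress queue that t4_nm_intr() already services. A minimal sketch of that pairing follows; the helper name and parameter names are illustrative only, the driver simply computes pi->first_nm_rxq + (j % pi->nnmrxq) inline as shown above.

/*
 * Sketch only: returns the index (within sc->sge.nm_rxq[]) of the netmap
 * rx queue that will act as the completion IQ for the netmap tx queue
 * reached with running counter 'j'.
 */
static inline int
nm_txq_iqidx(int j, int first_nm_rxq, int nnmrxq)
{

	return (first_nm_rxq + (j % nnmrxq));
}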
@@ -1137,6 +1182,10 @@ t4_teardown_port_queues(struct port_info *pi) struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif +#ifdef DEV_NETMAP + struct sge_nm_rxq *nm_rxq; + struct sge_nm_txq *nm_txq; +#endif /* Do this before freeing the queues */ if (pi->flags & PORT_SYSCTL_CTX) { @@ -1154,12 +1203,15 @@ t4_teardown_port_queues(struct port_info *pi) for_each_txq(pi, i, txq) { free_txq(pi, txq); } - #ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { free_wrq(sc, ofld_txq); } #endif +#ifdef DEV_NETMAP + for_each_nm_txq(pi, i, nm_txq) + free_nm_txq(pi, nm_txq); +#endif /* * Then take down the rx queues that forward their interrupts, as they @@ -1170,13 +1222,16 @@ t4_teardown_port_queues(struct port_info *pi) if ((rxq->iq.flags & IQ_INTR) == 0) free_rxq(pi, rxq); } - #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if ((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(pi, ofld_rxq); } #endif +#ifdef DEV_NETMAP + for_each_nm_rxq(pi, i, nm_rxq) + free_nm_rxq(pi, nm_rxq); +#endif /* * Then take down the rx queues that take direct interrupts. @@ -1186,13 +1241,15 @@ t4_teardown_port_queues(struct port_info *pi) if (rxq->iq.flags & IQ_INTR) free_rxq(pi, rxq); } - #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(pi, ofld_rxq); } #endif +#ifdef DEV_NETMAP + CXGBE_UNIMPLEMENTED(__func__); +#endif return (0); } @@ -1254,36 +1311,44 @@ service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ - struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */ + struct sge_fl *fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; - struct rsp_ctrl *ctrl; - const struct rss_header *rss; - int ndescs = 0, limit, fl_bufs_used = 0; - int rsp_type; + struct iq_desc *d = &iq->desc[iq->cidx]; + int ndescs = 0, limit; + int rsp_type, refill; uint32_t lq; + uint16_t fl_hw_cidx; struct mbuf *m0; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; #endif - limit = budget ? budget : iq->qsize / 8; - KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); + limit = budget ? budget : iq->qsize / 16; + + if (iq->flags & IQ_HAS_FL) { + fl = &rxq->fl; + fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ + } else { + fl = NULL; + fl_hw_cidx = 0; /* to silence gcc warning */ + } + /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. 
*/ for (;;) { - while (is_new_response(iq, &ctrl)) { + while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); + refill = 0; m0 = NULL; - rsp_type = G_RSPD_TYPE(ctrl->u.type_gen); - lq = be32toh(ctrl->pldbuflen_qid); - rss = (const void *)iq->cdesc; + rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); + lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: @@ -1292,9 +1357,10 @@ service_iq(struct sge_iq *iq, int budget) ("%s: data for an iq (%p) with no freelist", __func__, iq)); - m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used); + m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto process_iql; + refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2; #ifdef T4_PKT_TIMESTAMP /* * 60 bit timestamp for the payload is @@ -1313,10 +1379,10 @@ service_iq(struct sge_iq *iq, int budget) /* fall through */ case X_RSPD_TYPE_CPL: - KASSERT(rss->opcode < NUM_CPL_CMDS, + KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, - rss->opcode)); - sc->cpl_handler[rss->opcode](iq, rss, m0); + d->rss.opcode)); + sc->cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: @@ -1338,14 +1404,14 @@ service_iq(struct sge_iq *iq, int budget) * iWARP async notification. */ if (lq >= 1024) { - sc->an_handler(iq, ctrl); + sc->an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { - if (service_iq(q, q->qsize / 8) == 0) { + if (service_iq(q, q->qsize / 16) == 0) { atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { @@ -1365,16 +1431,13 @@ service_iq(struct sge_iq *iq, int budget) break; } - if (fl_bufs_used >= 16) { - FL_LOCK(fl); - fl->needed += fl_bufs_used; - refill_fl(sc, fl, 32); - FL_UNLOCK(fl); - fl_bufs_used = 0; + d++; + if (__predict_false(++iq->cidx == iq->sidx)) { + iq->cidx = 0; + iq->gen ^= F_RSPD_GEN; + d = &iq->desc[0]; } - - iq_next(iq); - if (++ndescs == limit) { + if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | @@ -1390,15 +1453,20 @@ service_iq(struct sge_iq *iq, int budget) #endif if (budget) { - if (fl_bufs_used) { + if (iq->flags & IQ_HAS_FL) { FL_LOCK(fl); - fl->needed += fl_bufs_used; refill_fl(sc, fl, 32); FL_UNLOCK(fl); } return (EINPROGRESS); } } + if (refill) { + FL_LOCK(fl); + refill_fl(sc, fl, 32); + FL_UNLOCK(fl); + fl_hw_cidx = fl->hw_cidx; + } } process_iql: @@ -1437,7 +1505,6 @@ service_iq(struct sge_iq *iq, int budget) int starved; FL_LOCK(fl); - fl->needed += fl_bufs_used; starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) @@ -1506,7 +1573,7 @@ get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags) caddr_t payload; len = min(total, hwb->size - fl->rx_offset); - padded_len = roundup2(len, fl_pad); + padded_len = roundup2(len, fl->buf_boundary); payload = sd->cl + cll->region1 + fl->rx_offset; if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { @@ -1572,38 +1639,32 @@ get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags) m->m_len = len; if (fl->flags & FL_BUF_PACKING) { - fl->rx_offset += roundup2(padded_len, sc->sge.pack_boundary); + fl->rx_offset += padded_len; MPASS(fl->rx_offset <= hwb->size); if (fl->rx_offset < hwb->size) return (m); /* without advancing the cidx */ } - if (__predict_false(++fl->cidx == fl->cap)) - fl->cidx = 0; + if (__predict_false(++fl->cidx % 8 == 0)) { + uint16_t cidx = fl->cidx / 8; + + if (__predict_false(cidx == fl->sidx)) + 
fl->cidx = cidx = 0; + fl->hw_cidx = cidx; + } fl->rx_offset = 0; return (m); } static struct mbuf * -get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, - int *fl_bufs_used) +get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf) { struct mbuf *m0, *m, **pnext; - u_int nbuf, len; - - /* - * No assertion for the fl lock because we don't need it. This routine - * is called only from the rx interrupt handler and it only updates - * fl->cidx. (Contrast that with fl->pidx/fl->needed which could be - * updated in the rx interrupt handler or the starvation helper routine. - * That's why code that manipulates fl->pidx/fl->needed needs the fl - * lock but this routine does not). - */ + u_int len; - nbuf = 0; len = G_RSPD_LEN(len_newbuf); - if (__predict_false(fl->m0 != NULL)) { + if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(len == fl->m0->m_pkthdr.len); MPASS(fl->remaining < len); @@ -1611,15 +1672,19 @@ get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, m0 = fl->m0; pnext = fl->pnext; len = fl->remaining; - fl->m0 = NULL; + fl->flags &= ~FL_BUF_RESUME; goto get_segment; } if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { - nbuf++; fl->rx_offset = 0; - if (__predict_false(++fl->cidx == fl->cap)) - fl->cidx = 0; + if (__predict_false(++fl->cidx % 8 == 0)) { + uint16_t cidx = fl->cidx / 8; + + if (__predict_false(cidx == fl->sidx)) + fl->cidx = cidx = 0; + fl->hw_cidx = cidx; + } } /* @@ -1629,30 +1694,26 @@ get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, m0 = get_scatter_segment(sc, fl, len, M_PKTHDR); if (m0 == NULL) - goto done; + return (NULL); len -= m0->m_len; pnext = &m0->m_next; while (len > 0) { - nbuf++; get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, len, 0); - if (m == NULL) { + if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = len; - m0 = NULL; - goto done; + fl->flags |= FL_BUF_RESUME; + return (NULL); } *pnext = m; pnext = &m->m_next; len -= m->m_len; } *pnext = NULL; - if (fl->rx_offset == 0) - nbuf++; -done: - (*fl_bufs_used) += nbuf; + return (m0); } @@ -2040,8 +2101,9 @@ can_resume_tx(struct sge_eq *eq) static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, - int qsize, int esize) + int qsize) { + KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ @@ -2056,7 +2118,7 @@ init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ - iq->esize = max(esize, 16); /* See FW_IQ_CMD/iqesize */ + iq->sidx = iq->qsize - spg_len / IQ_ESIZE; } static inline void @@ -2065,6 +2127,7 @@ init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, int pack, { fl->qsize = qsize; + fl->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (pack) fl->flags |= FL_BUF_PACKING; @@ -2157,7 +2220,7 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, struct adapter *sc = iq->adapter; __be32 v = 0; - len = iq->qsize * iq->esize; + len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) @@ -2189,7 +2252,7 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, c.iqdroprss_to_iqesize = 
htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | - V_FW_IQ_CMD_IQESIZE(ilog2(iq->esize) - 4)); + V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); if (cong >= 0) @@ -2198,14 +2261,13 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); - len = fl->qsize * RX_FL_ESIZE; + len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) return (rc); /* Allocate space for one software descriptor per buffer. */ - fl->cap = (fl->qsize - spg_len / RX_FL_ESIZE) * 8; rc = alloc_fl_sdesc(fl); if (rc != 0) { device_printf(sc->dev, @@ -2213,10 +2275,14 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, rc); return (rc); } - fl->needed = fl->cap; - fl->lowat = fl->flags & FL_BUF_PACKING ? - roundup2(sc->sge.fl_starve_threshold2, 8) : - roundup2(sc->sge.fl_starve_threshold, 8); + + if (fl->flags & FL_BUF_PACKING) { + fl->lowat = roundup2(sc->sge.fl_starve_threshold2, 8); + fl->buf_boundary = max(fl_pad, sc->sge.pack_boundary); + } else { + fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8); + fl->buf_boundary = fl_pad; + } c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | @@ -2244,9 +2310,8 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, return (rc); } - iq->cdesc = iq->desc; iq->cidx = 0; - iq->gen = 1; + iq->gen = F_RSPD_GEN; iq->intr_next = iq->intr_params; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); @@ -2260,6 +2325,9 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, sc->sge.iqmap[cntxt_id] = iq; if (fl) { + u_int qid; + + iq->flags |= IQ_HAS_FL; fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = 0; @@ -2270,12 +2338,29 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, } sc->sge.eqmap[cntxt_id] = (void *)fl; + qid = fl->cntxt_id; + if (isset(&sc->doorbells, DOORBELL_UDB)) { + uint32_t s_qpp = sc->sge.eq_s_qpp; + uint32_t mask = (1 << s_qpp) - 1; + volatile uint8_t *udb; + + udb = sc->udbs_base + UDBS_DB_OFFSET; + udb += (qid >> s_qpp) << PAGE_SHIFT; + qid &= mask; + if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { + udb += qid << UDBS_SEG_SHIFT; + qid = 0; + } + fl->udb = (volatile void *)udb; + } + fl->dbval = F_DBPRIO | V_QID(qid); + if (is_t5(sc)) + fl->dbval |= F_DBTYPE; + FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); - - iq->flags |= IQ_HAS_FL; } if (is_t5(sc) && cong >= 0) { @@ -2396,7 +2481,7 @@ alloc_fwq(struct adapter *sc) struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); - init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, FW_IQ_ESIZE); + init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE); fwq->flags |= IQ_INTR; /* always */ intr_idx = sc->intr_count > 1 ? 1 : 0; rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1); @@ -2485,8 +2570,12 @@ alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx, if (rc != 0) return (rc); + /* + * The freelist is just barely above the starvation threshold right now, + * fill it up a bit more. 
+ */ FL_LOCK(&rxq->fl); - refill_fl(pi->adapter, &rxq->fl, rxq->fl.needed / 8); + refill_fl(pi->adapter, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); #if defined(INET) || defined(INET6) @@ -2601,6 +2690,143 @@ free_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq) } #endif +#ifdef DEV_NETMAP +static int +alloc_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq, int intr_idx, + int idx, struct sysctl_oid *oid) +{ + int rc; + struct sysctl_oid_list *children; + struct sysctl_ctx_list *ctx; + char name[16]; + size_t len; + struct adapter *sc = pi->adapter; + struct netmap_adapter *na = NA(pi->nm_ifp); + + MPASS(na != NULL); + + len = pi->qsize_rxq * IQ_ESIZE; + rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, + &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); + if (rc != 0) + return (rc); + + len = na->num_rx_desc * EQ_ESIZE + spg_len; + rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, + &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); + if (rc != 0) + return (rc); + + nm_rxq->pi = pi; + nm_rxq->nid = idx; + nm_rxq->iq_cidx = 0; + nm_rxq->iq_sidx = pi->qsize_rxq - spg_len / IQ_ESIZE; + nm_rxq->iq_gen = F_RSPD_GEN; + nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; + nm_rxq->fl_sidx = na->num_rx_desc; + nm_rxq->intr_idx = intr_idx; + + ctx = &pi->ctx; + children = SYSCTL_CHILDREN(oid); + + snprintf(name, sizeof(name), "%d", idx); + oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, + "rx queue"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", + CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16, + "I", "absolute id of the queue"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", + CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16, + "I", "SGE context id of the queue"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", + CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I", + "consumer index"); + + children = SYSCTL_CHILDREN(oid); + oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, + "freelist"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", + CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16, + "I", "SGE context id of the freelist"); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, + &nm_rxq->fl_cidx, 0, "consumer index"); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, + &nm_rxq->fl_pidx, 0, "producer index"); + + return (rc); +} + + +static int +free_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq) +{ + struct adapter *sc = pi->adapter; + + free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, + nm_rxq->iq_desc); + free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, + nm_rxq->fl_desc); + + return (0); +} + +static int +alloc_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq, int iqidx, int idx, + struct sysctl_oid *oid) +{ + int rc; + size_t len; + struct adapter *sc = pi->adapter; + struct netmap_adapter *na = NA(pi->nm_ifp); + char name[16]; + struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); + + len = na->num_tx_desc * EQ_ESIZE + spg_len; + rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, + &nm_txq->ba, (void **)&nm_txq->desc); + if (rc) + return (rc); + + nm_txq->pidx = nm_txq->cidx = 0; + nm_txq->sidx = na->num_tx_desc; + nm_txq->nid = idx; + nm_txq->iqidx = iqidx; + nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | + V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf)); + 
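+	/*
+	 * The cpl_ctrl0 value just set is precomputed once per queue: the
+	 * netmap tx path (cxgbe_nm_tx() in t4_netmap.c) copies it verbatim
+	 * into every CPL_TX_PKT it writes, so only the per-packet length and
+	 * the checksum-control bits are filled in at transmit time.
+	 */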
+ snprintf(name, sizeof(name), "%d", idx); + oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, + NULL, "netmap tx queue"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, + &nm_txq->cntxt_id, 0, "SGE context id of the queue"); + SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx", + CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I", + "consumer index"); + SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx", + CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I", + "producer index"); + + return (rc); +} + +static int +free_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq) +{ + struct adapter *sc = pi->adapter; + + free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, + nm_txq->desc); + + return (0); +} +#endif + static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { @@ -2788,7 +3014,7 @@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ - if (eq->udb_qid > PAGE_SIZE / UDBS_SEG_SIZE) + if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ @@ -3016,73 +3242,60 @@ oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) *ba = error ? 0 : segs->ds_addr; } -static inline bool -is_new_response(const struct sge_iq *iq, struct rsp_ctrl **ctrl) -{ - *ctrl = (void *)((uintptr_t)iq->cdesc + - (iq->esize - sizeof(struct rsp_ctrl))); - - return (((*ctrl)->u.type_gen >> S_RSPD_GEN) == iq->gen); -} - -static inline void -iq_next(struct sge_iq *iq) -{ - iq->cdesc = (void *) ((uintptr_t)iq->cdesc + iq->esize); - if (__predict_false(++iq->cidx == iq->qsize - 1)) { - iq->cidx = 0; - iq->gen ^= 1; - iq->cdesc = iq->desc; - } -} - -#define FL_HW_IDX(x) ((x) >> 3) static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { - int ndesc = fl->pending / 8; - uint32_t v; + uint32_t n, v; - if (FL_HW_IDX(fl->pidx) == FL_HW_IDX(fl->cidx)) - ndesc--; /* hold back one credit */ - - if (ndesc <= 0) - return; /* nothing to do */ - - v = F_DBPRIO | V_QID(fl->cntxt_id) | V_PIDX(ndesc); - if (is_t5(sc)) - v |= F_DBTYPE; + n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx); + MPASS(n > 0); wmb(); - - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v); - fl->pending -= ndesc * 8; + v = fl->dbval | V_PIDX(n); + if (fl->udb) + *fl->udb = htole32(v); + else + t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v); + IDXINCR(fl->dbidx, n, fl->sidx); } /* - * Fill up the freelist by upto nbufs and maybe ring its doorbell. + * Fills up the freelist by allocating upto 'n' buffers. Buffers that are + * recycled do not count towards this allocation budget. * - * Returns non-zero to indicate that it should be added to the list of starving - * freelists. + * Returns non-zero to indicate that this freelist should be added to the list + * of starving freelists. 
*/ static int -refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) +refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { - __be64 *d = &fl->desc[fl->pidx]; - struct fl_sdesc *sd = &fl->sdesc[fl->pidx]; + __be64 *d; + struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; - struct cluster_layout *cll = &fl->cll_def; /* default layout */ - struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; + struct cluster_layout *cll; + struct sw_zone_info *swz; struct cluster_metadata *clm; + uint16_t max_pidx; + uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); - if (nbufs > fl->needed) - nbufs = fl->needed; - nbufs -= (fl->pidx + nbufs) % 8; + /* + * We always stop at the begining of the hardware descriptor that's just + * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, + * which would mean an empty freelist to the chip. + */ + max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; + if (fl->pidx == max_pidx * 8) + return (0); + + d = &fl->desc[fl->pidx]; + sd = &fl->sdesc[fl->pidx]; + cll = &fl->cll_def; /* default layout */ + swz = &sc->sge.sw_zone_info[cll->zidx]; - while (nbufs--) { + while (n > 0) { if (sd->cl != NULL) { @@ -3132,6 +3345,7 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) goto alloc; } fl->cl_allocated++; + n--; pa = pmap_kextract((vm_offset_t)cl); pa += cll->region1; @@ -3148,18 +3362,26 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) } sd->nmbuf = 0; recycled_fast: - fl->pending++; - fl->needed--; d++; sd++; - if (__predict_false(++fl->pidx == fl->cap)) { - fl->pidx = 0; - sd = fl->sdesc; - d = fl->desc; + if (__predict_false(++fl->pidx % 8 == 0)) { + uint16_t pidx = fl->pidx / 8; + + if (__predict_false(pidx == fl->sidx)) { + fl->pidx = 0; + pidx = 0; + sd = fl->sdesc; + d = fl->desc; + } + if (pidx == max_pidx) + break; + + if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) + ring_fl_db(sc, fl); } } - if (fl->pending >= 8) + if (fl->pidx / 8 != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); @@ -3194,7 +3416,7 @@ static int alloc_fl_sdesc(struct sge_fl *fl) { - fl->sdesc = malloc(fl->cap * sizeof(struct fl_sdesc), M_CXGBE, + fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); return (0); @@ -3209,7 +3431,7 @@ free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) int i; sd = fl->sdesc; - for (i = 0; i < fl->cap; i++, sd++) { + for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; diff --git a/sys/dev/cxgbe/t4_tracer.c b/sys/dev/cxgbe/t4_tracer.c index e9727f5f1..771636ad9 100644 --- a/sys/dev/cxgbe/t4_tracer.c +++ b/sys/dev/cxgbe/t4_tracer.c @@ -192,7 +192,6 @@ t4_cloner_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | IFM_NONE, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | IFM_NONE); ether_ifattach(ifp, lla); - if_up(ifp); mtx_lock(&sc->ifp_lock); ifp->if_softc = sc; diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index e2f5c79f0..f18e0c7bc 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -70,6 +70,33 @@ VNET_DECLARE(int, tcp_autorcvbuf_inc); VNET_DECLARE(int, tcp_autorcvbuf_max); #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) +/* + * For ULP connections HW may add headers, e.g., for digests, that aren't part + * of the messages sent by the host but that are part of the TCP payload and + * therefore consume TCP sequence space. 
Tx connection parameters that + * operate in TCP sequence space are affected by the HW additions and need to + * compensate for them to accurately track TCP sequence numbers. This array + * contains the compensating extra lengths for ULP packets. It is indexed by + * a packet's ULP submode. + */ +const unsigned int t4_ulp_extra_len[] = {0, 4, 4, 8}; + +/* + * Return the length of any HW additions that will be made to a Tx packet. + * Such additions can happen for some types of ULP packets. + */ +static inline unsigned int +ulp_extra_len(struct mbuf *m, int *ulp_mode) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, CXGBE_ISCSI_MBUF_TAG, NULL)) == NULL) + return (0); + *ulp_mode = *((int *)(mtag + 1)); + + return (t4_ulp_extra_len[*ulp_mode & 3]); +} + void send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) { @@ -341,8 +368,13 @@ t4_rcvd(struct toedev *tod, struct tcpcb *tp) KASSERT(toep->sb_cc >= sb->sb_cc, ("%s: sb %p has more data (%d) than last time (%d).", __func__, sb, sb->sb_cc, toep->sb_cc)); - toep->rx_credits += toep->sb_cc - sb->sb_cc; - toep->sb_cc = sb->sb_cc; + if (toep->ulp_mode == ULP_MODE_ISCSI) { + toep->rx_credits += toep->sb_cc; + toep->sb_cc = 0; + } else { + toep->rx_credits += toep->sb_cc - sb->sb_cc; + toep->sb_cc = sb->sb_cc; + } credits = toep->rx_credits; SOCKBUF_UNLOCK(sb); @@ -444,16 +476,25 @@ max_dsgl_nsegs(int tx_credits) static inline void write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, - unsigned int plen, uint8_t credits, int shove) + unsigned int plen, uint8_t credits, int shove, int ulp_mode) { struct fw_ofld_tx_data_wr *txwr = dst; + unsigned int wr_ulp_mode; txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | V_FW_WR_IMMDLEN(immdlen)); txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | V_FW_WR_LEN16(credits)); + + /* for iscsi, the mode & submode setting is per-packet */ + if (toep->ulp_mode == ULP_MODE_ISCSI) + wr_ulp_mode = V_FW_OFLD_TX_DATA_WR_ULPMODE(ulp_mode >> 4) | + V_FW_OFLD_TX_DATA_WR_ULPSUBMODE(ulp_mode & 3); + else + wr_ulp_mode = V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode); + txwr->lsodisable_to_proxy = - htobe32(V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode) | + htobe32(wr_ulp_mode | V_FW_OFLD_TX_DATA_WR_URGENT(0) | /* XXX */ V_FW_OFLD_TX_DATA_WR_SHOVE(shove)); txwr->plen = htobe32(plen); @@ -527,7 +568,7 @@ write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) * stalls). When drop is set this function MUST drop the bytes and wake up any * writers. */ -static void +void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) { struct mbuf *sndptr, *m, *sb_sndptr; @@ -660,7 +701,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) } txwr = wrtod(wr); credits = howmany(wr->wr_len, 16); - write_tx_wr(txwr, toep, plen, plen, credits, shove); + write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); nsegs = 0; } else { @@ -678,7 +719,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) } txwr = wrtod(wr); credits = howmany(wr_len, 16); - write_tx_wr(txwr, toep, 0, plen, credits, shove); + write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf); if (wr_len & 0xf) { @@ -734,6 +775,177 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) close_conn(sc, toep); } +/* Send ULP data over TOE using TX_DATA_WR. 
We send whole mbuf at once */ +void +t4_ulp_push_frames(struct adapter *sc, struct toepcb *toep, int drop) +{ + struct mbuf *sndptr, *m = NULL; + struct fw_ofld_tx_data_wr *txwr; + struct wrqe *wr; + unsigned int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; + struct inpcb *inp = toep->inp; + struct tcpcb *tp; + struct socket *so; + struct sockbuf *sb; + int tx_credits, ulp_len = 0, ulp_mode = 0, qlen = 0; + int shove, compl; + struct ofld_tx_sdesc *txsd; + + INP_WLOCK_ASSERT(inp); + if (toep->flags & TPF_ABORT_SHUTDOWN) + return; + + tp = intotcpcb(inp); + so = inp->inp_socket; + sb = &so->so_snd; + txsd = &toep->txsd[toep->txsd_pidx]; + + KASSERT(toep->flags & TPF_FLOWC_WR_SENT, + ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); + + /* + * This function doesn't resume by itself. Someone else must clear the + * flag and call this function. + */ + if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) + return; + + sndptr = t4_queue_iscsi_callback(so, toep, 1, &qlen); + if (!qlen) + return; + + do { + tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); + max_imm = max_imm_payload(tx_credits); + max_nsegs = max_dsgl_nsegs(tx_credits); + + if (drop) { + t4_cpl_iscsi_callback(toep->td, toep, &drop, + CPL_FW4_ACK); + drop = 0; + } + + plen = 0; + nsegs = 0; + max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ + for (m = sndptr; m != NULL; m = m->m_next) { + int n = sglist_count(mtod(m, void *), m->m_len); + + nsegs += n; + plen += m->m_len; + + /* This mbuf sent us _over_ the nsegs limit, return */ + if (plen > max_imm && nsegs > max_nsegs) { + toep->flags |= TPF_TX_SUSPENDED; + return; + } + + if (max_nsegs_1mbuf < n) + max_nsegs_1mbuf = n; + + /* This mbuf put us right at the max_nsegs limit */ + if (plen > max_imm && nsegs == max_nsegs) { + toep->flags |= TPF_TX_SUSPENDED; + return; + } + } + + shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); + /* nothing to send */ + if (plen == 0) { + KASSERT(m == NULL, + ("%s: nothing to send, but m != NULL", __func__)); + break; + } + + if (__predict_false(toep->flags & TPF_FIN_SENT)) + panic("%s: excess tx.", __func__); + + ulp_len = plen + ulp_extra_len(sndptr, &ulp_mode); + if (plen <= max_imm) { + + /* Immediate data tx */ + wr = alloc_wrqe(roundup(sizeof(*txwr) + plen, 16), + toep->ofld_txq); + if (wr == NULL) { + /* XXX: how will we recover from this? */ + toep->flags |= TPF_TX_SUSPENDED; + return; + } + txwr = wrtod(wr); + credits = howmany(wr->wr_len, 16); + write_tx_wr(txwr, toep, plen, ulp_len, credits, shove, + ulp_mode); + m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); + } else { + int wr_len; + + /* DSGL tx */ + wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; + wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq); + if (wr == NULL) { + /* XXX: how will we recover from this? 
*/ + toep->flags |= TPF_TX_SUSPENDED; + return; + } + txwr = wrtod(wr); + credits = howmany(wr_len, 16); + write_tx_wr(txwr, toep, 0, ulp_len, credits, shove, + ulp_mode); + write_tx_sgl(txwr + 1, sndptr, m, nsegs, + max_nsegs_1mbuf); + if (wr_len & 0xf) { + uint64_t *pad = (uint64_t *) + ((uintptr_t)txwr + wr_len); + *pad = 0; + } + } + + KASSERT(toep->tx_credits >= credits, + ("%s: not enough credits", __func__)); + + toep->tx_credits -= credits; + toep->tx_nocompl += credits; + toep->plen_nocompl += plen; + if (toep->tx_credits <= toep->tx_total * 3 / 8 && + toep->tx_nocompl >= toep->tx_total / 4) + compl = 1; + + if (compl) { + txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); + toep->tx_nocompl = 0; + toep->plen_nocompl = 0; + } + tp->snd_nxt += ulp_len; + tp->snd_max += ulp_len; + + /* goto next mbuf */ + sndptr = m = t4_queue_iscsi_callback(so, toep, 2, &qlen); + + toep->flags |= TPF_TX_DATA_SENT; + if (toep->tx_credits < MIN_OFLD_TX_CREDITS) { + toep->flags |= TPF_TX_SUSPENDED; + } + + KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); + txsd->plen = plen; + txsd->tx_credits = credits; + txsd++; + if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { + toep->txsd_pidx = 0; + txsd = &toep->txsd[0]; + } + toep->txsd_avail--; + + t4_l2t_send(sc, wr, toep->l2te); + } while (m != NULL); + + /* Send a FIN if requested, but only if there's no more data to send */ + if (m == NULL && toep->flags & TPF_SEND_FIN) + close_conn(sc, toep); +} + int t4_tod_output(struct toedev *tod, struct tcpcb *tp) { @@ -768,8 +980,12 @@ t4_send_fin(struct toedev *tod, struct tcpcb *tp) KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); toep->flags |= TPF_SEND_FIN; - if (tp->t_state >= TCPS_ESTABLISHED) - t4_push_frames(sc, toep, 0); + if (tp->t_state >= TCPS_ESTABLISHED) { + if (toep->ulp_mode == ULP_MODE_ISCSI) + t4_ulp_push_frames(sc, toep, 0); + else + t4_push_frames(sc, toep, 0); + } return (0); } @@ -1019,6 +1235,91 @@ abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) } } +int +cpl_not_handled(struct sge_iq *, const struct rss_header *, struct mbuf *); +/* + * tom_cpl_iscsi_callback - + * iscsi and tom would share the following cpl messages, so when any of these + * message is received, after tom is done with processing it, the messages + * needs to be forwarded to iscsi for further processing: + * - CPL_SET_TCB_RPL + * - CPL_RX_DATA_DDP + */ +void (*tom_cpl_iscsi_callback)(struct tom_data *, struct socket *, void *, + unsigned int); + +struct mbuf *(*tom_queue_iscsi_callback)(struct socket *, unsigned int, int *); +/* + * Check if the handler function is set for a given CPL + * return 0 if the function is NULL or cpl_not_handled, 1 otherwise. + */ +int +t4tom_cpl_handler_registered(struct adapter *sc, unsigned int opcode) +{ + + MPASS(opcode < nitems(sc->cpl_handler)); + + return (sc->cpl_handler[opcode] && + sc->cpl_handler[opcode] != cpl_not_handled); +} + +/* + * set the tom_cpl_iscsi_callback function, this function should be used + * whenever both toe and iscsi need to process the same cpl msg. 
+ */ +void +t4tom_register_cpl_iscsi_callback(void (*fp)(struct tom_data *, struct socket *, + void *, unsigned int)) +{ + + tom_cpl_iscsi_callback = fp; +} + +void +t4tom_register_queue_iscsi_callback(struct mbuf *(*fp)(struct socket *, + unsigned int, int *qlen)) +{ + + tom_queue_iscsi_callback = fp; +} + +int +t4_cpl_iscsi_callback(struct tom_data *td, struct toepcb *toep, void *m, + unsigned int opcode) +{ + struct socket *so; + + if (opcode == CPL_FW4_ACK) + so = toep->inp->inp_socket; + else { + INP_WLOCK(toep->inp); + so = toep->inp->inp_socket; + INP_WUNLOCK(toep->inp); + } + + if (tom_cpl_iscsi_callback && so) { + if (toep->ulp_mode == ULP_MODE_ISCSI) { + tom_cpl_iscsi_callback(td, so, m, opcode); + return (0); + } + } + + return (1); +} + +struct mbuf * +t4_queue_iscsi_callback(struct socket *so, struct toepcb *toep, + unsigned int cmd, int *qlen) +{ + + if (tom_queue_iscsi_callback && so) { + if (toep->ulp_mode == ULP_MODE_ISCSI) + return (tom_queue_iscsi_callback(so, cmd, qlen)); + } + + return (NULL); +} + /* * TCP RST from the peer, timeout, or some other such critical error. */ @@ -1408,14 +1709,22 @@ do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) if (toep->flags & TPF_TX_SUSPENDED && toep->tx_credits >= toep->tx_total / 4) { toep->flags &= ~TPF_TX_SUSPENDED; - t4_push_frames(sc, toep, plen); + if (toep->ulp_mode == ULP_MODE_ISCSI) + t4_ulp_push_frames(sc, toep, plen); + else + t4_push_frames(sc, toep, plen); } else if (plen > 0) { struct sockbuf *sb = &so->so_snd; - SOCKBUF_LOCK(sb); - sbdrop_locked(sb, plen); - sowwakeup_locked(so); - SOCKBUF_UNLOCK_ASSERT(sb); + if (toep->ulp_mode == ULP_MODE_ISCSI) + t4_cpl_iscsi_callback(toep->td, toep, &plen, + CPL_FW4_ACK); + else { + SOCKBUF_LOCK(sb); + sbdrop_locked(sb, plen); + sowwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + } } INP_WUNLOCK(inp); @@ -1439,6 +1748,12 @@ do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) if (is_ftid(sc, tid)) return (t4_filter_rpl(iq, rss, m)); /* TCB is a filter */ + else { + struct toepcb *toep = lookup_tid(sc, tid); + + t4_cpl_iscsi_callback(toep->td, toep, m, CPL_SET_TCB_RPL); + return (0); + } CXGBE_UNIMPLEMENTED(__func__); } diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c index 6c9367564..004420a5d 100644 --- a/sys/dev/cxgbe/tom/t4_ddp.c +++ b/sys/dev/cxgbe/tom/t4_ddp.c @@ -493,6 +493,7 @@ do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) unsigned int tid = GET_TID(cpl); uint32_t vld; struct toepcb *toep = lookup_tid(sc, tid); + struct tom_data *td = toep->td; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); @@ -504,6 +505,16 @@ do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) panic("%s: DDP error 0x%x (tid %d, toep %p)", __func__, vld, tid, toep); } + if (toep->ulp_mode == ULP_MODE_ISCSI) { + m = m_get(M_NOWAIT, MT_DATA); + if (m == NULL) + CXGBE_UNIMPLEMENTED("mbuf alloc failure"); + memcpy(mtod(m, unsigned char *), cpl, + sizeof(struct cpl_rx_data_ddp)); + if (!t4_cpl_iscsi_callback(td, toep, m, CPL_RX_DATA_DDP)) + return (0); + m_freem(m); + } handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 6328240cd..1d883fed8 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -279,6 +279,7 @@ int t4_send_fin(struct toedev *, struct tcpcb *); int 
t4_send_rst(struct toedev *, struct tcpcb *); void t4_set_tcb_field(struct adapter *, struct toepcb *, int, uint16_t, uint64_t, uint64_t); +void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop); /* t4_ddp.c */ void t4_init_ddp(struct adapter *, struct tom_data *); @@ -289,4 +290,20 @@ struct mbuf *get_ddp_mbuf(int); void enable_ddp(struct adapter *, struct toepcb *toep); void release_ddp_resources(struct toepcb *toep); void insert_ddp_data(struct toepcb *, uint32_t); + +/* ULP related */ +#define CXGBE_ISCSI_MBUF_TAG 50 +int t4tom_cpl_handler_registered(struct adapter *, unsigned int); +void t4tom_register_cpl_iscsi_callback(void (*fp)(struct tom_data *, + struct socket *, void *, unsigned int)); +void t4tom_register_queue_iscsi_callback(struct mbuf *(*fp)(struct socket *, + unsigned int, int *)); +void t4_ulp_push_frames(struct adapter *sc, struct toepcb *toep, int); +int t4_cpl_iscsi_callback(struct tom_data *, struct toepcb *, void *, uint32_t); +struct mbuf *t4_queue_iscsi_callback(struct socket *, struct toepcb *, uint32_t, + int *); +extern void (*tom_cpl_iscsi_callback)(struct tom_data *, struct socket *, + void *, unsigned int); +extern struct mbuf *(*tom_queue_iscsi_callback)(struct socket*, unsigned int, + int *); #endif diff --git a/sys/modules/cxgbe/Makefile b/sys/modules/cxgbe/Makefile index d7ce6471b..756cb4c16 100644 --- a/sys/modules/cxgbe/Makefile +++ b/sys/modules/cxgbe/Makefile @@ -2,19 +2,19 @@ # $FreeBSD$ # -SUBDIR = if_cxgbe -SUBDIR+= t4_firmware -SUBDIR+= t5_firmware -SUBDIR+= ${_tom} -SUBDIR+= ${_iw_cxgbe} +SUBDIR= if_cxgbe +SUBDIR+= t4_firmware +SUBDIR+= t5_firmware +SUBDIR+= ${_tom} +SUBDIR+= ${_iw_cxgbe} .if ${MACHINE_CPUARCH} == "amd64" -_tom= tom +_tom= tom _iw_cxgbe= iw_cxgbe .endif .if ${MACHINE_CPUARCH} == "i386" -_tom= tom +_tom= tom .endif diff --git a/sys/modules/cxgbe/if_cxgbe/Makefile b/sys/modules/cxgbe/if_cxgbe/Makefile index f4ebcdd39..7fecb0684 100644 --- a/sys/modules/cxgbe/if_cxgbe/Makefile +++ b/sys/modules/cxgbe/if_cxgbe/Makefile @@ -4,21 +4,28 @@ .include -CXGBE = ${.CURDIR}/../../../dev/cxgbe +CXGBE= ${.CURDIR}/../../../dev/cxgbe .PATH: ${CXGBE} ${CXGBE}/common -KMOD = if_cxgbe -SRCS = t4_main.c t4_sge.c t4_l2t.c t4_tracer.c -SRCS+= t4_hw.c -SRCS+= device_if.h bus_if.h pci_if.h -SRCS+= opt_inet.h opt_inet6.h -SRCS+= opt_ofed.h - -CFLAGS+= -I${CXGBE} +KMOD= if_cxgbe +SRCS= bus_if.h +SRCS+= device_if.h +SRCS+= opt_inet.h +SRCS+= opt_inet6.h +SRCS+= opt_ofed.h +SRCS+= pci_if.h +SRCS+= t4_hw.c +SRCS+= t4_l2t.c +SRCS+= t4_main.c +SRCS+= t4_netmap.c +SRCS+= t4_sge.c +SRCS+= t4_tracer.c # Provide the timestamp of a packet in its header mbuf. 
#CFLAGS+= -DT4_PKT_TIMESTAMP +CFLAGS+= -I${CXGBE} + .if !defined(KERNBUILDDIR) .if ${MK_INET_SUPPORT} != "no" opt_inet.h: diff --git a/sys/modules/cxgbe/iw_cxgbe/Makefile b/sys/modules/cxgbe/iw_cxgbe/Makefile index 7704650ed..b3844c058 100644 --- a/sys/modules/cxgbe/iw_cxgbe/Makefile +++ b/sys/modules/cxgbe/iw_cxgbe/Makefile @@ -2,13 +2,29 @@ .include -CXGBE = ${.CURDIR}/../../../dev/cxgbe +CXGBE= ${.CURDIR}/../../../dev/cxgbe .PATH: ${CXGBE}/iw_cxgbe -KMOD= iw_cxgbe -SRCS= device.c cm.c provider.c mem.c cq.c qp.c resource.c ev.c id_table.c -SRCS+= bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h opt_ktr.h -SRCS+= opt_inet.h opt_ofed.h vnode_if.h +KMOD= iw_cxgbe +SRCS= bus_if.h +SRCS+= cm.c +SRCS+= cq.c +SRCS+= device.c +SRCS+= device_if.h +SRCS+= ev.c +SRCS+= id_table.c +SRCS+= mem.c +SRCS+= opt_inet.h +SRCS+= opt_ktr.h +SRCS+= opt_ofed.h +SRCS+= opt_sched.h +SRCS+= pci_if.h +SRCS+= pcib_if.h +SRCS+= provider.c +SRCS+= qp.c +SRCS+= resource.c +SRCS+= vnode_if.h + CFLAGS+= -I${CXGBE} -I${.CURDIR}/../../../ofed/include -DLINUX_TYPES_DEFINED .if !defined(KERNBUILDDIR) diff --git a/sys/modules/cxgbe/t4_firmware/Makefile b/sys/modules/cxgbe/t4_firmware/Makefile index ef3bc8b02..de2b60116 100644 --- a/sys/modules/cxgbe/t4_firmware/Makefile +++ b/sys/modules/cxgbe/t4_firmware/Makefile @@ -2,24 +2,24 @@ # $FreeBSD$ # -T4FW = ${.CURDIR}/../../../dev/cxgbe/firmware +T4FW= ${.CURDIR}/../../../dev/cxgbe/firmware .PATH: ${T4FW} -KMOD = t4fw_cfg -FIRMWS = ${KMOD}.txt:${KMOD}:1.0.0.0 +KMOD= t4fw_cfg +FIRMWS= ${KMOD}.txt:${KMOD}:1.0.0.0 # You can have additional configuration files in the ${T4FW} directory. # t4fw_cfg_.txt CFG_FILES != cd ${T4FW} && echo ${KMOD}_*.txt .for F in ${CFG_FILES} .if exists(${F}) -FIRMWS += ${F}:${F:C/.txt//}:1.0.0.0 +FIRMWS+= ${F}:${F:C/.txt//}:1.0.0.0 .endif .endfor -T4FW_VER = 1.11.27.0 -FIRMWS += t4fw.fw:t4fw:${T4FW_VER} -CLEANFILES += t4fw.fw +T4FW_VER= 1.11.27.0 +FIRMWS+= t4fw.fw:t4fw:${T4FW_VER} +CLEANFILES+= t4fw.fw t4fw.fw: t4fw-${T4FW_VER}.bin.uu uudecode -o ${.TARGET} ${.ALLSRC} diff --git a/sys/modules/cxgbe/t5_firmware/Makefile b/sys/modules/cxgbe/t5_firmware/Makefile index 2d26c13c0..2f414f3ed 100644 --- a/sys/modules/cxgbe/t5_firmware/Makefile +++ b/sys/modules/cxgbe/t5_firmware/Makefile @@ -2,24 +2,24 @@ # $FreeBSD$ # -T5FW = ${.CURDIR}/../../../dev/cxgbe/firmware +T5FW= ${.CURDIR}/../../../dev/cxgbe/firmware .PATH: ${T5FW} -KMOD = t5fw_cfg -FIRMWS = ${KMOD}.txt:${KMOD}:1.0.0.0 +KMOD= t5fw_cfg +FIRMWS= ${KMOD}.txt:${KMOD}:1.0.0.0 # You can have additional configuration files in the ${T5FW} directory. 
# t5fw_cfg_.txt CFG_FILES != cd ${T5FW} && echo ${KMOD}_*.txt .for F in ${CFG_FILES} .if exists(${F}) -FIRMWS += ${F}:${F:C/.txt//}:1.0.0.0 +FIRMWS+= ${F}:${F:C/.txt//}:1.0.0.0 .endif .endfor -T5FW_VER = 1.11.27.0 -FIRMWS += t5fw.fw:t5fw:${T5FW_VER} -CLEANFILES += t5fw.fw +T5FW_VER= 1.11.27.0 +FIRMWS+= t5fw.fw:t5fw:${T5FW_VER} +CLEANFILES+= t5fw.fw t5fw.fw: t5fw-${T5FW_VER}.bin.uu uudecode -o ${.TARGET} ${.ALLSRC} diff --git a/sys/modules/cxgbe/tom/Makefile b/sys/modules/cxgbe/tom/Makefile index d02afd4cc..408518aa3 100644 --- a/sys/modules/cxgbe/tom/Makefile +++ b/sys/modules/cxgbe/tom/Makefile @@ -4,13 +4,21 @@ .include -CXGBE = ${.CURDIR}/../../../dev/cxgbe +CXGBE= ${.CURDIR}/../../../dev/cxgbe .PATH: ${CXGBE}/tom -KMOD = t4_tom -SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c t4_ddp.c -SRCS+= device_if.h bus_if.h pci_if.h -SRCS+= opt_inet.h opt_inet6.h +KMOD= t4_tom +SRCS= bus_if.h +SRCS+= device_if.h +SRCS+= opt_inet.h +SRCS+= opt_inet6.h +SRCS+= pci_if.h +SRCS+= t4_connect.c +SRCS+= t4_cpl_io.c +SRCS+= t4_ddp.c +SRCS+= t4_listen.c +SRCS+= t4_tom.c +SRCS+= t4_tom_l2t.c CFLAGS+= -I${CXGBE} -- 2.45.0
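
The alloc_eq() hunk above narrows the BAR2 write-combined doorbell test from '>' to '>='. A user-doorbell page holds PAGE_SIZE / UDBS_SEG_SIZE segments numbered from zero, so a qid equal to that count already lies outside the page and the WCWR doorbell must not be used for it; the old test let that one value through. A minimal standalone sketch of the boundary condition, assuming 4 KB pages and 128-byte doorbell segments (illustrative values only; the real constants come from the driver and machine headers):

#include <stdbool.h>
#include <stdio.h>

#define EX_PAGE_SIZE      4096    /* assumed page size for illustration */
#define EX_UDBS_SEG_SIZE  128     /* assumed doorbell segment size */

/* A qid fits in the doorbell page only if it maps to segment 0..N-1. */
static bool
qid_fits_in_udb_page(unsigned int udb_qid)
{
    return (udb_qid < EX_PAGE_SIZE / EX_UDBS_SEG_SIZE);
}

int
main(void)
{
    printf("qid 31 fits: %d\n", qid_fits_in_udb_page(31));  /* 1 */
    printf("qid 32 fits: %d\n", qid_fits_in_udb_page(32));  /* 0 */
    return (0);
}

With 32 segments per page the old exclusive test accepted qid 32, one past the end; the new inclusive comparison pushes such a queue onto the plain kernel-doorbell path instead.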
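
The reworked ring_fl_db() and refill_fl() above track the freelist with three indices: pidx counts software buffer slots (eight per hardware descriptor), dbidx is the hardware descriptor index the chip was last told about, and sidx is the ring size in hardware descriptors. The doorbell write hands the chip the circular distance from dbidx to pidx/8 and then advances dbidx by that amount, and refill_fl() deliberately stops one hardware descriptor short of the snapshotted hw_cidx so the chip never sees pidx equal to cidx, which it would read as an empty freelist. The sketch below models only the index arithmetic; idx_diff() and idx_incr() are simplified stand-ins for the driver's IDXDIFF/IDXINCR macros and every number is made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Distance from 'tail' forward to 'head' on a ring of 'wrap' slots. */
static uint16_t
idx_diff(uint16_t head, uint16_t tail, uint16_t wrap)
{
    return (head >= tail ? head - tail : wrap - tail + head);
}

/* Advance 'idx' by 'incr' slots, wrapping at 'wrap'. */
static uint16_t
idx_incr(uint16_t idx, uint16_t incr, uint16_t wrap)
{
    return ((idx + incr) % wrap);
}

int
main(void)
{
    uint16_t sidx = 128;   /* hardware descriptors in the ring */
    uint16_t dbidx = 126;  /* last hw descriptor given to the chip */
    uint16_t pidx = 24;    /* sw buffer slot; 8 slots per hw descriptor */
    uint16_t n;

    /* credits for the doorbell, as computed in ring_fl_db() */
    n = idx_diff(pidx / 8, dbidx, sidx);
    printf("doorbell credits: %u\n", (unsigned)n);      /* 5: 126 -> 127 -> 0 -> 1 -> 2 -> 3 */

    dbidx = idx_incr(dbidx, n, sidx);
    printf("dbidx now %u (== pidx / 8)\n", (unsigned)dbidx); /* 3 */
    return (0);
}

refill_fl() also batches the doorbell, ringing it whenever the unposted distance reaches four hardware descriptors and once more at the end if anything is still pending.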
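
t4_ulp_push_frames() above advances snd_nxt and snd_max by ulp_len, that is, the mbuf payload length plus the per-submode entry of t4_ulp_extra_len[], because the card appends iSCSI digests that never pass through the host's mbufs yet occupy TCP sequence space on the wire. A small standalone sketch of that bookkeeping, with a made-up sequence number and a plain table lookup standing in for the driver's mbuf-tag plumbing in ulp_extra_len():

#include <stdint.h>
#include <stdio.h>

/* extra wire bytes per ULP submode: none, header digest, data digest, both */
static const unsigned int ulp_extra_len[] = { 0, 4, 4, 8 };

int
main(void)
{
    uint32_t snd_nxt = 1000;   /* hypothetical send sequence number */
    unsigned int plen = 512;   /* payload bytes handed to the card */
    int ulp_submode = 3;       /* header digest + data digest */

    /*
     * The host must advance its sequence numbers by the wire length,
     * not the mbuf length, or it falls behind what the card transmitted.
     */
    snd_nxt += plen + ulp_extra_len[ulp_submode & 3];
    printf("snd_nxt -> %u (payload %u + hw-added %u)\n",
        (unsigned)snd_nxt, plen, ulp_extra_len[ulp_submode & 3]);
    return (0);
}

The same compensated length (ulp_len) is what write_tx_wr() is given as its plen argument for these packets, so the work request and the host's sequence tracking agree.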
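
The shared-CPL handling added above lets the iSCSI driver register function pointers (tom_cpl_iscsi_callback, tom_queue_iscsi_callback) that t4_tom consults for connections in ULP_MODE_ISCSI; t4_cpl_iscsi_callback() returns 0 when the hook consumed the message and 1 when the caller should fall back to the normal TOM path. The sketch below shows only that registration/dispatch shape with simplified placeholder types, not the driver's real structures:

#include <stddef.h>
#include <stdio.h>

/* simplified connection state; a placeholder, not a driver type */
struct ex_conn {
    int is_iscsi;
};

static void (*iscsi_cpl_hook)(struct ex_conn *, unsigned int);

static void
register_iscsi_cpl_hook(void (*fp)(struct ex_conn *, unsigned int))
{
    iscsi_cpl_hook = fp;    /* set when the iSCSI module loads */
}

/* Returns 0 if the hook consumed the CPL, 1 if the caller must handle it. */
static int
dispatch_cpl(struct ex_conn *c, unsigned int opcode)
{
    if (iscsi_cpl_hook != NULL && c->is_iscsi) {
        iscsi_cpl_hook(c, opcode);
        return (0);
    }
    return (1);
}

static void
example_hook(struct ex_conn *c, unsigned int opcode)
{
    (void)c;    /* unused in this sketch */
    printf("iscsi hook got opcode %u\n", opcode);
}

int
main(void)
{
    struct ex_conn iscsi_conn = { 1 }, plain_conn = { 0 };

    register_iscsi_cpl_hook(example_hook);
    printf("fallback needed: %d\n", dispatch_cpl(&iscsi_conn, 42)); /* 0 */
    printf("fallback needed: %d\n", dispatch_cpl(&plain_conn, 42)); /* 1 */
    return (0);
}

do_rx_data_ddp() above follows the same shape: it tries the iSCSI path first and falls through to the existing DDP handling only when the callback does not take the message.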