From ab250e4c0813360205407468ee330500c6635e3b Mon Sep 17 00:00:00 2001 From: hselasky Date: Wed, 6 Sep 2017 15:33:23 +0000 Subject: [PATCH] MFC r322810 and r322830: Add new mlx5ib(4) driver to the kernel source tree which supports Remote DMA over Converged Ethernet, RoCE, for the ConnectX-4 series of PCI express network cards. There is currently no user-space support and this driver only supports kernel side non-routable RoCE V1. The krping kernel module can be used to test this driver. Full user-space support including RoCE V2 will be added as part of the ongoing upgrade to ibcore from Linux 4.9. Otherwise this driver is feature equivalent to mlx4ib(4). The mlx5ib(4) kernel module will only be built when WITH_OFED=YES is specified. Approved by: re (marius) Sponsored by: Mellanox Technologies git-svn-id: svn://svn.freebsd.org/base/stable/10@323223 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- share/man/man4/Makefile | 1 + share/man/man4/mlx5ib.4 | 124 + sys/conf/files | 21 + sys/dev/mlx5/mlx5_ib/mlx5_ib.h | 791 ++++++ sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c | 146 ++ sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c | 1300 ++++++++++ sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c | 97 + sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c | 542 ++++ sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c | 2310 +++++++++++++++++ sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c | 193 ++ sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c | 1310 ++++++++++ sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c | 2991 +++++++++++++++++++++++ sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c | 252 ++ sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c | 503 ++++ sys/dev/mlx5/mlx5_ib/user.h | 318 +++ sys/modules/Makefile | 2 + sys/modules/mlx5ib/Makefile | 24 + 17 files changed, 10925 insertions(+) create mode 100644 share/man/man4/mlx5ib.4 create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib.h create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c create mode 100644 sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c create mode 100644 sys/dev/mlx5/mlx5_ib/user.h create mode 100644 sys/modules/mlx5ib/Makefile diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index f9e0cc76b..96be30a4f 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -868,6 +868,7 @@ MAN+= iscsi_initiator.4 .if ${MK_OFED} != "no" MAN+= mlx4ib.4 +MAN+= mlx5ib.4 .endif .if ${MK_TESTS} != "no" diff --git a/share/man/man4/mlx5ib.4 b/share/man/man4/mlx5ib.4 new file mode 100644 index 000000000..092627596 --- /dev/null +++ b/share/man/man4/mlx5ib.4 @@ -0,0 +1,124 @@ +.\" Copyright (c) 2017 Mellanox Technologies +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd August 23, 2017 +.Dt MLX5IB 4 +.Os +.Sh NAME +.Nm mlx5ib +.Nd "Mellanox ConnectX-4 and ConnectX-4 LX based 100Gb, 50Gb, 40Gb, 25Gb and 10Gb network adapter driver" +.Sh SYNOPSIS +To compile this driver into the kernel, +place these lines in your kernel configuration file: +.Bd -ragged -offset indent +.Cd "options COMPAT_LINUXKPI" +.Cd "device mlx5" +.Cd "device mlx5ib" +.Ed +.Pp +To load the driver as a module at run-time, +run this command as root: +.Bd -literal -offset indent +kldload mlx5ib +.Ed +.Pp +To load the driver as a +module at boot time, place this line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +mlx5ib_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +driver provides support for infiniband and Remote DMA over Converged Ethernet, +RoCE, for PCI Express network adapters based on ConnectX-4 and ConnectX-4 LX. +.br +For further hardware information and questions related to hardware +requirements, see +.Pa http://www.mellanox.com/ . +.Pp +For more information on configuring this device, see +.Xr ifconfig 8 . +.Sh HARDWARE +The +.Nm +driver supports 100Gb, 50Gb, 40Gb, 25Gb and 10Gb network adapters. +ConnectX-4 supports: 10/20/25/40/50/56/100Gb/s speeds. +ConnectX-4 LX supports: 10/25/40/50Gb/s speeds (and reduced power consumption): +.Pp +.Bl -bullet -compact +.It +Mellanox MCX455A-ECAT +.It +Mellanox MCX456A-ECAT +.It +Mellanox MCX415A-CCAT +.It +Mellanox MCX416A-CCAT +.It +Mellanox MCX455A-FCAT +.It +Mellanox MCX456A-FCAT +.It +Mellanox MCX415A-BCAT +.It +Mellanox MCX416A-BCAT +.It +Mellanox MCX4131A-GCAT +.It +Mellanox MCX4131A-BCAT +.It +Mellanox MCX4121A-ACAT +.It +Mellanox MCX4111A-ACAT +.It +Mellanox MCX4121A-XCAT +.It +Mellanox MCX4111A-XCAT +.El +.Sh SUPPORT +For general information and support, +go to the Mellanox support website at: +.Pa http://www.mellanox.com/ . +.Pp +If an issue is identified with this driver with a supported adapter, +email all the specific information related to the issue to +.Aq Mt freebsd-drivers@mellanox.com . +.Sh SEE ALSO +.Xr mlx5en 4 , +.Xr ifconfig 8 +.Sh HISTORY +The +.Nm +device driver first appeared in +.Fx 12.x . +.Sh AUTHORS +.An -nosplit +The +.Nm +driver was written by +.An Mellanox Technologies . diff --git a/sys/conf/files b/sys/conf/files index 24333a608..1c481231b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3933,6 +3933,27 @@ ofed/drivers/net/mlx4/en_tx.c optional mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +dev/mlx5/mlx5_ib/mlx5_ib_ah.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_cq.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_mad.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_main.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_mem.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_mr.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_qp.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_roce.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_srq.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" + dev/mlx5/mlx5_core/mlx5_alloc.c optional mlx5 pci \ compile-with "${OFED_C}" dev/mlx5/mlx5_core/mlx5_cmd.c optional mlx5 pci \ diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h new file mode 100644 index 000000000..7bfb9ea08 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h @@ -0,0 +1,791 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("mlx5_dbg:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, curthread->td_proc->p_pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +printf("mlx5_ib: ERR: ""mlx5_err:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, curthread->td_proc->p_pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +printf("mlx5_ib: WARN: ""mlx5_warn:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, curthread->td_proc->p_pid, ##arg) +#define BF_ENABLE 0 + +extern struct workqueue_struct *mlx5_ib_wq; + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, + MLX5_IB_MMAP_MAP_DC_INFO_PAGE = 4, + + /* Use EXP mmap commands until it is pushed to upstream */ + MLX5_IB_EXP_MMAP_CORE_CLOCK = 0xFB, + MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA = 0xFC, + MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA = 0xFD, + MLX5_IB_EXP_ALLOC_N_MMAP_WC = 0xFE, +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum { + MLX5_DCT_CS_RES_64 = 2, + MLX5_CNAK_RX_POLL_CQ_QUOTA = 256, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, + MLX5_IB_LATENCY_CLASS_FAST_PATH +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX5_CROSS_CHANNEL_UUAR = 0, +}; + +enum { + MLX5_IB_MAX_CTX_DYNAMIC_UARS = 256, + MLX5_IB_INVALID_UAR_INDEX = -1U +}; + +enum { + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES = 13, + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES = 6, + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES = 16, + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES = 9, +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_uuar_info uuari; + u32 dynamic_wc_uar_index[MLX5_IB_MAX_CTX_DYNAMIC_UARS]; + /* Transport Domain number */ + u32 tdn; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pa_lkey; +}; + +struct wr_list { + u16 opcode; + u16 next; +}; + +struct mlx5_swr_ctx { + u64 wrid; + u32 wr_data; + struct wr_list w_list; + u32 wqe_head; + u8 sig_piped; + u8 rsvd[11]; +}; + +struct mlx5_rwr_ctx { + u64 wrid; +}; + +struct mlx5_ib_wq { + union { + struct mlx5_swr_ctx *swr_ctx; + struct mlx5_rwr_ctx *rwr_ctx; + }; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +enum { + MLX5_WQ_USER, + MLX5_WQ_KERNEL +}; + +struct mlx5_ib_sqd { + struct mlx5_ib_qp *qp; + struct work_struct work; +}; + +struct mlx5_ib_mc_flows_list { + struct list_head flows_list; + /*Protect the flows_list*/ + struct mutex lock; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + struct mlx5_core_qp mqp; + struct mlx5_core_qp mrq; + struct mlx5_core_qp msq; + u32 tisn; + u32 tirn; + struct mlx5_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u32 doorbell_qpn; + u8 sq_signal_bits; + u8 fm_cache; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx5_ib_wq sq; + + struct ib_umem *umem; + int buf_size; + /* Raw Ethernet QP's SQ is allocated seperately + * from the RQ's buffer in user-space. + */ + struct ib_umem *sq_umem; + int sq_buf_size; + u64 sq_buf_addr; + int allow_mp_wqe; + + /* serialize qp state modifications + */ + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 state; + /* Raw Ethernet QP's SQ and RQ states */ + u8 rq_state; + u8 sq_state; + int mlx_type; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf *bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int uuarn; + + int create_type; + u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; + + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + + struct mlx5_ib_mc_flows_list mc_flows_list; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_buf buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, + MLX5_IB_QP_CAP_RX_END_PADDING = 1 << 5, +}; + +struct mlx5_umr_wr { + union { + u64 virt_addr; + u64 offset; + } target; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u64 length; + int access_flags; + u32 mkey; +}; + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; + struct list_head list_send_qp; + struct list_head list_recv_qp; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + struct mlx5_core_mr mmr; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + int umred; + dma_addr_t dma; + int npages; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; + u32 max_reg_descriptors; + u64 size; + u64 page_count; + struct mlx5_ib_mr **children; + int nchild; +}; + +struct mlx5_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + +struct umr_common { + struct ib_pd *pd; + struct ib_mr *mr; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_ib_fmr { + struct ib_fmr ibfmr; + struct mlx5_core_mr mr; + int access_flags; + int state; + /* protect fmr state + */ + spinlock_t lock; + u64 wrid; + struct ib_send_wr wr[2]; + u8 page_shift; + struct ib_fast_reg_page_list page_list; +}; + +struct cache_order { + struct kobject kobj; + int order; + int index; + struct mlx5_ib_dev *dev; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + u32 order; + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; + struct cache_order co; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + int last_add; + int rel_timeout; + int rel_imm; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; + struct ib_srq *s1; +}; + +struct mlx5_dc_tracer { + struct page *pg; + dma_addr_t dma; + int size; + int order; +}; + +struct mlx5_dc_desc { + dma_addr_t dma; + void *buf; +}; + +enum mlx5_op { + MLX5_WR_OP_MLX = 1, +}; + +struct mlx5_mlx_wr { + u8 sl; + u16 dlid; + int icrc; +}; + +struct mlx5_send_wr { + struct ib_send_wr wr; + union { + struct mlx5_mlx_wr mlx; + } sel; +}; + +struct mlx5_dc_data { + struct ib_mr *mr; + struct ib_qp *dcqp; + struct ib_cq *rcq; + struct ib_cq *scq; + unsigned int rx_npages; + unsigned int tx_npages; + struct mlx5_dc_desc *rxdesc; + struct mlx5_dc_desc *txdesc; + unsigned int max_wqes; + unsigned int cur_send; + unsigned int last_send_completed; + int tx_pending; + struct mlx5_ib_dev *dev; + int port; + int initialized; + struct kobject kobj; + unsigned long connects; + unsigned long cnaks; + unsigned long discards; + struct ib_wc wc_tbl[MLX5_CNAK_RX_POLL_CQ_QUOTA]; +}; + +struct mlx5_ib_port_sysfs_group { + struct kobject kobj; + bool enabled; + struct attribute_group counters; +}; + +#define MLX5_IB_GID_MAX 16 + +struct mlx5_ib_port { + struct mlx5_ib_dev *dev; + u8 port_num; /* 0 based */ + u8 port_gone; /* set when gone */ + u16 q_cnt_id; + struct mlx5_ib_port_sysfs_group group; + union ib_gid gid_table[MLX5_IB_GID_MAX]; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + MLX5_DECLARE_DOORBELL_LOCK(uar_lock); + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + struct mutex slow_path_mutex; + int enable_atomic_resp; + enum ib_atomic_cap atomic_cap; + struct mlx5_mr_cache cache; + struct kobject mr_cache; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + struct timer_list delay_timer; + int fill_delay; + struct mlx5_dc_tracer dctr; + struct mlx5_dc_data dcd[MLX5_MAX_PORTS]; + struct kobject *dc_kobj; + /* Array with num_ports elements */ + struct mlx5_ib_port *port; + struct kobject *ports_parent; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp, mqp); +} + +static inline struct mlx5_ib_qp *sq_to_mibqp(struct mlx5_core_qp *msq) +{ + return container_of(msq, struct mlx5_ib_qp, msq); +} + +static inline struct mlx5_ib_qp *rq_to_mibqp(struct mlx5_core_qp *mrq) +{ + return container_of(mrq, struct mlx5_ib_qp, mrq); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); +} + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, uintptr_t virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +int mlx5_ib_resolve_grh(const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast); +struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah, enum rdma_link_layer ll); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata, int mr_id); +struct ib_mr *mlx5_ib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int access_flags, + u64 *virt_addr); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); + +struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr); +int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova); +int mlx5_ib_unmap_fmr(struct list_head *fmr_list); +int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, + struct ib_smp *out_mad); +int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, + __be64 *sys_image_guid); +int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, + u16 *max_pkeys); +int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, + u32 *vendor_id); +int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc); +int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid); +int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int mlx5_query_port_mad_ifc(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int umr); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_query_port_roce(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port, int index, + __be16 ah_udp_s_port); +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port, + int index, int *gid_type); +struct net_device *mlx5_ib_get_netdev(struct ib_device *ib_dev, u8 port); +int modify_gid_roce(struct ib_device *ib_dev, u8 port, unsigned int index, + const union ib_gid *gid, struct net_device *ndev); +int query_gid_roce(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid); +int mlx5_process_mad_mad_ifc(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, + struct ib_grh *in_grh, struct ib_mad *in_mad, + struct ib_mad *out_mad); + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +#endif /* MLX5_IB_H */ diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c new file mode 100644 index 000000000..d35390d33 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "mlx5_ib.h" + +#define IPV6_DEFAULT_HOPLIMIT 64 + +int mlx5_ib_resolve_grh(const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast) +{ + struct in6_addr in6; + + if (is_mcast != NULL) + *is_mcast = 0; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); + if (rdma_link_local_addr(&in6)) { + rdma_get_ll_mac(&in6, mac); + } else if (rdma_is_multicast_addr(&in6)) { + rdma_get_mcast_mac(&in6, mac); + if (is_mcast != NULL) + *is_mcast = 1; + } else { + return -EINVAL; + } + return 0; +} + + +struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah, enum rdma_link_layer ll) +{ + int err; + int gid_type; + + if (ah_attr->ah_flags & IB_AH_GRH) { + memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | + (1 << 30) | + ah_attr->grh.sgid_index << 20); + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.tclass = ah_attr->grh.traffic_class; + } + + ah->av.stat_rate_sl = (ah_attr->static_rate << 4); + + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_get_roce_gid_type(dev, ah_attr->port_num, + ah_attr->grh.sgid_index, + &gid_type); + if (err) + return ERR_PTR(err); + + mlx5_ib_resolve_grh(ah_attr, ah->av.rmac, NULL); + ah->av.udp_sport = mlx5_get_roce_udp_sport( + dev, + ah_attr->port_num, + ah_attr->grh.sgid_index, + 0); + ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1; + ah->av.hop_limit = ah_attr->grh.hop_limit; + /* TODO: initialize other eth fields */ + } else { + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl |= (ah_attr->sl & 0xf); + } + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_link_layer ll; + struct ib_ah *ret = ERR_PTR(-EINVAL); + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); + + if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH)) + goto err_kfree_ah; + + return create_ib_ah(dev, ah_attr, ah, ll); /* never fails */ + +err_kfree_ah: + kfree(ah); + return ret; +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.sgid_index = (tmp >> 20) & 0xff; + ah_attr->grh.flow_label = tmp & 0xfffff; + memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16); + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.traffic_class = ah->av.tclass; + } + ah_attr->dlid = be16_to_cpu(ah->av.rlid); + ah_attr->static_rate = ah->av.stat_rate_sl >> 4; + ah_attr->sl = ah->av.stat_rate_sl & 0xf; + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c new file mode 100644 index 000000000..db83b6413 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c @@ -0,0 +1,1300 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "user.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, int type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size) +{ + return mlx5_buf_offset(&buf->buf, n * size); +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->swr_ctx[idx].wr_data) { + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_FAST_REG_MR: + return IB_WC_FAST_REG_MR; + + default: + printf("mlx5_ib: WARN: ""unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_NOP: + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 g; +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + u8 udp_header_valid; +#endif + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + +#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) + udp_header_valid = wc->sl & 0x8; + if (udp_header_valid) + wc->wc_flags |= IB_WC_WITH_UDP_HDR; + +#endif +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + __be32 *p = (__be32 *)cqe; + int i; + + mlx5_ib_warn(dev, "dump error cqe\n"); + for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) + printf("mlx5_ib: INFO: ""%08x %08x %08x %08x\n", be32_to_cpu(p[0]), be32_to_cpu(p[1]), be32_to_cpu(p[2]), be32_to_cpu(p[3])); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, u16 idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, u16 idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(uintptr_t)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(u32 *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(u64 *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.swr_ctx[idx].w_list.next; + } while (1); + tail = qp->sq.swr_ctx[idx].w_list.next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(dev->mdev, &buf->buf); +} + +static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned cur; + unsigned idx; + int np; + int i; + + wq = &qp->sq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + idx = wq->last_poll & (wq->wqe_cnt - 1); + wc->wr_id = wq->swr_ctx[idx].wrid; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + wq->last_poll = wq->swr_ctx[idx].w_list.next; + } + *npolled = np; +} + +static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned cur; + int np; + int i; + + wq = &qp->rq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + } + *npolled = np; +} + +static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun mmics ones */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + sw_send_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } + + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + sw_recv_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } +} + +static inline u32 mlx5_ib_base_mkey(const u32 key) +{ + return key & 0xffffff00u; +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; + unsigned long flags; + u8 opcode; + u32 qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + if (unlikely(!mqp)) { + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", + cq->mcq.cqn, qpn); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->swr_ctx[idx].wrid; + wq->tail = wq->swr_ctx[idx].wqe_head + 1; + if (unlikely(wq->swr_ctx[idx].w_list.opcode & + MLX5_OPCODE_SIGNATURE_CANCELED)) + wc->status = IB_WC_GENERAL_ERR; + else + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->swr_ctx[idx].wrid; + wq->tail = wq->swr_ctx[idx].wqe_head + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + spin_lock_irqsave(&dev->mdev->priv.mr_table.lock, flags); + mmr = __mlx5_mr_lookup(dev->mdev, + mlx5_ib_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR\n", cq->mcq.cqn); + + spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags); + goto repoll; + } + + return 0; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } + + for (npolled = 0; npolled < num_entries; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); +out: + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + + + mlx5_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, + MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), + to_mcq(ibcq)->mcq.cons_index); + + return 0; +} + +static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, + int nent, int cqe_size) +{ + int err; + + err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, + PAGE_SIZE * 2, &buf->buf); + if (err) { + mlx5_ib_err(dev, "alloc failed\n"); + return err; + } + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, struct mlx5_create_cq_mbox_in **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_exp_ib_create_cq ucmd; + size_t ucmdlen; + int page_shift; + int npages; + int ncont; + int err; + + memset(&ucmd, 0, sizeof(ucmd)); + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(struct mlx5_ib_create_cq)) ? + (sizeof(struct mlx5_ib_create_cq) - sizeof(ucmd.reserved)) : + sizeof(struct mlx5_ib_create_cq); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_err(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && ucmd.reserved != 0) { + mlx5_ib_err(dev, "command corrupted\n"); + return -EINVAL; + } + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) { + mlx5_ib_warn(dev, "wrong CQE size %d\n", ucmd.cqe_size); + return -EINVAL; + } + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) { + mlx5_ib_warn(dev, "map failed\n"); + goto err_umem; + } + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + (unsigned long long)ucmd.buf_addr, entries * ucmd.cqe_size, + npages, page_shift, ncont); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + *index = to_mucontext(context)->uuari.uars[0].index; + + if (*cqe_size == 64 && MLX5_CAP_GEN(dev->mdev, cqe_compression)) { + if (ucmd.exp_data.cqe_comp_en == 1 && + (ucmd.exp_data.comp_mask & MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_EN)) { + MLX5_SET(cqc, &(*cqb)->ctx, cqe_compression_en, 1); + if (ucmd.exp_data.cqe_comp_recv_type == + MLX5_IB_CQE_FORMAT_CSUM && + (ucmd.exp_data.comp_mask & + MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_RECV_TYPE)) + MLX5_SET(cqc, &(*cqb)->ctx, mini_cqe_res_format, + MLX5_IB_CQE_FORMAT_CSUM); + } + } + + return 0; + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + struct mlx5_create_cq_mbox_in **cqb, + int *index, int *inlen) +{ + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_buf(cq, &cq->buf); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); + + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + *index = dev->mdev->priv.uuari.uars[0].index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_create_cq_mbox_in *cqb = NULL; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + int cqe_size; + int irqn; + int eqn; + int err; + + if (entries < 0 || roundup_pow_of_two(entries + 1) > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d(%ld), max %d\n", + entries, roundup_pow_of_two(entries + 1), + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return ERR_PTR(-EINVAL); + } + + entries = roundup_pow_of_two(entries + 1); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + cqe_size = (cache_line_size() >= 128 ? 128 : 64); + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + } + + cq->cqe_size = cqe_size; + cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cqb->ctx.c_eqn = cpu_to_be16(eqn); + cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + + int err; + u32 fsel = 0; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + if (1) { + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { + fsel |= (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + if (cq_period & 0xf000) { + /* A value higher than 0xfff is required, better + * use the largest value possible. */ + cq_period = 0xfff; + printf("mlx5_ib: INFO: ""period supported is limited to 12 bits\n"); + } + + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + } else { + err = -ENOSYS; + goto out; + } + } + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); + +out: + kfree(in); + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + printf("mlx5_ib: WARN: ""resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { + mlx5_ib_warn(dev, "Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1 || roundup_pow_of_two(entries + 1) > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d(%ld), max %d\n", + entries, roundup_pow_of_two(entries + 1), + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return -EINVAL; + } + + entries = roundup_pow_of_two(entries + 1); + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) { + mlx5_ib_warn(dev, "resize failed: %d\n", err); + goto ex; + } + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) { + mlx5_ib_warn(dev, "modify cq failed: %d\n", err); + goto ex_alloc; + } + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c new file mode 100644 index 000000000..f9d6721ec --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c @@ -0,0 +1,97 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + uintptr_t user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, uintptr_t virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + struct ib_umem_chunk *chunk; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + chunk = list_entry(page->umem->chunk_list.next, + struct ib_umem_chunk, list); + db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c new file mode 100644 index 000000000..2a4f6577b --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c @@ -0,0 +1,542 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include + +#define MAX_U32 0xffffffffULL +#define MAX_U16 0xffffUL + +/* Counters should be saturate once they reach their maximum value */ +#define ASSIGN_32BIT_COUNTER(counter, value) do { \ + if ((value) > MAX_U32) \ + counter = cpu_to_be32(MAX_U32); \ + else \ + counter = cpu_to_be32(value); \ +} while (0) + +/* Counters should be saturate once they reach their maximum value */ +#define ASSIGN_16BIT_COUNTER(counter, value) do { \ + if ((value) > MAX_U16) \ + counter = cpu_to_be16(MAX_U16); \ + else \ + counter = cpu_to_be16(value); \ +} while (0) + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + struct mlx5_vport_counters *vc) +{ + pma_cnt_ext->port_xmit_data = cpu_to_be64((vc->transmitted_ib_unicast.octets + + vc->transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_rcv_data = cpu_to_be64((vc->received_ib_unicast.octets + + vc->received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = cpu_to_be64(vc->transmitted_ib_unicast.packets + + vc->transmitted_ib_multicast.packets); + pma_cnt_ext->port_rcv_packets = cpu_to_be64(vc->received_ib_unicast.packets + + vc->received_ib_multicast.packets); + pma_cnt_ext->port_unicast_xmit_packets = cpu_to_be64(vc->transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = cpu_to_be64(vc->received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = cpu_to_be64(vc->transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = cpu_to_be64(vc->received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + struct mlx5_vport_counters *vc) +{ + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (vc->transmitted_ib_unicast.octets + + vc->transmitted_ib_multicast.octets) >> 2); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (vc->received_ib_unicast.octets + + vc->received_ib_multicast.octets) >> 2); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + vc->transmitted_ib_unicast.packets + + vc->transmitted_ib_multicast.packets); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + vc->received_ib_unicast.packets + + vc->received_ib_multicast.packets); +} + +static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_vport_counters *vc; + int err; + int ext; + + vc = kzalloc(sizeof(*vc), GFP_KERNEL); + if (!vc) + return -ENOMEM; + + ext = in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT; + + err = mlx5_get_vport_counters(dev->mdev, port_num, vc); + if (!err) { + if (ext) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + + pma_cnt_ext_assign(pma_cnt_ext, vc); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + + ASSIGN_16BIT_COUNTER(pma_cnt->port_rcv_errors, + (u16)vc->received_errors.packets); + + pma_cnt_assign(pma_cnt, vc); + } + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } + + kfree(vc); + return err; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + if (MLX5_CAP_GEN(mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { + /* TBD: read error counters from the PPCNT */ + return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + } else { + return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, + struct ib_smp *out_mad) +{ + struct ib_smp *in_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + if (!in_mad) + return -ENOMEM; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, + out_mad); + + kfree(in_mad); + return err; +} + +int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + if (err) + goto out; + + memcpy(sys_image_guid, out_mad->data + 4, 8); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + if (err) + goto out; + + *max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + if (err) + goto out; + + *vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & 0xffff; + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_desc, out_mad->data, 64); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_port_mad_ifc(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int err = -ENOMEM; + + if (port < 1 || port > MLX5_CAP_GEN(mdev, num_ports)) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof(*props)); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mdev->port_caps[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (mdev->port_caps[port - 1].ext_port_cap & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c new file mode 100644 index 000000000..56831ae81 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c @@ -0,0 +1,2310 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#undef inode + +#include +#include +#include +#include "user.h" +#include "mlx5_ib.h" + +#include + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "3.2-rc1" +#define DRIVER_RELDATE "May 2016" + +#undef MODULE_VERSION +#include + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1); +MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1); +MODULE_VERSION(mlx5ib, 1); + +static int deprecated_prof_sel = 2; +module_param_named(prof_sel, deprecated_prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); + +enum { + MLX5_STANDARD_ATOMIC_SIZE = 0x8, +}; + +struct workqueue_struct *mlx5_ib_wq; + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; + +static void get_atomic_caps(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + int tmp; + u8 atomic_operations; + u8 atomic_size_qp; + u8 atomic_req_endianess; + + atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev, + atomic_req_8B_endianess_mode) || + !mlx5_host_is_le(); + + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) + && (atomic_size_qp & 8)) { + if (atomic_req_endianess) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } + + tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) + &&(atomic_size_qp & 8)) { + if (atomic_req_endianess) + props->masked_atomic_cap = IB_ATOMIC_HCA; + else { + props->masked_atomic_cap = IB_ATOMIC_NONE; + } + } else { + props->masked_atomic_cap = IB_ATOMIC_NONE; + } +} + +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + switch (MLX5_CAP_GEN(dev->mdev, port_type)) { + case MLX5_CAP_PORT_TYPE_IB: + return IB_LINK_LAYER_INFINIBAND; + case MLX5_CAP_PORT_TYPE_ETH: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} + +static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) +{ + return !dev->mdev->issi; +} + +enum { + MLX5_VPORT_ACCESS_METHOD_MAD, + MLX5_VPORT_ACCESS_METHOD_HCA, + MLX5_VPORT_ACCESS_METHOD_NIC, +}; + +static int mlx5_get_vport_access_method(struct ib_device *ibdev) +{ + if (mlx5_use_mad_ifc(to_mdev(ibdev))) + return MLX5_VPORT_ACCESS_METHOD_MAD; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) + return MLX5_VPORT_ACCESS_METHOD_NIC; + + return MLX5_VPORT_ACCESS_METHOD_HCA; +} + +static int mlx5_query_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_system_image_guid_mad_ifc(ibdev, + sys_image_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + return err; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + return err; + + default: + return -EINVAL; + } +} + +static int mlx5_query_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, + pkey_table_size)); + return 0; + + default: + return -EINVAL; + } +} + +static int mlx5_query_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_core_query_vendor_id(dev->mdev, vendor_id); + + default: + return -EINVAL; + } +} + +static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, + __be64 *node_guid) +{ + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(&dev->ib_dev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_node_guid_mad_ifc(dev, node_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); + if (!err) + *node_guid = cpu_to_be64(tmp); + return err; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + if (!err) + *node_guid = cpu_to_be64(tmp); + return err; + + default: + return -EINVAL; + } +} + +struct mlx5_reg_node_desc { + u8 desc[64]; +}; + +static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct mlx5_reg_node_desc in; + + if (mlx5_use_mad_ifc(dev)) + return mlx5_query_node_desc_mad_ifc(dev, node_desc); + + memset(&in, 0, sizeof(in)); + + return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, + sizeof(struct mlx5_reg_node_desc), + MLX5_REG_NODE_DESC, 0, 0); +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + int max_sq_desc; + int max_rq_sg; + int max_sq_sg; + int err; + + + memset(props, 0, sizeof(*props)); + + err = mlx5_query_system_image_guid(ibdev, + &props->sys_image_guid); + if (err) + return err; + + err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); + if (err) + return err; + + err = mlx5_query_vendor_id(ibdev, &props->vendor_id); + if (err) + return err; + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + ((u64)fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (MLX5_CAP_GEN(mdev, pkv)) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (MLX5_CAP_GEN(mdev, qkv)) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (MLX5_CAP_GEN(mdev, apm)) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if (MLX5_CAP_GEN(mdev, xrc)) + props->device_cap_flags |= IB_DEVICE_XRC; + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (MLX5_CAP_GEN(mdev, block_lb_mc)) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + props->vendor_part_id = mdev->pdev->device; + props->hw_ver = mdev->pdev->revision; + + props->max_mr_size = ~0ull; + props->page_size_cap = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) -1); + props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); + props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); + max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); + max_sq_sg = (max_sq_desc - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; + props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); + props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); + props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); + props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); + props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; + props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = (unsigned int)-1; + get_atomic_caps(dev, props); + props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); + props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->max_ah = INT_MAX; + + return 0; +} + +enum mlx5_ib_width { + MLX5_IB_WIDTH_1X = 1 << 0, + MLX5_IB_WIDTH_2X = 1 << 1, + MLX5_IB_WIDTH_4X = 1 << 2, + MLX5_IB_WIDTH_8X = 1 << 3, + MLX5_IB_WIDTH_12X = 1 << 4 +}; + +static int translate_active_width(struct ib_device *ibdev, u8 active_width, + u8 *ib_width) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err = 0; + + if (active_width & MLX5_IB_WIDTH_1X) { + *ib_width = IB_WIDTH_1X; + } else if (active_width & MLX5_IB_WIDTH_2X) { + mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n", + (int)active_width); + err = -EINVAL; + } else if (active_width & MLX5_IB_WIDTH_4X) { + *ib_width = IB_WIDTH_4X; + } else if (active_width & MLX5_IB_WIDTH_8X) { + *ib_width = IB_WIDTH_8X; + } else if (active_width & MLX5_IB_WIDTH_12X) { + *ib_width = IB_WIDTH_12X; + } else { + mlx5_ib_dbg(dev, "Invalid active_width %d\n", + (int)active_width); + err = -EINVAL; + } + + return err; +} + +/* + * TODO: Move to IB core + */ +enum ib_max_vl_num { + __IB_MAX_VL_0 = 1, + __IB_MAX_VL_0_1 = 2, + __IB_MAX_VL_0_3 = 3, + __IB_MAX_VL_0_7 = 4, + __IB_MAX_VL_0_14 = 5, +}; + +enum mlx5_vl_hw_cap { + MLX5_VL_HW_0 = 1, + MLX5_VL_HW_0_1 = 2, + MLX5_VL_HW_0_2 = 3, + MLX5_VL_HW_0_3 = 4, + MLX5_VL_HW_0_4 = 5, + MLX5_VL_HW_0_5 = 6, + MLX5_VL_HW_0_6 = 7, + MLX5_VL_HW_0_7 = 8, + MLX5_VL_HW_0_14 = 15 +}; + +static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, + u8 *max_vl_num) +{ + switch (vl_hw_cap) { + case MLX5_VL_HW_0: + *max_vl_num = __IB_MAX_VL_0; + break; + case MLX5_VL_HW_0_1: + *max_vl_num = __IB_MAX_VL_0_1; + break; + case MLX5_VL_HW_0_3: + *max_vl_num = __IB_MAX_VL_0_3; + break; + case MLX5_VL_HW_0_7: + *max_vl_num = __IB_MAX_VL_0_7; + break; + case MLX5_VL_HW_0_14: + *max_vl_num = __IB_MAX_VL_0_14; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u32 *rep; + int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); + struct mlx5_ptys_reg *ptys; + struct mlx5_pmtu_reg *pmtu; + struct mlx5_pvlc_reg pvlc; + void *ctx; + int err; + + rep = mlx5_vzalloc(outlen); + ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); + pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); + if (!rep || !ptys || !pmtu) { + err = -ENOMEM; + goto out; + } + + memset(props, 0, sizeof(*props)); + + /* what if I am pf with dual port */ + err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen); + if (err) + goto out; + + ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context); + + props->lid = MLX5_GET(hca_vport_context, ctx, lid); + props->lmc = MLX5_GET(hca_vport_context, ctx, lmc); + props->sm_lid = MLX5_GET(hca_vport_context, ctx, sm_lid); + props->sm_sl = MLX5_GET(hca_vport_context, ctx, sm_sl); + props->state = MLX5_GET(hca_vport_context, ctx, vport_state); + props->phys_state = MLX5_GET(hca_vport_context, ctx, + port_physical_state); + props->port_cap_flags = MLX5_GET(hca_vport_context, ctx, cap_mask1); + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); + props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, + pkey_violation_counter); + props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, + qkey_violation_counter); + props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, + subnet_timeout); + props->init_type_reply = MLX5_GET(hca_vport_context, ctx, + init_type_reply); + + ptys->proto_mask |= MLX5_PTYS_IB; + ptys->local_port = port; + err = mlx5_core_access_ptys(mdev, ptys, 0); + if (err) + goto out; + + err = translate_active_width(ibdev, ptys->ib_link_width_oper, + &props->active_width); + if (err) + goto out; + + props->active_speed = (u8)ptys->ib_proto_oper; + + pmtu->local_port = port; + err = mlx5_core_access_pmtu(mdev, pmtu, 0); + if (err) + goto out; + + props->max_mtu = pmtu->max_mtu; + props->active_mtu = pmtu->oper_mtu; + + memset(&pvlc, 0, sizeof(pvlc)); + pvlc.local_port = port; + err = mlx5_core_access_pvlc(mdev, &pvlc, 0); + if (err) + goto out; + + err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap, + &props->max_vl_num); +out: + kvfree(rep); + kfree(ptys); + kfree(pmtu); + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_port_mad_ifc(ibdev, port, props); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_port_ib(ibdev, port, props); + + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_port_roce(ibdev, port, props); + + default: + return -EINVAL; + } +} + +static inline int +mlx5_addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + if (dev->if_addrlen != ETH_ALEN) + return -1; + memcpy(eui, IF_LLADDR(dev), 3); + memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); + + /* NOTE: The scope ID is added by the GID to IP conversion */ + + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; +} + +static void +mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx5_addrconf_ifid_eui48(&gid->raw[8], dev); +} + +static inline int +mlx5_ip2gid(const struct sockaddr *addr, union ib_gid *gid) +{ + switch (addr->sa_family) { + case AF_INET: + ipv6_addr_set_v4mapped(((const struct sockaddr_in *)addr)->sin_addr.s_addr, + (struct in6_addr *)gid->raw); + break; + case AF_INET6: + memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16); + /* clear SCOPE ID */ + gid->raw[2] = 0; + gid->raw[3] = 0; + break; + default: + return -EINVAL; + } + return 0; +} + +static void +mlx5_ib_roce_port_update(void *arg) +{ + struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg; + struct mlx5_ib_dev *dev = port->dev; + struct mlx5_core_dev *mdev = dev->mdev; + struct net_device *xdev[MLX5_IB_GID_MAX]; + struct net_device *idev; + struct net_device *ndev; + struct ifaddr *ifa; + union ib_gid gid_temp; + + while (port->port_gone == 0) { + int update = 0; + int gid_index = 0; + int j; + int error; + + ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH); + if (ndev == NULL) { + pause("W", hz); + continue; + } + + CURVNET_SET_QUIET(ndev->if_vnet); + + memset(&gid_temp, 0, sizeof(gid_temp)); + mlx5_make_default_gid(ndev, &gid_temp); + if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) { + port->gid_table[gid_index] = gid_temp; + update = 1; + } + xdev[gid_index] = ndev; + gid_index++; + + IFNET_RLOCK(); + TAILQ_FOREACH(idev, &V_ifnet, if_link) { + if (idev == ndev) + break; + } + if (idev != NULL) { + TAILQ_FOREACH(idev, &V_ifnet, if_link) { + if (idev != ndev) { + if (idev->if_type != IFT_L2VLAN) + continue; + if (ndev != rdma_vlan_dev_real_dev(idev)) + continue; + } + /* clone address information for IPv4 and IPv6 */ + IF_ADDR_RLOCK(idev); + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + (ifa->ifa_addr->sa_family != AF_INET && + ifa->ifa_addr->sa_family != AF_INET6) || + gid_index >= MLX5_IB_GID_MAX) + continue; + memset(&gid_temp, 0, sizeof(gid_temp)); + mlx5_ip2gid(ifa->ifa_addr, &gid_temp); + /* check for existing entry */ + for (j = 0; j != gid_index; j++) { + if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0) + break; + } + /* check if new entry must be added */ + if (j == gid_index) { + if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) { + port->gid_table[gid_index] = gid_temp; + update = 1; + } + xdev[gid_index] = idev; + gid_index++; + } + } + IF_ADDR_RUNLOCK(idev); + } + } + IFNET_RUNLOCK(); + CURVNET_RESTORE(); + + if (update != 0 && + mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { + struct ib_event event = { + .device = &dev->ib_dev, + .element.port_num = port->port_num + 1, + .event = IB_EVENT_GID_CHANGE, + }; + + /* add new entries, if any */ + for (j = 0; j != gid_index; j++) { + error = modify_gid_roce(&dev->ib_dev, port->port_num, j, + port->gid_table + j, xdev[j]); + if (error != 0) + printf("mlx5_ib: Failed to update ROCE GID table: %d\n", error); + } + memset(&gid_temp, 0, sizeof(gid_temp)); + + /* clear old entries, if any */ + for (; j != MLX5_IB_GID_MAX; j++) { + if (bcmp(&gid_temp, port->gid_table + j, sizeof(gid_temp)) == 0) + continue; + port->gid_table[j] = gid_temp; + (void) modify_gid_roce(&dev->ib_dev, port->port_num, j, + port->gid_table + j, ndev); + } + + /* make sure ibcore gets updated */ + ib_dispatch_event(&event); + } + pause("W", hz); + } + do { + struct ib_event event = { + .device = &dev->ib_dev, + .element.port_num = port->port_num + 1, + .event = IB_EVENT_GID_CHANGE, + }; + /* make sure ibcore gets updated */ + ib_dispatch_event(&event); + + /* wait a bit */ + pause("W", hz); + } while (0); + port->port_gone = 2; + kthread_exit(); +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_gids_mad_ifc(ibdev, port, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_NIC: + if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) || + index < 0 || index >= MLX5_IB_GID_MAX || + dev->port[port - 1].port_gone != 0) + memset(gid, 0, sizeof(*gid)); + else + *gid = dev->port[port - 1].gid_table[index]; + return 0; + + default: + return -EINVAL; + } +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, + pkey); + + default: + return -EINVAL; + } +} + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, 64); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, 64); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) == + IB_LINK_LAYER_ETHERNET); + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not valid. + */ + if (is_eth) + return 0; + + mutex_lock(&dev->cap_mask_mutex); + + err = mlx5_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +enum mlx5_cap_flags { + MLX5_CAP_COMPACT_AV = 1 << 0, +}; + +static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev) +{ + *flags |= MLX5_CAP_GEN(dev, compact_address_vector) ? + MLX5_CAP_COMPACT_AV : 0; +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req; + struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_ucontext *context; + struct mlx5_uuar_info *uuari; + struct mlx5_uar *uars; + int gross_uuars; + int num_uars; + int ver; + int uuarn; + int err; + int i; + size_t reqlen; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else { + mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen); + return ERR_PTR(-EINVAL); + } + + err = ib_copy_from_udata(&req, udata, reqlen); + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + return ERR_PTR(err); + } + + if (req.reserved) { + mlx5_ib_err(dev, "request corrupted\n"); + return ERR_PTR(-EINVAL); + } + + if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) { + mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars); + return ERR_PTR(-ENOMEM); + } + + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) { + mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n", + req.total_num_uuars, req.total_num_uuars); + return ERR_PTR(-EINVAL); + } + + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); + if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) + resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); + resp.cache_line_size = L1_CACHE_BYTES; + resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); + resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); + resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + set_mlx5_flags(&resp.flags, dev->mdev); + + if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen) + resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc); + + if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen) + resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + uuari = &context->uuari; + mutex_init(&uuari->lock); + uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); + if (!uars) { + err = -ENOMEM; + goto out_ctx; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), + sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_uar_ctx; + } + /* + * clear all fast path uuars + */ + for (i = 0; i < gross_uuars; i++) { + uuarn = i & 3; + if (uuarn == 2 || uuarn == 3) + set_bit(i, uuari->bitmap); + } + + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); + if (err) { + mlx5_ib_err(dev, "uar alloc failed at %d\n", i); + goto out_uars; + } + } + for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) + context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX; + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_uuars = req.total_num_uuars; + resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); + err = ib_copy_to_udata(udata, &resp, + min_t(size_t, udata->outlen, sizeof(resp))); + if (err) + goto out_uars; + + uuari->ver = ver; + uuari->num_low_latency_uuars = req.num_low_latency_uuars; + uuari->uars = uars; + uuari->num_uars = num_uars; + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); + if (err) + goto out_uars; + } + + return &context->ibucontext; + +out_uars: + for (i--; i >= 0; i--) + mlx5_cmd_free_uar(dev->mdev, uars[i].index); + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_uar_ctx: + kfree(uars); + +out_ctx: + kfree(context); + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + int i; + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) + mlx5_dealloc_transport_domain(dev->mdev, context->tdn); + + for (i = 0; i < uuari->num_uars; i++) { + if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) + mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + } + for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) { + if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX) + mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->uars); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +{ + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc, + struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) +{ + unsigned long idx; + phys_addr_t pfn; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) { + mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n", + (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start)); + return -EINVAL; + } + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) { + mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n", + idx, uuari->num_uars); + return -EINVAL; + } + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, + (unsigned long long)pfn); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) { + mlx5_ib_err(dev, "io remap failed\n"); + return -EAGAIN; + } + + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC", + (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); + + return 0; +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + unsigned long command; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_REGULAR_PAGE: + return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), + true, + uuari, dev, context); + + break; + + case MLX5_IB_MMAP_WC_PAGE: + return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), + true, uuari, dev, context); + break; + + case MLX5_IB_MMAP_NC_PAGE: + return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot), + false, uuari, dev, context); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) +{ + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_core_mr mr; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + seg = &in->seg; + seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); + goto err_in; + } + + kfree(in); + *key = mr.key; + + return 0; + +err_in: + kfree(in); + + return err; +} + +static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_core_mr mr; + int err; + + memset(&mr, 0, sizeof(mr)); + mr.key = key; + err = mlx5_core_destroy_mkey(dev->mdev, &mr); + if (err) + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + mlx5_ib_warn(dev, "pd alloc failed\n"); + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_ib_err(dev, "copy failed\n"); + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } else { + err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); + if (err) { + mlx5_ib_err(dev, "alloc mkey failed\n"); + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(err); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + if (!pd->uobject) + free_pa_mkey(mdev, mpd->pa_lkey); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + err = -EOPNOTSUPP; + else + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + err = -EOPNOTSUPP; + else + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); + if (err) + return err; + + return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_qp *mqp; + struct mlx5_ib_cq *send_mcq, *recv_mcq; + struct mlx5_core_cq *mcq; + struct list_head cq_armed_list; + unsigned long flags_qp; + unsigned long flags_cq; + unsigned long flags; + + mlx5_ib_warn(ibdev, " started\n"); + INIT_LIST_HEAD(&cq_armed_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + /*At that point all inflight post send were put to be executed as of we + * lock/unlock above locks Now need to arm all involved CQs. + */ + list_for_each_entry(mcq, &cq_armed_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + mlx5_ib_warn(ibdev, " ended\n"); + return; +} + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct ib_event ibev; + + u8 port = 0; + + switch (event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + break; + + case MLX5_DEV_EVENT_PORT_UP: + ibev.event = IB_EVENT_PORT_ACTIVE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_DOWN: + case MLX5_DEV_EVENT_PORT_INITIALIZED: + ibev.event = IB_EVENT_PORT_ERR; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + port = (u8)param; + break; + + default: + break; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if ((event != MLX5_DEV_EVENT_SYS_ERROR) && + (port < 1 || port > ibdev->num_ports)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + int port; + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) + mlx5_query_ext_port_caps(dev, port); +} + +static void config_atomic_responder(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + enum ib_atomic_cap cap = props->atomic_cap; + +#if 0 + if (cap == IB_ATOMIC_HCA || + cap == IB_ATOMIC_GLOB) +#endif + dev->enable_atomic_resp = 1; + + dev->atomic_cap = cap; +} + +enum mlx5_addr_align { + MLX5_ADDR_ALIGN_0 = 0, + MLX5_ADDR_ALIGN_64 = 64, + MLX5_ADDR_ALIGN_128 = 128, +}; + +static int get_port_caps(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + int err = -ENOMEM; + int port; + + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + config_atomic_responder(dev, dprops); + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", + port, err); + break; + } + dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", + dprops->max_pkeys, pprops->gid_tbl_len); + } + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + ib_dereg_mr(dev->umrc.mr); + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_pd *pd; + struct ib_mr *mr; + int ret; + + pd = ib_alloc_pd(&dev->ib_dev); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); + ret = PTR_ERR(mr); + goto error_1; + } + + dev->umrc.mr = mr; + dev->umrc.pd = pd; + + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + return 0; + +error_4: + ib_dereg_mr(mr); +error_1: + ib_dealloc_pd(pd); +error_0: + return ret; +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.xrc.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_BASIC; + devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s1)) { + ret = PTR_ERR(devr->s1); + goto error5; + } + devr->s1->device = &dev->ib_dev; + devr->s1->pd = devr->p0; + devr->s1->uobject = NULL; + devr->s1->event_handler = NULL; + devr->s1->srq_context = NULL; + devr->s1->srq_type = IB_SRQT_BASIC; + devr->s1->ext.xrc.cq = devr->c0; + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s1->usecnt, 0); + + return 0; + +error5: + mlx5_ib_destroy_srq(devr->s0); +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + mlx5_ib_destroy_srq(devr->s1); + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); +} + +static void enable_dc_tracer(struct mlx5_ib_dev *dev) +{ + struct device *device = dev->ib_dev.dma_device; + struct mlx5_dc_tracer *dct = &dev->dctr; + int order; + void *tmp; + int size; + int err; + + size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096; + if (size <= PAGE_SIZE) + order = 0; + else + order = 1; + + dct->pg = alloc_pages(GFP_KERNEL, order); + if (!dct->pg) { + mlx5_ib_err(dev, "failed to allocate %d pages\n", order); + return; + } + + tmp = page_address(dct->pg); + memset(tmp, 0xff, size); + + dct->size = size; + dct->order = order; + dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE); + if (dma_mapping_error(device, dct->dma)) { + mlx5_ib_err(dev, "dma mapping error\n"); + goto map_err; + } + + err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma); + if (err) { + mlx5_ib_warn(dev, "failed to enable DC tracer\n"); + goto cmd_err; + } + + return; + +cmd_err: + dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE); +map_err: + __free_pages(dct->pg, dct->order); + dct->pg = NULL; +} + +static void disable_dc_tracer(struct mlx5_ib_dev *dev) +{ + struct device *device = dev->ib_dev.dma_device; + struct mlx5_dc_tracer *dct = &dev->dctr; + int err; + + if (!dct->pg) + return; + + err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma); + if (err) { + mlx5_ib_warn(dev, "failed to disable DC tracer\n"); + return; + } + + dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE); + __free_pages(dct->pg, dct->order); + dct->pg = NULL; +} + +enum { + MLX5_DC_CNAK_SIZE = 128, + MLX5_NUM_BUF_IN_PAGE = PAGE_SIZE / MLX5_DC_CNAK_SIZE, + MLX5_CNAK_TX_CQ_SIGNAL_FACTOR = 128, + MLX5_DC_CNAK_SL = 0, + MLX5_DC_CNAK_VL = 0, +}; + +static int init_dc_improvements(struct mlx5_ib_dev *dev) +{ + if (!mlx5_core_is_pf(dev->mdev)) + return 0; + + if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) + return 0; + + enable_dc_tracer(dev); + + return 0; +} + +static void cleanup_dc_improvements(struct mlx5_ib_dev *dev) +{ + + disable_dc_tracer(dev); +} + +static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) +{ + mlx5_vport_dealloc_q_counter(dev->mdev, + MLX5_INTERFACE_PROTOCOL_IB, + dev->port[port_num].q_cnt_id); + dev->port[port_num].q_cnt_id = 0; +} + +static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_ports; i++) + mlx5_ib_dealloc_q_port_counter(dev, i); +} + +static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) +{ + int i; + int ret; + + for (i = 0; i < dev->num_ports; i++) { + ret = mlx5_vport_alloc_q_counter(dev->mdev, + MLX5_INTERFACE_PROTOCOL_IB, + &dev->port[i].q_cnt_id); + if (ret) { + mlx5_ib_warn(dev, + "couldn't allocate queue counter for port %d\n", + i + 1); + goto dealloc_counters; + } + } + + return 0; + +dealloc_counters: + while (--i >= 0) + mlx5_ib_dealloc_q_port_counter(dev, i); + + return ret; +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_ib_port *, + struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx5_ib_port *, + struct port_attribute *, + const char *buf, size_t count); +}; + +struct port_counter_attribute { + struct port_attribute attr; + size_t offset; +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx5_ib_port_sysfs_group *p = + container_of(kobj, struct mlx5_ib_port_sysfs_group, + kobj); + struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port, + group); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(mibport, port_attr, buf); +} + +static ssize_t show_port_counter(struct mlx5_ib_port *p, + struct port_attribute *port_attr, + char *buf) +{ + int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); + struct port_counter_attribute *counter_attr = + container_of(port_attr, struct port_counter_attribute, attr); + void *out; + int ret; + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + ret = mlx5_vport_query_q_counter(p->dev->mdev, + p->q_cnt_id, 0, + out, outlen); + if (ret) + goto free; + + ret = sprintf(buf, "%d\n", + be32_to_cpu(*(__be32 *)(out + counter_attr->offset))); + +free: + kfree(out); + return ret; +} + +#define PORT_COUNTER_ATTR(_name) \ +struct port_counter_attribute port_counter_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_port_counter, NULL), \ + .offset = MLX5_BYTE_OFF(query_q_counter_out, _name) \ +} + +static PORT_COUNTER_ATTR(rx_write_requests); +static PORT_COUNTER_ATTR(rx_read_requests); +static PORT_COUNTER_ATTR(rx_atomic_requests); +static PORT_COUNTER_ATTR(rx_dct_connect); +static PORT_COUNTER_ATTR(out_of_buffer); +static PORT_COUNTER_ATTR(out_of_sequence); +static PORT_COUNTER_ATTR(duplicate_request); +static PORT_COUNTER_ATTR(rnr_nak_retry_err); +static PORT_COUNTER_ATTR(packet_seq_err); +static PORT_COUNTER_ATTR(implied_nak_seq_err); +static PORT_COUNTER_ATTR(local_ack_timeout_err); + +static struct attribute *counter_attrs[] = { + &port_counter_attr_rx_write_requests.attr.attr, + &port_counter_attr_rx_read_requests.attr.attr, + &port_counter_attr_rx_atomic_requests.attr.attr, + &port_counter_attr_rx_dct_connect.attr.attr, + &port_counter_attr_out_of_buffer.attr.attr, + &port_counter_attr_out_of_sequence.attr.attr, + &port_counter_attr_duplicate_request.attr.attr, + &port_counter_attr_rnr_nak_retry_err.attr.attr, + &port_counter_attr_packet_seq_err.attr.attr, + &port_counter_attr_implied_nak_seq_err.attr.attr, + &port_counter_attr_local_ack_timeout_err.attr.attr, + NULL +}; + +static struct attribute_group port_counters_group = { + .name = "counters", + .attrs = counter_attrs +}; + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show +}; + +static struct kobj_type port_type = { + .sysfs_ops = &port_sysfs_ops, +}; + +static int add_port_attrs(struct mlx5_ib_dev *dev, + struct kobject *parent, + struct mlx5_ib_port_sysfs_group *port, + u8 port_num) +{ + int ret; + + ret = kobject_init_and_add(&port->kobj, &port_type, + parent, + "%d", port_num); + if (ret) + return ret; + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && + MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + ret = sysfs_create_group(&port->kobj, &port_counters_group); + if (ret) + goto put_kobj; + } + + port->enabled = true; + return ret; + +put_kobj: + kobject_put(&port->kobj); + return ret; +} + +static void destroy_ports_attrs(struct mlx5_ib_dev *dev, + unsigned int num_ports) +{ + unsigned int i; + + for (i = 0; i < num_ports; i++) { + struct mlx5_ib_port_sysfs_group *port = + &dev->port[i].group; + + if (!port->enabled) + continue; + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && + MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) + sysfs_remove_group(&port->kobj, + &port_counters_group); + kobject_put(&port->kobj); + port->enabled = false; + } + + if (dev->ports_parent) { + kobject_put(dev->ports_parent); + dev->ports_parent = NULL; + } +} + +static int create_port_attrs(struct mlx5_ib_dev *dev) +{ + int ret = 0; + unsigned int i = 0; + struct device *device = &dev->ib_dev.dev; + + dev->ports_parent = kobject_create_and_add("mlx5_ports", + &device->kobj); + if (!dev->ports_parent) + return -ENOMEM; + + for (i = 0; i < dev->num_ports; i++) { + ret = add_port_attrs(dev, + dev->ports_parent, + &dev->port[i].group, + i + 1); + + if (ret) + goto _destroy_ports_attrs; + } + + return 0; + +_destroy_ports_attrs: + destroy_ports_attrs(dev, i); + return ret; +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + + dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) + goto err_dealloc; + + for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { + dev->port[i].dev = dev; + dev->port[i].port_num = i; + dev->port[i].port_gone = 0; + memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table)); + } + + err = get_port_caps(dev); + if (err) + goto err_free_port; + + if (mlx5_use_mad_ifc(dev)) + get_ext_port_caps(dev); + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) { + if (MLX5_CAP_GEN(mdev, roce)) { + err = mlx5_nic_vport_enable_roce(mdev); + if (err) + goto err_free_port; + } else { + goto err_free_port; + } + } + + MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); + + strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = mdev->special_contexts.resd_lkey; + dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = + dev->mdev->priv.eq_table.num_comp_vectors; + dev->ib_dev.dma_device = &mdev->pdev->dev; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.reg_phys_mr = mlx5_ib_reg_phys_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; + dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; + dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + + if (MLX5_CAP_GEN(mdev, xrc)) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + err = init_node_data(dev); + if (err) + goto err_disable_roce; + + mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + + err = create_dev_resources(&dev->devr); + if (err) + goto err_disable_roce; + + + err = mlx5_ib_alloc_q_counters(dev); + if (err) + goto err_odp; + + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_q_cnt; + + err = create_umr_res(dev); + if (err) + goto err_dev; + + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) { + if (init_dc_improvements(dev)) + mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n"); + } + + err = create_port_attrs(dev); + if (err) + goto err_dc; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + goto err_port_attrs; + } + + if (1) { + struct thread *rl_thread = NULL; + struct proc *rl_proc = NULL; + + for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { + (void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread, + RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i); + } + } + + dev->ib_active = true; + + return dev; + +err_port_attrs: + destroy_ports_attrs(dev, dev->num_ports); + +err_dc: + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) + cleanup_dc_improvements(dev); + destroy_umrc_res(dev); + +err_dev: + ib_unregister_device(&dev->ib_dev); + +err_q_cnt: + mlx5_ib_dealloc_q_counters(dev); + +err_odp: + destroy_dev_resources(&dev->devr); + +err_disable_roce: + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) + mlx5_nic_vport_disable_roce(mdev); +err_free_port: + kfree(dev->port); + +err_dealloc: + ib_dealloc_device((struct ib_device *)dev); + + return NULL; +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_dev *dev = context; + int i; + + for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { + dev->port[i].port_gone = 1; + while (dev->port[i].port_gone != 2) + pause("W", hz); + } + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + device_remove_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + } + + destroy_ports_attrs(dev, dev->num_ports); + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) + cleanup_dc_improvements(dev); + mlx5_ib_dealloc_q_counters(dev); + ib_unregister_device(&dev->ib_dev); + destroy_umrc_res(dev); + destroy_dev_resources(&dev->devr); + + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) + mlx5_nic_vport_disable_roce(mdev); + + kfree(dev->port); + ib_dealloc_device(&dev->ib_dev); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, + .protocol = MLX5_INTERFACE_PROTOCOL_IB, +}; + +static int __init mlx5_ib_init(void) +{ + int err; + + if (deprecated_prof_sel != 2) + printf("mlx5_ib: WARN: ""prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + + err = mlx5_register_interface(&mlx5_ib_interface); + if (err) + goto clean_odp; + + mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq"); + if (!mlx5_ib_wq) { + printf("mlx5_ib: ERR: ""%s: failed to create mlx5_ib_wq\n", __func__); + goto err_unreg; + } + + return err; + +err_unreg: + mlx5_unregister_interface(&mlx5_ib_interface); + +clean_odp: + return err; +} + +static void __exit mlx5_ib_cleanup(void) +{ + destroy_workqueue(mlx5_ib_wq); + mlx5_unregister_interface(&mlx5_ib_interface); +} + +module_init_order(mlx5_ib_init, SI_ORDER_THIRD); +module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD); diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c new file mode 100644 index 000000000..46d1749cf --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c @@ -0,0 +1,193 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include "mlx5_ib.h" + +CTASSERT(sizeof(uintptr_t) == sizeof(unsigned long)); + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ + +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order) +{ + struct ib_umem_chunk *chunk; + unsigned long tmp; + unsigned long m; + int i, j, k; + u64 base = 0; + int p = 0; + int skip; + int mask; + u64 len; + u64 pfn; + struct scatterlist *sg; + unsigned long page_shift = ilog2(umem->page_size); + + addr = addr >> page_shift; + tmp = (uintptr_t)addr; + m = find_first_bit(&tmp, 8 * sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + i = 0; + + list_for_each_entry(chunk, &umem->chunk_list, list) { + for (j = 0; j < chunk->nmap; j++) { + sg = chunk->page_list + j; + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (uintptr_t)pfn; + m = min_t(unsigned long, m, + find_first_bit(&tmp, 8 * sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (uintptr_t)p; + m = find_first_bit(&tmp, 8 * sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } + } + p++; + i++; + } + } + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. + */ +static void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, + __be64 *pas, int access_flags) +{ + unsigned long umem_page_shift = ilog2(umem->page_size); + struct ib_umem_chunk *chunk; + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, j, k; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + + i = 0; + list_for_each_entry(chunk, &umem->chunk_list, list) { + for (j = 0; j < chunk->nmap; j++) { + sg = chunk->page_list + j; + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + cur |= access_flags; + + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, (unsigned long long) + be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + (unsigned long long) + (base + (k << umem_page_shift))); + i++; + } + } + } +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + pas, + access_flags); +} + +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = (u32)(buf_off >> ilog2(off_size)); + return 0; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c new file mode 100644 index 000000000..58af55599 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c @@ -0,0 +1,1310 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +CTASSERT((uintptr_t)PAGE_MASK > (uintptr_t)PAGE_SIZE); + +enum { + MAX_PENDING_REG_MR = 8, + MAX_MR_RELEASE_TIMEOUT = (60 * 20) /* Allow release timeout up to 20 min */ +}; + +#define MLX5_UMR_ALIGN 2048 + +static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev); +static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev); + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + + return err; +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_core_mr *mmr = &mr->mmr; + struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + unsigned long flags; + int err; + u8 key; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d, order %d\n", status, ent->order); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mmr->key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + mlx5_ib_dbg(dev, "callbacked mkey 0x%x created\n", + be32_to_cpu(mr->out.mkey)); + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + spin_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mmr->key), mmr); + spin_unlock_irqrestore(&table->lock, flags); + if (err) { + mlx5_ib_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", + mmr->key, err); + mlx5_core_destroy_mkey(mdev, mmr); + } +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int npages = 1 << ent->order; + int err = 0; + int i; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->umred = 1; + mr->dev = dev; + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; + in->seg.log2_page_size = 12; + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + sizeof(*in), reg_mr_callback, + mr, &mr->out); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static int someone_releasing(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur > 2 * cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + s64 dtime; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + cancel_delayed_work(&ent->dwork); + if (!queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3))) + mlx5_ib_warn(dev, "failed queueing delayed work\n"); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + cancel_delayed_work(&ent->dwork); + if (!queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000))) + mlx5_ib_warn(dev, "failed queueing delayed work\n"); + } else { + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + } + } + } else if (ent->cur > 2 * ent->limit) { + dtime = (cache->last_add + (s64)cache->rel_timeout * HZ) - jiffies; + if (cache->rel_imm || + (cache->rel_timeout >= 0 && !someone_adding(cache) && dtime <= 0)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + } else if (cache->rel_timeout >= 0) { + dtime = max_t(s64, dtime, 0); + dtime = min_t(s64, dtime, (MAX_MR_RELEASE_TIMEOUT * HZ)); + cancel_delayed_work(&ent->dwork); + if (!queue_delayed_work(cache->wq, &ent->dwork, dtime)) + mlx5_ib_warn(dev, "failed queueing delayed work\n"); + } + } else if (cache->rel_imm && !someone_releasing(cache)) { + cache->rel_imm = 0; + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey 0x%x from order %d\n", + mr->mmr.key, ent->order); + else + kfree(mr); + } +} + +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + +enum { + MLX5_VF_MR_LIMIT = 2, +}; + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int limit; + int err; + int i; + + mutex_init(&dev->slow_path_mutex); + cache->rel_timeout = 300; + cache->wq = create_singlethread_workqueue("mkey_cache"); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + setup_timer(&dev->delay_timer, delay_time_func, (uintptr_t)dev); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + INIT_LIST_HEAD(&cache->ent[i].head); + spin_lock_init(&cache->ent[i].lock); + + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + + if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) { + if (mlx5_core_is_pf(dev->mdev)) + limit = dev->mdev->profile->mr_cache[i].limit; + else + limit = MLX5_VF_MR_LIMIT; + } else { + limit = 0; + } + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + ent->limit = limit; + if (!queue_work(cache->wq, &ent->work)) + mlx5_ib_warn(dev, "failed queueing work\n"); + } + + err = mlx5_mr_sysfs_init(dev); + if (err) + mlx5_ib_warn(dev, "failed to init mr cache sysfs\n"); + + return 0; +} + +static void wait_for_async_commands(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int total = 0; + int i; + int j; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + for (j = 0 ; j < 1000; j++) { + if (!ent->pending) + break; + msleep(50); + } + } + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + total += ent->pending; + } + + if (total) + mlx5_ib_dbg(dev, "aborted, %d pending requests\n", total); + else + mlx5_ib_dbg(dev, "done with all pending requests\n"); +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + mlx5_mr_sysfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + wait_for_async_commands(dev); + del_timer_sync(&dev->delay_timer); + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + seg = &in->seg; + seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); + if (err) + goto err_in; + + kfree(in); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, u64 page_size) +{ + u64 offset; + int npages; + + offset = addr & (page_size - 1ULL); + npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + return (npages + 1) / 2; +} + +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_ib_umr_context *context; + struct ib_wc wc; + int err; + + while (1) { + err = ib_poll_cq(cq, 1, &wc); + if (err < 0) { + printf("mlx5_ib: WARN: ""poll cq error %d\n", err); + return; + } + if (err == 0) + break; + + context = (struct mlx5_ib_umr_context *)(uintptr_t)wc.wr_id; + context->status = wc.status; + complete(&context->done); + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, + u64 length, struct ib_umem *umem, + int npages, int page_shift, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int inlen; + int err; + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_1; + } + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(virt_addr); + in->seg.len = cpu_to_be64(length); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); + in->seg.log2_page_size = page_shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->umem = umem; + mr->dev = dev; + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + + return mr; + +err_2: + kvfree(in); + +err_1: + kfree(mr); + + return ERR_PTR(err); +} + +enum { + MLX5_MAX_REG_ORDER = MAX_MR_CACHE_ENTRIES + 1, + MLX5_MAX_REG_SIZE = 2ul * 1024 * 1024 * 1024, +}; + +static int clean_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int umred = mr->umred; + int err; + int i; + + if (!umred) { + for (i = 0; i < mr->nchild; ++i) { + free_cached_mr(dev, mr->children[i]); + } + kfree(mr->children); + + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + } + return 0; +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata, int mr_id) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + (unsigned long long)start, (unsigned long long)virt_addr, + (unsigned long long)length, access_flags); + umem = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); + if (IS_ERR(umem)) { + mlx5_ib_warn(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + err = -EINVAL; + goto error; + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + npages, ncont, order, page_shift); + + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, access_flags); + mutex_unlock(&dev->slow_path_mutex); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + mr = NULL; + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + + mr->umem = umem; + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + + return &mr->ibmr; + +error: + /* + * Destroy the umem *before* destroying the MR, to ensure we + * will not have any in-flight notifiers when destroying the + * MR. + * + * As the MR is completely invalid to begin with, and this + * error path is only taken if we can't push the mr entry into + * the pagefault tree, this is safe. + */ + + ib_umem_release(umem); + return ERR_PTR(err); +} + +CTASSERT(sizeof(((struct ib_phys_buf *)0)->size) == 8); + +struct ib_mr * +mlx5_ib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int access_flags, + u64 *virt_addr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + u64 total_size; + u32 octo_len; + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + unsigned long mask; + int shift; + int npages; + int inlen; + int err; + int i, j, n; + + mask = buffer_list[0].addr ^ *virt_addr; + total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0) + mask |= buffer_list[i].addr; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + + total_size += buffer_list[i].size; + } + + if (mask & ~PAGE_MASK) + return ERR_PTR(-EINVAL); + + shift = __ffs(mask | 1 << 31); + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); + buffer_list[0].addr &= ~0ULL << shift; + + npages = 0; + for (i = 0; i < num_phys_buf; ++i) + npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; + + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + return ERR_PTR(-EINVAL); + } + + mr = kzalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + octo_len = get_octo_len(*virt_addr, total_size, 1ULL << shift); + octo_len = ALIGN(octo_len, 4); + + inlen = sizeof(*in) + (octo_len * 16); + in = mlx5_vzalloc(inlen); + if (!in) { + kfree(mr); + return ERR_PTR(-ENOMEM); + } + + n = 0; + for (i = 0; i < num_phys_buf; ++i) { + for (j = 0; + j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; + ++j) { + u64 temp = buffer_list[i].addr + ((u64) j << shift); + if (pg_cap) + temp |= MLX5_IB_MTT_PRESENT; + in->pas[n++] = cpu_to_be64(temp); + } + } + + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(*virt_addr); + in->seg.len = cpu_to_be64(total_size); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(octo_len); + in->seg.log2_page_size = shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(octo_len); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + mr->umem = NULL; + mr->dev = dev; + mr->npages = npages; + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + + kvfree(in); + + if (err) { + kfree(mr); + return ERR_PTR(err); + } + return &mr->ibmr; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct ib_umem *umem = mr->umem; + int npages = mr->npages; + int umred = mr->umred; + int err; + + err = clean_mr(mr); + if (err) + return err; + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + + if (umred) + free_cached_mr(dev, mr); + else + kfree(mr); + + return 0; +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + /* + * TBD not needed - issue 197292 */ + in->seg.log2_page_size = PAGE_SHIFT; + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); + kfree(in); + if (err) { + mlx5_ib_warn(dev, "failed create mkey\n"); + goto err_free; + } + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof(u64); + + mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + struct mlx5_ib_dev *dev = to_mdev(page_list->device); + int size = page_list->max_page_list_len * sizeof(u64); + + dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +struct order_attribute { + struct attribute attr; + ssize_t (*show)(struct cache_order *, struct order_attribute *, char *buf); + ssize_t (*store)(struct cache_order *, struct order_attribute *, + const char *buf, size_t count); +}; + +static ssize_t cur_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->cur); + return err; +} + +static ssize_t limit_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->limit); + return err; +} + +static ssize_t limit_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + int err; + +#define kstrtouint(a,b,c) ({*(c) = strtol(a,0,b); 0;}) +#define kstrtoint(a,b,c) ({*(c) = strtol(a,0,b); 0;}) + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, co->index, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t miss_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->miss); + return err; +} + +static ssize_t miss_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var != 0) + return -EINVAL; + + ent->miss = var; + + return count; +} + +static ssize_t size_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->size); + return err; +} + +static ssize_t size_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + int err; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, co->index, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, co->index, ent->size - var); + } + + return count; +} + +static ssize_t order_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct order_attribute *oa = + container_of(attr, struct order_attribute, attr); + struct cache_order *co = container_of(kobj, struct cache_order, kobj); + + if (!oa->show) + return -EIO; + + return oa->show(co, oa, buf); +} + +static ssize_t order_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct order_attribute *oa = + container_of(attr, struct order_attribute, attr); + struct cache_order *co = container_of(kobj, struct cache_order, kobj); + + if (!oa->store) + return -EIO; + + return oa->store(co, oa, buf, size); +} + +static const struct sysfs_ops order_sysfs_ops = { + .show = order_attr_show, + .store = order_attr_store, +}; + +#define ORDER_ATTR(_name) struct order_attribute order_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) +#define ORDER_ATTR_RO(_name) struct order_attribute order_attr_##_name = \ + __ATTR(_name, 0444, _name##_show, NULL) + +static ORDER_ATTR_RO(cur); +static ORDER_ATTR(limit); +static ORDER_ATTR(miss); +static ORDER_ATTR(size); + +static struct attribute *order_default_attrs[] = { + &order_attr_cur.attr, + &order_attr_limit.attr, + &order_attr_miss.attr, + &order_attr_size.attr, + NULL +}; + +static struct kobj_type order_type = { + .sysfs_ops = &order_sysfs_ops, + .default_attrs = order_default_attrs +}; + + + +struct cache_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_ib_dev *dev, char *buf); + ssize_t (*store)(struct mlx5_ib_dev *dev, const char *buf, size_t count); +}; + +static ssize_t rel_imm_show(struct mlx5_ib_dev *dev, char *buf) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int err; + + err = snprintf(buf, 20, "%d\n", cache->rel_imm); + return err; +} + +static ssize_t rel_imm_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) +{ + struct mlx5_mr_cache *cache = &dev->cache; + u32 var; + int i; + int found = 0; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var > 1) + return -EINVAL; + + if (var == cache->rel_imm) + return count; + + cache->rel_imm = var; + if (cache->rel_imm == 1) { + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur > 2 * cache->ent[i].limit) { + queue_work(cache->wq, &cache->ent[i].work); + found = 1; + } + } + if (!found) + cache->rel_imm = 0; + } + + return count; +} +static ssize_t rel_timeout_show(struct mlx5_ib_dev *dev, char *buf) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int err; + + err = snprintf(buf, 20, "%d\n", cache->rel_timeout); + return err; +} + +static ssize_t rel_timeout_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int var; + int i; + + if (kstrtoint(buf, 0, &var)) + return -EINVAL; + + if (var < -1 || var > MAX_MR_RELEASE_TIMEOUT) + return -EINVAL; + + if (var == cache->rel_timeout) + return count; + + if (cache->rel_timeout == -1 || (var < cache->rel_timeout && var != -1)) { + cache->rel_timeout = var; + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur > 2 * cache->ent[i].limit) + queue_work(cache->wq, &cache->ent[i].work); + } + } else { + cache->rel_timeout = var; + } + + return count; +} + +static ssize_t cache_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct cache_attribute *ca = + container_of(attr, struct cache_attribute, attr); + struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); + + if (!ca->show) + return -EIO; + + return ca->show(dev, buf); +} + +static ssize_t cache_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct cache_attribute *ca = + container_of(attr, struct cache_attribute, attr); + struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); + + if (!ca->store) + return -EIO; + + return ca->store(dev, buf, size); +} + +static const struct sysfs_ops cache_sysfs_ops = { + .show = cache_attr_show, + .store = cache_attr_store, +}; + +#define CACHE_ATTR(_name) struct cache_attribute cache_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static CACHE_ATTR(rel_imm); +static CACHE_ATTR(rel_timeout); + +static struct attribute *cache_default_attrs[] = { + &cache_attr_rel_imm.attr, + &cache_attr_rel_timeout.attr, + NULL +}; + +static struct kobj_type cache_type = { + .sysfs_ops = &cache_sysfs_ops, + .default_attrs = cache_default_attrs +}; + +static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct device *device = &dev->ib_dev.dev; + struct cache_order *co; + int o; + int i; + int err; + + err = kobject_init_and_add(&dev->mr_cache, &cache_type, + &device->kobj, "mr_cache"); + if (err) + return -ENOMEM; + + for (o = 2, i = 0; i < MAX_MR_CACHE_ENTRIES; o++, i++) { + co = &cache->ent[i].co; + co->order = o; + co->index = i; + co->dev = dev; + err = kobject_init_and_add(&co->kobj, &order_type, + &dev->mr_cache, "%d", o); + if (err) + goto err_put; + } + + return 0; + +err_put: + for (; i >= 0; i--) { + co = &cache->ent[i].co; + kobject_put(&co->kobj); + } + kobject_put(&dev->mr_cache); + + return err; +} + +static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct cache_order *co; + int i; + + for (i = MAX_MR_CACHE_ENTRIES - 1; i >= 0; i--) { + co = &cache->ent[i].co; + kobject_put(&co->kobj); + } + kobject_put(&dev->mr_cache); +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c new file mode 100644 index 000000000..9e81035a6 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c @@ -0,0 +1,2991 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "user.h" +#include +#include + +#define IPV6_DEFAULT_HOPLIMIT 64 + + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state); + +/* not supported currently */ +static int workqueue_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX5_RQ_NUM_STATE = MLX5_RQC_STATE_ERR + 1, + MLX5_SQ_NUM_STATE = MLX5_SQC_STATE_ERR + 1, + MLX5_QP_STATE = MLX5_QP_NUM_STATE + 1, + MLX5_QP_STATE_BAD = MLX5_QP_STATE + 1, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, +}; + +struct umr_wr { + u64 virt_addr; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_GSI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + + +static int +query_wqe_idx(struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int ret; + + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) + return -ENOMEM; + + context = &outb->ctx; + + mutex_lock(&qp->mutex); + ret = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (ret) + goto out_free; + + ret = be16_to_cpu(context->hw_sq_wqe_counter) & (qp->sq.wqe_cnt - 1); + +out_free: + mutex_unlock(&qp->mutex); + kfree(outb); + + return ret; +} + +static int mlx5_handle_sig_pipelining(struct mlx5_ib_qp *qp) +{ + int wqe_idx; + + wqe_idx = query_wqe_idx(qp); + if (wqe_idx < 0) { + printf("mlx5_ib: ERR: ""Failed to query QP 0x%x wqe index\n", qp->mqp.qpn); + return wqe_idx; + } + + if (qp->sq.swr_ctx[wqe_idx].sig_piped) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_wqe_ctrl_seg *cwqe; + + cwqe = mlx5_get_send_wqe(qp, wqe_idx); + cwqe->opmod_idx_opcode = cpu_to_be32(be32_to_cpu(cwqe->opmod_idx_opcode) & 0xffffff00); + qp->sq.swr_ctx[wqe_idx].w_list.opcode |= MLX5_OPCODE_SIGNATURE_CANCELED; + mlx5_ib_dbg(dev, "Cancel QP 0x%x wqe_index 0x%x\n", + qp->mqp.qpn, wqe_idx); + } + + return 0; +} + +static void mlx5_ib_sqd_work(struct work_struct *work) +{ + struct mlx5_ib_sqd *sqd; + struct mlx5_ib_qp *qp; + struct ib_qp_attr qp_attr; + + sqd = container_of(work, struct mlx5_ib_sqd, work); + qp = sqd->qp; + + if (mlx5_handle_sig_pipelining(qp)) + goto out; + + mutex_lock(&qp->mutex); + if (__mlx5_ib_modify_qp(&qp->ibqp, &qp_attr, 0, IB_QPS_SQD, IB_QPS_RTS)) + printf("mlx5_ib: ERR: ""Failed to resume QP 0x%x\n", qp->mqp.qpn); + mutex_unlock(&qp->mutex); +out: + kfree(sqd); +} + +static void mlx5_ib_sigerr_sqd_event(struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_sqd *sqd; + + sqd = kzalloc(sizeof(*sqd), GFP_ATOMIC); + if (!sqd) + return; + + sqd->qp = qp; + INIT_WORK(&sqd->work, mlx5_ib_sqd_work); + queue_work(mlx5_ib_wq, &sqd->work); +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_SQ_DRAINED && + to_mibqp(qp)->state != IB_QPS_SQD) { + mlx5_ib_sigerr_sqd_event(to_mibqp(qp)); + return; + } + + if (type == MLX5_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + int wqe_size; + int wq_size; + + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + cap->max_recv_sge = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + MLX5_CAP_GEN(dev->mdev, + max_wqe_sz_rq)); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(enum ib_qp_type qp_type) +{ + int size = 0; + + switch (qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr->qp_type); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) +{ + int max_sge; + + if (attr->qp_type == IB_QPT_RC) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else if (attr->qp_type == IB_QPT_XRC_INI) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_xrc_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else + max_sge = (wqe_size - sq_overhead(attr->qp_type)) / + sizeof(struct mlx5_wqe_data_seg); + + return min_t(int, max_sge, wqe_size - sq_overhead(attr->qp_type) / + sizeof(struct mlx5_wqe_data_seg)); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + int wqe_size; + int wq_size; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * (u64)wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = get_send_sge(attr, wqe_size); + if (qp->sq.max_gs < attr->cap.max_send_sge) { + mlx5_ib_warn(dev, "max sge(%d) exceeds limits(%d)\n", + qp->sq.max_gs, attr->cap.max_send_sge); + return -ENOMEM; + } + + attr->cap.max_send_sge = qp->sq.max_gs; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd, + struct ib_qp_init_attr *attr) +{ + int desc_sz = 1 << qp->sq.wqe_shift; + + if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -EINVAL; + } + + + if (attr->qp_type == IB_QPT_RAW_PACKET) { + qp->buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->sq_buf_size = qp->sq.wqe_cnt << 6; + } else { + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + qp->sq_buf_size = 0; + } + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + +static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +{ + int i; + + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { + if (!test_bit(i, uuari->bitmap)) { + set_bit(i, uuari->bitmap); + uuari->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +{ + int minidx = first_med_uuar(); + int i; + + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { + if (uuari->count[i] < uuari->count[minidx]) + minidx = i; + } + + uuari->count[minidx]++; + + return minidx; +} + +static int alloc_uuar(struct mlx5_uuar_info *uuari, + enum mlx5_ib_latency_class lat) +{ + int uuarn = -EINVAL; + + mutex_lock(&uuari->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + uuarn = 0; + uuari->count[uuarn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_FAST_PATH: + uuarn = 2; + break; + } + mutex_unlock(&uuari->lock); + + return uuarn; +} + +static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; + int high_uuar = nuuars - uuari->num_low_latency_uuars; + + mutex_lock(&uuari->lock); + if (uuarn == 0) { + --uuari->count[uuarn]; + goto out; + } + + if (uuarn < high_uuar) { + free_med_class_uuar(uuari, uuarn); + goto out; + } + + free_high_class_uuar(uuari, uuarn); + +out: + mutex_unlock(&uuari->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); + +static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +{ + return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, + struct mlx5_create_qp_mbox_in **in, + int *inlen, + struct mlx5_exp_ib_create_qp *ucmd) +{ + struct mlx5_exp_ib_create_qp_resp resp; + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int uar_index; + int npages; + u32 offset = 0; + int uuarn; + int ncont = 0; + int err; + + context = to_mucontext(pd->uobject->context); + memset(&resp, 0, sizeof(resp)); + resp.size_of_prefix = offsetof(struct mlx5_exp_ib_create_qp_resp, prefix_reserved); + /* + * TBD: should come from the verbs when we have the API + */ + if (ucmd->exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX) { + if (ucmd->exp.wc_uar_index == MLX5_EXP_CREATE_QP_DB_ONLY_UUAR) { + /* Assign LATENCY_CLASS_LOW (DB only UUAR) to this QP */ + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "DB only uuar allocation failed\n"); + return uuarn; + } + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + } else if (ucmd->exp.wc_uar_index >= MLX5_IB_MAX_CTX_DYNAMIC_UARS || + context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index] == + MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_warn(dev, "dynamic uuar allocation failed\n"); + return -EINVAL; + } else { + uar_index = context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index]; + uuarn = MLX5_EXP_INVALID_UUAR; + } + } else { + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } + } + } + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + } + mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, (struct mlx5_ib_create_qp *)ucmd, attr); + if (err) + goto err_uuar; + + if (ucmd->buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_warn(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; + } + + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd->buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, qp->buf_size, + npages, page_shift, ncont, offset); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + if (qp->umem) + mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + (*in)->ctx.params2 = cpu_to_be32(offset << 6); + + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + resp.uuar_index = uuarn; + qp->uuarn = uuarn; + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db); + if (err) { + mlx5_ib_warn(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, &resp, sizeof(struct mlx5_ib_create_qp_resp)); + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + kvfree(*in); + +err_umem: + if (qp->umem) + ib_umem_release(qp->umem); + +err_uuar: + free_uuar(&context->uuari, uuarn); + return err; +} + +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (qp->umem) + ib_umem_release(qp->umem); + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + /* + * Free only the UUARs handled by the kernel. + * UUARs of UARs allocated dynamically are handled by user. + */ + if (qp->uuarn != MLX5_EXP_INVALID_UUAR) + free_uuar(&context->uuari, qp->uuarn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in **in, int *inlen) +{ + enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; + struct mlx5_uuar_info *uuari; + int uar_index; + int uuarn; + int err; + + uuari = &dev->mdev->priv.uuari; + if (init_attr->create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + + uuarn = alloc_uuar(uuari, lc); + if (uuarn < 0) { + mlx5_ib_warn(dev, "\n"); + return -ENOMEM; + } + + qp->bf = &uuari->bfs[uuarn]; + uar_index = qp->bf->uar->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_warn(dev, "err %d\n", err); + goto err_uuar; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto err_uuar; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + /* Set "fast registration enabled" for all kernel QPs */ + (*in)->ctx.params1 |= cpu_to_be32(1 << 11); + (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + + mlx5_fill_page_array(&qp->buf, (*in)->pas); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx), + GFP_KERNEL); + qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->rq.rwr_ctx), + GFP_KERNEL); + if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.swr_ctx); + kfree(qp->rq.rwr_ctx); + +err_free: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + +err_uuar: + free_uuar(&dev->mdev->priv.uuari, uuarn); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.swr_ctx); + kfree(qp->rq.rwr_ctx); + mlx5_buf_free(dev->mdev, &qp->buf); + free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); +} + +static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + enum ib_qp_type qt = attr->qp_type; + + if (attr->srq || (qt == IB_QPT_XRC_TGT) || (qt == IB_QPT_XRC_INI)) + return cpu_to_be32(MLX5_SRQ_RQ); + else if (!qp->has_rq) + return cpu_to_be32(MLX5_ZERO_LEN_RQ); + else + return cpu_to_be32(MLX5_NON_ZERO_RQ); +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case IB_QPT_XRC_INI: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; + break; + + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +enum { + MLX5_QP_END_PAD_MODE_ALIGN = MLX5_WQ_END_PAD_MODE_ALIGN, + MLX5_QP_END_PAD_MODE_NONE = MLX5_WQ_END_PAD_MODE_NONE, +}; + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_qp_mbox_in *in = NULL; + struct mlx5_exp_ib_create_qp ucmd; + struct mlx5_ib_create_qp *pucmd = NULL; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + int inlen = sizeof(*in); + size_t ucmd_size; + int err; + int st; + u32 uidx; + void *qpc; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { + mlx5_ib_warn(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (pd && pd->uobject) { + memset(&ucmd, 0, sizeof(ucmd)); + ucmd_size = sizeof(struct mlx5_ib_create_qp); + if (ucmd_size > offsetof(struct mlx5_exp_ib_create_qp, size_of_prefix)) { + mlx5_ib_warn(dev, "mlx5_ib_create_qp is too big to fit as prefix of mlx5_exp_ib_create_qp\n"); + return -EINVAL; + } + err = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, ucmd_size)); + if (err) { + mlx5_ib_err(dev, "copy failed\n"); + return err; + } + pucmd = (struct mlx5_ib_create_qp *)&ucmd; + if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_UIDX) + uidx = ucmd.exp.uidx; + else + uidx = 0xffffff; + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + } else { + qp->wq_sig = !!workqueue_signature; + uidx = 0xffffff; + } + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? pucmd : NULL); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + __u32 max_wqes = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_warn(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > max_wqes) { + mlx5_ib_warn(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, max_wqes); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &inlen, &ucmd); + if (err) + mlx5_ib_warn(dev, "err %d\n", err); + } else { + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + mlx5_ib_warn(dev, "Raw Eth QP is disabled for Kernel consumers\n"); + return -EINVAL; + } + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + if (err) + mlx5_ib_warn(dev, "err %d\n", err); + else + qp->pa_lkey = to_mpd(pd)->pa_lkey; + } + + if (err) + return err; + } else { + in = mlx5_vzalloc(sizeof(*in)); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + st = to_mlx5_st(init_attr->qp_type); + if (st < 0) { + mlx5_ib_warn(dev, "invalid service type\n"); + err = st; + goto err_create; + } + in->ctx.flags |= cpu_to_be32(st << 16 | MLX5_QP_PM_MIGRATED << 11); + + in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn); + + if (qp->wq_sig) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + + if (qp->flags & MLX5_IB_QP_CAP_RX_END_PADDING) + in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_ALIGN << 2); + else + in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_NONE << 2); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) { + in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + } else { + in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + } + + if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { + in->ctx.cs_req = 0; + } else { + if (scqe_sz == 128) + in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + else + in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + } + } + + if (qp->rq.wqe_cnt) { + in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); + in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + } + + in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + + if (qp->sq.wqe_cnt) + in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + else + in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + } else { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s1)->msrq.srqn); + } + } + + if (init_attr->send_cq) + in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + + in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + + if (MLX5_CAP_GEN(mdev, cqe_version)) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(qpc, qpc, user_index, uidx); + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { + mlx5_ib_warn(dev, "Raw Ethernet QP is allowed only for Ethernet link layer\n"); + return -ENOSYS; + } + if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD) { + qp->sq_buf_addr = ucmd.exp.sq_buf_addr; + } else { + mlx5_ib_warn(dev, "Raw Ethernet QP needs SQ buff address\n"); + return -EINVAL; + } + err = -EOPNOTSUPP; + } else { + err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + qp->mqp.event = mlx5_ib_qp_event; + } + + if (err) { + mlx5_ib_warn(dev, "create qp failed\n"); + goto err_create; + } + + kvfree(in); + /* Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(pd, qp); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + + kvfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_modify_qp_mbox_in *in; + unsigned long flags; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return; + + if (qp->state != IB_QPS_RESET) { + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { + if (mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, in, 0, + &qp->mqp)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + } + } + + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + if (send_cq) + list_del(&qp->cq_send_list); + + if (recv_cq) + list_del(&qp->cq_recv_list); + + if (qp->create_type == MLX5_QP_KERNEL) { + __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + } else { + err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + qp->mqp.qpn); + } + + kfree(in); + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(&get_pd(qp)->ibpd, qp); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + u32 rcqn; + u32 scqn; + + init_attr->qpg_type = IB_QPG_NONE; + + if (pd) { + dev = to_mdev(pd->device); + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT) { + printf("mlx5_ib: WARN: ""%s: no PD for transport %s\n", __func__, ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!MLX5_CAP_GEN(dev->mdev, xrc)) { + mlx5_ib_warn(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_warn(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->mqp.qpn; + + rcqn = init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1; + scqn = init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1; + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->mqp.qpn, rcqn, scqn); + + qp->xrcdn = xrcdn; + + break; + + case IB_QPT_RAW_IPV6: + case IB_QPT_MAX: + default: + mlx5_ib_warn(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static u32 atomic_mode_qp(struct mlx5_ib_dev *dev) +{ + unsigned long mask; + unsigned long tmp; + + mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp) & + MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + tmp = find_last_bit(&mask, BITS_PER_LONG); + if (tmp < 2 || tmp >= BITS_PER_LONG) + return MLX5_ATOMIC_MODE_NONE; + + if (tmp == 2) + return MLX5_ATOMIC_MODE_CX; + + return tmp << MLX5_ATOMIC_MODE_OFF; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | + atomic_mode_qp(dev)); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + if (rate == IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + MLX5_CAP_GEN(dev->mdev, stat_rate_support))) + --rate; + } + + return rate + MLX5_STAT_RATE_OFFSET; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr, + int alt) +{ + enum rdma_link_layer ll = dev->ib_dev.get_link_layer(&dev->ib_dev, + port); + int err; + int gid_type; + + if ((ll == IB_LINK_LAYER_ETHERNET) || (ah->ah_flags & IB_AH_GRH)) { + int len = dev->ib_dev.gid_tbl_len[port - 1]; + if (ah->grh.sgid_index >= len) { + printf("mlx5_ib: ERR: ""sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, len - 1); + return -EINVAL; + } + } + + if (ll == IB_LINK_LAYER_ETHERNET) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -EINVAL; + + err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index, + &gid_type); + if (err) + return err; + err = mlx5_ib_resolve_grh(ah, path->rmac, NULL); + if (err) + return err; + path->udp_sport = mlx5_get_roce_udp_sport(dev, port, + ah->grh.sgid_index, + 0); + path->dci_cfi_prio_sl = (ah->sl & 0xf) << 4; + } else { + path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->grh_mlid = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + if (ah->ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = cpu_to_be16(alt ? + attr->alt_pkey_index : + attr->pkey_index); + + path->dci_cfi_prio_sl = ah->sl & 0xf; + } + + path->fl_free_ar |= (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; + + if (ah->ah_flags & IB_AH_GRH) { + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = alt ? attr->alt_timeout << 3 : attr->timeout << 3; + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_RAE, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_DC_KEY, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RAE, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RAE, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | + MLX5_QP_OPTPAR_RAE, + + }, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD_RTS_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int sqd_event; + int mlx5_st; + int err; + u16 op; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + context = &in->ctx; + err = to_mlx5_st(ibqp->qp_type); + if (err < 0) + goto out; + + context->flags = cpu_to_be32(err << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if (ibqp->qp_type == IB_QPT_UD) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr, 0); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num, + attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + 0, attr, 1); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn & 0xffffff); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if ((attr_mask & IB_QP_ACCESS_FLAGS) && + (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !dev->enable_atomic_resp) { + mlx5_ib_warn(dev, "atomic responder is not supported\n"); + err = -EINVAL; + goto out; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn & 0xffffff); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : + qp->port) - 1; + struct mlx5_ib_port *mibport = &dev->port[port_num]; + + context->qp_counter_set_usr_page |= + cpu_to_be32(mibport->q_cnt_id << 24); + } + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + mlx5_st = to_mlx5_st(ibqp->qp_type); + if (mlx5_st < 0) + goto out; + + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) + return -EINVAL; + + op = optab[mlx5_cur][mlx5_new]; + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + in->optparam = cpu_to_be32(optpar); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + err = -EOPNOTSUPP; + else + err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, + &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + if (qp->db.db) { + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + } + +out: + kfree(in); + return err; +} + +static int ignored_ts_check(enum ib_qp_type qp_type) +{ + return 0; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + int port; + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ignored_ts_check(ibqp->qp_type) && + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= + dev->mdev->port_caps[port - 1].pkey_table_len) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) + goto out; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __be16 get_klm_octo(int npages) +{ + return cpu_to_be16(ALIGN(npages, 8) / 2); +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int li) +{ + memset(umr, 0, sizeof(*umr)); + + if (li) { + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; + return; + } + + umr->flags = (1 << 5); /* fail if not free */ + umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); + umr->mkey_mask = frwr_mkey_mask(); +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, + int li, int *writ) +{ + memset(seg, 0, sizeof(*seg)); + if (li) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; + *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); + seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); + seg->log2_page_size = wr->wr.fast_reg.page_shift; +} + +static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, + struct ib_send_wr *wr, + struct mlx5_core_dev *mdev, + struct mlx5_ib_pd *pd, + int writ) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + u64 *page_list = wr->wr.fast_reg.page_list->page_list; + u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) + mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); + dseg->addr = cpu_to_be64(mfrpl->map); + dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); + dseg->lkey = cpu_to_be32(pd->pa_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 calc_wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(uintptr_t)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = (int)(qend - wqe); + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, + struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +{ + int writ = 0; + int li; + + li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0; + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_mkey_segment(*seg, wr, li, &writ); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + + set_frwr_pages(*seg, wr, mdev, pd, writ); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, + unsigned bytecnt, struct mlx5_ib_qp *qp) +{ + while (bytecnt > 0) { + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + bytecnt -= 64; + if (unlikely(src == qp->sq.qend)) + src = mlx5_get_send_wqe(qp, 0); + } +} + +static u8 get_fence(u8 fence, struct ib_send_wr *wr) +{ + if (unlikely(wr->opcode == IB_WR_LOCAL_INV && + wr->send_flags & IB_SEND_FENCE)) + return MLX5_FENCE_MODE_STRONG_ORDERING; + + if (unlikely(fence)) { + if (wr->send_flags & IB_SEND_FENCE) + return MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + return fence; + + } else { + return 0; + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), "work queue overflow\n"); + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(u32 *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, + struct ib_send_wr *wr, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = calc_wq_sig(ctrl); + + qp->sq.swr_ctx[idx].wrid = wr->wr_id; + qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode; + qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post; + qp->sq.swr_ctx[idx].sig_piped = 0; +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf = qp->bf; + int uninitialized_var(size); + void *qend = qp->sq.qend; + unsigned long flags; + unsigned idx; + int err = 0; + int inl = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + + spin_lock_irqsave(&qp->sq.lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "Invalid opcode 0x%x\n", wr->opcode); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "Max gs exceeded %d (max = %d)\n", wr->num_sge, qp->sq.max_gs); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare WQE\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare LOCAL_INV WQE\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_FAST_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.swr_ctx[idx].wr_data = IB_WR_FAST_REG_MR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "Failed to prepare FAST_REG_MR WQE\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_SMI: + if (!mlx5_core_is_pf(mdev)) { + err = -EINVAL; + mlx5_ib_warn(dev, "Only physical function is allowed to send SMP MADs\n"); + *bad_wr = wr; + goto out; + } + case IB_QPT_GSI: + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "Failed to prepare inline data segment\n"); + *bad_wr = wr; + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + finish_wqe(qp, ctrl, size, idx, wr, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + if (bf->need_lock) + spin_lock(&bf->lock); + else + __acquire(&bf->lock); + + /* TBD enable WC */ + if (BF_ENABLE && nreq == 1 && bf->uuarn && inl && size > 1 && + size <= bf->buf_size / 16) { + mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); + /* wc_wmb(); */ + } else { + mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, + MLX5_GET_DOORBELL_LOCK(&bf->lock32)); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + } + bf->offset ^= bf->buf_size; + if (bf->need_lock) + spin_unlock(&bf->lock); + else + __release(&bf->lock); + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.rwr_ctx[ind].wrid = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx5_qp_path *path) +{ + struct mlx5_core_dev *dev = ibdev->mdev; + + memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); + ib_ah_attr->port_num = path->port; + + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) + return; + + ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); + } +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int mlx5_state; + int err = 0; + + mutex_lock(&qp->mutex); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + err = -EOPNOTSUPP; + goto out; + } else { + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) { + err = -ENOMEM; + goto out; + } + + context = &outb->ctx; + err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, + sizeof(*outb)); + if (err) { + kfree(outb); + goto out; + } + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = be16_to_cpu(context->alt_path.pkey_index); + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + + + kfree(outb); + } + + qp_attr->qp_state = qp->state; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.max_post; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = ibqp->qp_context; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_xrcd *xrcd; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) { + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + return err; + } + + kfree(xrcd); + + return 0; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c new file mode 100644 index 000000000..f7b17c561 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +struct net_device *mlx5_ib_get_netdev(struct ib_device *ib_dev, u8 port) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + + return mlx5_get_protocol_dev(dev->mdev, MLX5_INTERFACE_PROTOCOL_ETH); +} + + +static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, + struct net_device *ndev, + void *mlx5_addr) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_l3_address); + void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_mac_47_32); + union ib_gid zgid; + u16 vtag; + + memset(&zgid, 0, sizeof(zgid)); + if (0 == memcmp(gid, &zgid, sizeof(zgid))) + return; + + ether_addr_copy(mlx5_addr_mac, IF_LLADDR(ndev)); + + if (VLAN_TAG(ndev, &vtag) == 0) { + MLX5_SET_RA(mlx5_addr, vlan_valid, 1); + MLX5_SET_RA(mlx5_addr, vlan_id, vtag); + } + +#ifndef MLX5_USE_ROCE_VERSION_2 + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); + + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); +#else + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); + + if (ipv6_addr_v4mapped((void *)gid)) { + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV4); + memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); + } else { + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV6); + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); + } +#endif +} + +int modify_gid_roce(struct ib_device *ib_dev, u8 port, unsigned int index, + const union ib_gid *gid, struct net_device *ndev) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); + + memset(in, 0, sizeof(in)); + + ib_gid_to_mlx5_roce_addr(gid, ndev, in_addr); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, + u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5_PROT_MASK(MLX5_1000BASE_CX_SGMII): + case MLX5_PROT_MASK(MLX5_1000BASE_KX): + case MLX5_PROT_MASK(MLX5_100BASE_TX): + case MLX5_PROT_MASK(MLX5_1000BASE_T): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5_PROT_MASK(MLX5_10GBASE_T): + case MLX5_PROT_MASK(MLX5_10GBASE_CX4): + case MLX5_PROT_MASK(MLX5_10GBASE_KX4): + case MLX5_PROT_MASK(MLX5_10GBASE_KR): + case MLX5_PROT_MASK(MLX5_10GBASE_CR): + case MLX5_PROT_MASK(MLX5_10GBASE_SR): + case MLX5_PROT_MASK(MLX5_10GBASE_ER): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5_PROT_MASK(MLX5_25GBASE_CR): + case MLX5_PROT_MASK(MLX5_25GBASE_KR): + case MLX5_PROT_MASK(MLX5_25GBASE_SR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5_PROT_MASK(MLX5_40GBASE_CR4): + case MLX5_PROT_MASK(MLX5_40GBASE_KR4): + case MLX5_PROT_MASK(MLX5_40GBASE_SR4): + case MLX5_PROT_MASK(MLX5_40GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5_PROT_MASK(MLX5_50GBASE_CR2): + case MLX5_PROT_MASK(MLX5_50GBASE_KR2): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR; + break; + case MLX5_PROT_MASK(MLX5_56GBASE_R4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR; + break; + case MLX5_PROT_MASK(MLX5_100GBASE_CR4): + case MLX5_PROT_MASK(MLX5_100GBASE_SR4): + case MLX5_PROT_MASK(MLX5_100GBASE_KR4): + case MLX5_PROT_MASK(MLX5_100GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_roce_port_ptys(struct ib_device *ib_dev, + struct ib_port_attr *props, u8 port) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ptys_reg *ptys; + int err; + + ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); + if (!ptys) + return -ENOMEM; + + ptys->proto_mask |= MLX5_PTYS_EN; + ptys->local_port = port; + + err = mlx5_core_access_ptys(mdev, ptys, 0); + if (err) + goto out; + + err = translate_eth_proto_oper(ptys->eth_proto_oper, + &props->active_speed, + &props->active_width); +out: + kfree(ptys); + return err; +} + +int mlx5_query_port_roce(struct ib_device *ib_dev, u8 port, + struct ib_port_attr *props) +{ + struct net_device *netdev = mlx5_ib_get_netdev(ib_dev, port); + struct mlx5_ib_dev *dev = to_mdev(ib_dev); + enum ib_mtu netdev_ib_mtu; + + memset(props, 0, sizeof(*props)); + + props->port_cap_flags |= IB_PORT_CM_SUP; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + if (mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, + (u16 *)&props->qkey_viol_cntr)) + printf("mlx5_ib: WARN: ""%s failed to query qkey violations counter\n", __func__); + + + if (!netdev) + return 0; + + if (netif_running(netdev) && netif_carrier_ok(netdev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + netdev_ib_mtu = iboe_get_mtu(netdev->if_mtu); + props->active_mtu = min(props->max_mtu, netdev_ib_mtu); + + mlx5_query_roce_port_ptys(ib_dev, props, port); + + return 0; +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port, + int index, __be16 ah_s_udp_port) +{ +#ifndef MLX5_USE_ROCE_VERSION_2 + return 0; +#else + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +#endif +} + +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port, + int index, int *gid_type) +{ + union ib_gid gid; + int ret; + + ret = ib_get_cached_gid(&dev->ib_dev, port, index, &gid); + + if (!ret) + *gid_type = -1; + + return ret; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c new file mode 100644 index 000000000..93dec9c04 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c @@ -0,0 +1,503 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, int type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, + struct ib_udata *udata, int buf_size, int *inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; + void *xsrqc; + int err; + int npages; + int page_shift; + int ncont; + int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + u32 offset; + + ucmdlen = (drv_data < sizeof(ucmd)) ? + drv_data : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_err(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved1 != 0) { + mlx5_ib_warn(dev, "corrupted ucmd\n"); + return -EINVAL; + } + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_warn(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!(*in)) { + mlx5_ib_err(dev, "failed allocate mbox\n"); + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_warn(dev, "map doorbell failed\n"); + goto err_in; + } + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + + if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + if (drv_data > offsetof(struct mlx5_ib_create_srq, uidx)) + MLX5_SET(xrc_srqc, xsrqc, user_index, ucmd.uidx); + else + MLX5_SET(xrc_srqc, xsrqc, user_index, 0xffffff); + } + + return 0; + +err_in: + kvfree(*in); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, int buf_size, + int *inlen) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + int page_shift; + void *xsrqc; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + mlx5_ib_err(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + page_shift = srq->buf.page_shift; + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * srq->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + mlx5_ib_err(dev, "failed allocate mbox\n"); + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, (*in)->pas); + + srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(xrc_srqc, xsrqc, user_index, 0xffffff); + } + + return 0; + +err_in: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_srq *srq; + int desc_size; + int buf_size; + int err; + struct mlx5_create_srq_mbox_in *uninitialized_var(in); + int uninitialized_var(inlen); + int is_xrc; + u32 flgs, xrcdn; + __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes) { + mlx5_ib_warn(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(int, 32, desc_size); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", + desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, + srq->msrq.max_avail_gather); + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + else + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? "user" : "kernel", err); + goto err_srq; + } + + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + in->ctx.state_log_sz = ilog2(srq->msrq.max); + flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; + xrcdn = 0; + if (is_xrc) { + xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + } else if (init_attr->srq_type == IB_SRQT_BASIC) { + xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + } + + in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); + + in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); + in->ctx.db_record = cpu_to_be64(srq->db.dma); + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc); + kvfree(in); + if (err) { + mlx5_ib_warn(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_err(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_query_srq_mbox_out *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/sys/dev/mlx5/mlx5_ib/user.h b/sys/dev/mlx5/mlx5_ib/user.h new file mode 100644 index 000000000..bc5595231 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/user.h @@ -0,0 +1,318 @@ +/*- + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef MLX5_IB_USER_H +#define MLX5_IB_USER_H + +#include + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; + __u32 max_desc_sz_sq_dc; + __u32 atomic_arg_sizes_dc; + __u32 reserved1; + __u32 flags; + __u32 reserved2[5]; +}; + +enum mlx5_exp_ib_alloc_ucontext_data_resp_mask { + MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_COMP_MAX_NUM = 1 << 0, + MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_VERSION = 1 << 1, + MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MIN = 1 << 2, + MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MAX = 1 << 3, + MLX5_EXP_ALLOC_CTX_RESP_MASK_HCA_CORE_CLOCK_OFFSET = 1 << 4, +}; + +struct mlx5_exp_ib_alloc_ucontext_data_resp { + __u32 comp_mask; /* use mlx5_ib_exp_alloc_ucontext_data_resp_mask */ + __u16 cqe_comp_max_num; + __u8 cqe_version; + __u8 reserved; + __u16 rroce_udp_sport_min; + __u16 rroce_udp_sport_max; + __u32 hca_core_clock_offset; +}; + +struct mlx5_exp_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; + __u32 max_desc_sz_sq_dc; + __u32 atomic_arg_sizes_dc; + __u32 reserved1; + __u32 flags; + __u32 reserved2[5]; + /* Some more reserved fields for + * future growth of mlx5_ib_alloc_ucontext_resp */ + __u64 prefix_reserved[8]; + struct mlx5_exp_ib_alloc_ucontext_data_resp exp_data; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +enum mlx5_exp_ib_create_cq_mask { + MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_EN = 1 << 0, + MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_RECV_TYPE = 1 << 1, + MLX5_EXP_CREATE_CQ_MASK_RESERVED = 1 << 2, +}; + +enum mlx5_exp_cqe_comp_recv_type { + MLX5_IB_CQE_FORMAT_HASH, + MLX5_IB_CQE_FORMAT_CSUM, +}; + +struct mlx5_exp_ib_create_cq_data { + __u32 comp_mask; /* use mlx5_exp_ib_creaet_cq_mask */ + __u8 cqe_comp_en; + __u8 cqe_comp_recv_type; /* use mlx5_exp_cqe_comp_recv_type */ + __u16 reserved; +}; + +struct mlx5_exp_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ + + /* Some more reserved fields for future growth of mlx5_ib_create_cq */ + __u64 prefix_reserved[8]; + + /* sizeof prefix aligned with mlx5_ib_create_cq */ + __u64 size_of_prefix; + struct mlx5_exp_ib_create_cq_data exp_data; +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; +}; + +enum mlx5_exp_ib_create_qp_mask { + MLX5_EXP_CREATE_QP_MASK_UIDX = 1 << 0, + MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD = 1 << 1, + MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX = 1 << 2, + MLX5_EXP_CREATE_QP_MASK_FLAGS_IDX = 1 << 3, + MLX5_EXP_CREATE_QP_MASK_RESERVED = 1 << 4, +}; + +enum mlx5_exp_create_qp_flags { + MLX5_EXP_CREATE_QP_MULTI_PACKET_WQE_REQ_FLAG = 1 << 0, +}; + +enum mlx5_exp_drv_create_qp_uar_idx { + MLX5_EXP_CREATE_QP_DB_ONLY_UUAR = -1 +}; + +struct mlx5_exp_ib_create_qp_data { + __u32 comp_mask; /* use mlx5_exp_ib_create_qp_mask */ + __u32 uidx; + __u64 sq_buf_addr; + __u32 wc_uar_index; + __u32 flags; /* use mlx5_exp_create_qp_flags */ +}; + +struct mlx5_exp_ib_create_qp { + /* To allow casting to mlx5_ib_create_qp the prefix is the same as + * struct mlx5_ib_create_qp prefix + */ + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; + + /* Some more reserved fields for future growth of mlx5_ib_create_qp */ + __u64 prefix_reserved[8]; + + /* sizeof prefix aligned with mlx5_ib_create_qp */ + __u64 size_of_prefix; + + /* Experimental data + * Add new experimental data only inside the exp struct + */ + struct mlx5_exp_ib_create_qp_data exp; +}; + +enum { + MLX5_EXP_INVALID_UUAR = -1, +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; + __u32 rsvd; +}; + +enum mlx5_exp_ib_create_qp_resp_mask { + MLX5_EXP_CREATE_QP_RESP_MASK_FLAGS_IDX = 1 << 0, + MLX5_EXP_CREATE_QP_RESP_MASK_RESERVED = 1 << 1, +}; + +enum mlx5_exp_create_qp_resp_flags { + MLX5_EXP_CREATE_QP_RESP_MULTI_PACKET_WQE_FLAG = 1 << 0, +}; + +struct mlx5_exp_ib_create_qp_resp_data { + __u32 comp_mask; /* use mlx5_exp_ib_create_qp_resp_mask */ + __u32 flags; /* use mlx5_exp_create_qp_resp_flags */ +}; + +struct mlx5_exp_ib_create_qp_resp { + __u32 uuar_index; + __u32 rsvd; + + /* Some more reserved fields for future growth of mlx5_ib_create_qp_resp */ + __u64 prefix_reserved[8]; + + /* sizeof prefix aligned with mlx5_ib_create_qp_resp */ + __u64 size_of_prefix; + + /* Experimental data + * Add new experimental data only inside the exp struct + */ + struct mlx5_exp_ib_create_qp_resp_data exp; +}; + +struct mlx5_ib_create_dct { + __u32 uidx; + __u32 reserved; +}; + +struct mlx5_ib_arm_dct { + __u64 reserved0; + __u64 reserved1; +}; + +struct mlx5_ib_arm_dct_resp { + __u64 reserved0; + __u64 reserved1; +}; + +struct mlx5_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; +}; + +#endif /* MLX5_IB_USER_H */ diff --git a/sys/modules/Makefile b/sys/modules/Makefile index e183bf065..f104bfb8e 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -225,6 +225,7 @@ SUBDIR= \ ${_mlxen} \ ${_mlx5} \ ${_mlx5en} \ + ${_mlx5ib} \ ${_mly} \ mmc \ mmcsd \ @@ -762,6 +763,7 @@ _mly= mly .if ${MK_OFED} != "no" || defined(ALL_MODULES) _mlx4= mlx4 _mlx4ib= mlx4ib +_mlx5ib= mlx5ib _mlxen= mlxen _mthca= mthca .endif diff --git a/sys/modules/mlx5ib/Makefile b/sys/modules/mlx5ib/Makefile new file mode 100644 index 000000000..bc554ee8c --- /dev/null +++ b/sys/modules/mlx5ib/Makefile @@ -0,0 +1,24 @@ +# $FreeBSD$ +.PATH: ${SRCTOP}/sys/dev/mlx5/mlx5_ib + +KMOD=mlx5ib +SRCS= \ +mlx5_ib_ah.c \ +mlx5_ib_cq.c \ +mlx5_ib_doorbell.c \ +mlx5_ib_mad.c \ +mlx5_ib_main.c \ +mlx5_ib_mem.c \ +mlx5_ib_mr.c \ +mlx5_ib_qp.c \ +mlx5_ib_roce.c \ +mlx5_ib_srq.c \ +device_if.h bus_if.h vnode_if.h pci_if.h \ +opt_inet.h opt_inet6.h + +CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include + +.include + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith ${GCC_MS_EXTENSIONS} -- 2.42.0