1 XRC implementation, consolidated (version 2):
3 xrc ops were moved to their own structure at the end of
4 struct ibv_context (to preserve binary compatibility).
6 Check for ibv_context.xrc_ops member via AC_CHECK_MEMBER
8 XRC QPs have MSB set in qp number, for identification in
12 (OFED 1.3 commit 39fe7f47e8fc07f356098df048d00740ba585fc5)
14 Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
17 1. checkpatch.pl cleanup
18 2. Changed xrc_ops to more ops
19 3. Check for xrc verbs in ibv_more_ops via AC_CHECK_MEMBER
21 diff --git a/configure.in b/configure.in
22 index 25f27f7..46a3a64 100644
25 @@ -42,6 +42,12 @@ AC_CHECK_HEADER(valgrind/memcheck.h,
26 dnl Checks for typedefs, structures, and compiler characteristics.
29 +AC_CHECK_MEMBER(struct ibv_context.more_ops,
30 + [AC_DEFINE([HAVE_IBV_MORE_OPS], 1, [Define to 1 if more_ops is a member of ibv_context])],,
31 + [#include <infiniband/verbs.h>])
32 +AC_CHECK_MEMBER(struct ibv_more_ops.create_xrc_srq,
33 + [AC_DEFINE([HAVE_IBV_XRC_OPS], 1, [Define to 1 if have xrc ops])],,
34 + [#include <infiniband/verbs.h>])
36 dnl Checks for library functions
37 AC_CHECK_FUNC(ibv_read_sysfs_file, [],
38 diff --git a/src/cq.c b/src/cq.c
39 index 68e16e9..c598b87 100644
42 @@ -194,8 +194,9 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
46 - struct mlx4_srq *srq;
47 + struct mlx4_srq *srq = NULL;
50 uint32_t g_mlpath_rqpn;
53 @@ -221,20 +223,29 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
54 is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
55 MLX4_CQE_OPCODE_ERROR;
58 - (ntohl(cqe->my_qpn) & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
59 + if (qpn & MLX4_XRC_QPN_BIT && !is_send) {
60 + srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
62 + * We do not have to take the XRC SRQ table lock here,
63 + * because CQs will be locked while XRC SRQs are removed
66 + srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
69 + } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
71 * We do not have to take the QP table lock here,
72 * because CQs will be locked while QPs are removed
75 *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
76 - ntohl(cqe->my_qpn) & 0xffffff);
82 - wc->qp_num = (*cur_qp)->ibv_qp.qp_num;
83 + wc->qp_num = qpn & 0xffffff;
87 @@ -242,6 +254,10 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
88 wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
89 wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
92 + wqe_index = htons(cqe->wqe_index);
93 + wc->wr_id = srq->wrid[wqe_index];
94 + mlx4_free_srq_wqe(srq, wqe_index);
95 } else if ((*cur_qp)->ibv_qp.srq) {
96 srq = to_msrq((*cur_qp)->ibv_qp.srq);
97 wqe_index = htons(cqe->wqe_index);
98 @@ -387,6 +403,10 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
102 + int is_xrc_srq = 0;
104 + if (srq && srq->ibv_srq.xrc_cq)
108 * First we need to find the current producer index, so we
109 @@ -405,7 +425,12 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
111 while ((int) --prod_index - (int) cq->cons_index >= 0) {
112 cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
113 - if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
115 + (ntohl(cqe->g_mlpath_rqpn) & 0xffffff) == srq->srqn) &&
116 + !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
117 + mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
119 + } else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
120 if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
121 mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
123 diff --git a/src/mlx4-abi.h b/src/mlx4-abi.h
124 index 20a40c9..1b1253c 100644
127 @@ -68,6 +68,14 @@ struct mlx4_resize_cq {
131 +#ifdef HAVE_IBV_XRC_OPS
132 +struct mlx4_create_xrc_srq {
133 + struct ibv_create_xrc_srq ibv_cmd;
139 struct mlx4_create_srq {
140 struct ibv_create_srq ibv_cmd;
142 @@ -90,4 +98,12 @@ struct mlx4_create_qp {
146 +#ifdef HAVE_IBV_XRC_OPS
147 +struct mlx4_open_xrc_domain_resp {
148 + struct ibv_open_xrc_domain_resp ibv_resp;
154 #endif /* MLX4_ABI_H */
155 diff --git a/src/mlx4.c b/src/mlx4.c
156 index 671e849..27ca75d 100644
159 @@ -68,6 +68,16 @@ struct {
160 HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */
163 +#ifdef HAVE_IBV_MORE_OPS
164 +static struct ibv_more_ops mlx4_more_ops = {
165 +#ifdef HAVE_IBV_XRC_OPS
166 + .create_xrc_srq = mlx4_create_xrc_srq,
167 + .open_xrc_domain = mlx4_open_xrc_domain,
168 + .close_xrc_domain = mlx4_close_xrc_domain,
173 static struct ibv_context_ops mlx4_ctx_ops = {
174 .query_device = mlx4_query_device,
175 .query_port = mlx4_query_port,
176 @@ -124,6 +134,15 @@ static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_
177 for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
178 context->qp_table[i].refcnt = 0;
180 + context->num_xrc_srqs = resp.qp_tab_size;
181 + context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1
182 + - MLX4_XRC_SRQ_TABLE_BITS;
183 + context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1;
185 + pthread_mutex_init(&context->xrc_srq_table_mutex, NULL);
186 + for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i)
187 + context->xrc_srq_table[i].refcnt = 0;
189 for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
190 context->db_list[i] = NULL;
192 @@ -156,6 +175,9 @@ static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_
193 pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
195 context->ibv_ctx.ops = mlx4_ctx_ops;
196 +#ifdef HAVE_IBV_XRC_OPS
197 + context->ibv_ctx.more_ops = &mlx4_more_ops;
200 if (mlx4_query_device(&context->ibv_ctx, &dev_attrs))
202 diff --git a/src/mlx4.h b/src/mlx4.h
203 index 8643d8f..3eadb98 100644
210 +#ifndef HAVE_IBV_MORE_OPS
211 +#undef HAVE_IBV_XRC_OPS
212 +#undef HAVE_IBV_CREATE_QP_EXP
215 #define HIDDEN __attribute__((visibility ("hidden")))
218 @@ -111,6 +116,16 @@ enum {
219 MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
223 + MLX4_XRC_SRQ_TABLE_BITS = 8,
224 + MLX4_XRC_SRQ_TABLE_SIZE = 1 << MLX4_XRC_SRQ_TABLE_BITS,
225 + MLX4_XRC_SRQ_TABLE_MASK = MLX4_XRC_SRQ_TABLE_SIZE - 1
229 + MLX4_XRC_QPN_BIT = (1 << 23)
235 @@ -174,6 +189,15 @@ struct mlx4_context {
240 + struct mlx4_srq **table;
242 + } xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE];
243 + pthread_mutex_t xrc_srq_table_mutex;
245 + int xrc_srq_table_shift;
246 + int xrc_srq_table_mask;
248 struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
249 pthread_mutex_t db_list_mutex;
251 @@ -260,6 +284,11 @@ struct mlx4_ah {
255 +struct mlx4_xrc_domain {
256 + struct ibv_xrc_domain ibv_xrcd;
260 static inline unsigned long align(unsigned long val, unsigned long align)
262 return (val + align - 1) & ~(align - 1);
263 @@ -304,6 +333,13 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
264 return to_mxxx(ah, ah);
267 +#ifdef HAVE_IBV_XRC_OPS
268 +static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd)
270 + return to_mxxx(xrcd, xrc_domain);
274 int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
275 void mlx4_free_buf(struct mlx4_buf *buf);
277 @@ -350,6 +386,10 @@ void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
278 int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
279 struct ibv_recv_wr *wr,
280 struct ibv_recv_wr **bad_wr);
281 +struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
282 +int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
283 + struct mlx4_srq *srq);
284 +void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
286 struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
287 int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
288 @@ -380,5 +420,16 @@ int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
289 int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
291 void mlx4_free_av(struct mlx4_ah *ah);
292 +#ifdef HAVE_IBV_XRC_OPS
293 +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
294 + struct ibv_xrc_domain *xrc_domain,
295 + struct ibv_cq *xrc_cq,
296 + struct ibv_srq_init_attr *attr);
297 +struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
298 + int fd, int oflag);
300 +int mlx4_close_xrc_domain(struct ibv_xrc_domain *d);
305 diff --git a/src/qp.c b/src/qp.c
306 index 01e8580..2f02430 100644
309 @@ -226,7 +226,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
310 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
311 qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
313 - ctrl->srcrb_flags =
314 + ctrl->xrcrb_flags =
315 (wr->send_flags & IBV_SEND_SIGNALED ?
316 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
317 (wr->send_flags & IBV_SEND_SOLICITED ?
318 @@ -243,6 +243,9 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
319 size = sizeof *ctrl / 16;
321 switch (ibqp->qp_type) {
323 + ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
327 switch (wr->opcode) {
328 @@ -543,6 +546,7 @@ void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
329 size += sizeof (struct mlx4_wqe_raddr_seg);
334 size += sizeof (struct mlx4_wqe_raddr_seg);
336 @@ -631,6 +635,7 @@ void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
341 wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
344 diff --git a/src/srq.c b/src/srq.c
345 index ba2ceb9..1350792 100644
348 @@ -167,3 +167,53 @@ int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
353 +struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
355 + int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
357 + if (ctx->xrc_srq_table[tind].refcnt)
358 + return ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask];
363 +int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
364 + struct mlx4_srq *srq)
366 + int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
369 + pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
371 + if (!ctx->xrc_srq_table[tind].refcnt) {
372 + ctx->xrc_srq_table[tind].table = calloc(ctx->xrc_srq_table_mask + 1,
373 + sizeof(struct mlx4_srq *));
374 + if (!ctx->xrc_srq_table[tind].table) {
380 + ++ctx->xrc_srq_table[tind].refcnt;
381 + ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = srq;
384 + pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
388 +void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
390 + int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
392 + pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
394 + if (!--ctx->xrc_srq_table[tind].refcnt)
395 + free(ctx->xrc_srq_table[tind].table);
397 + ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = NULL;
399 + pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
402 diff --git a/src/verbs.c b/src/verbs.c
403 index 400050c..b7c9c8e 100644
406 @@ -368,18 +368,36 @@ int mlx4_query_srq(struct ibv_srq *srq,
407 return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
410 -int mlx4_destroy_srq(struct ibv_srq *srq)
411 +int mlx4_destroy_srq(struct ibv_srq *ibsrq)
413 + struct mlx4_srq *srq = to_msrq(ibsrq);
414 + struct mlx4_cq *mcq = NULL;
417 - ret = ibv_cmd_destroy_srq(srq);
419 + if (ibsrq->xrc_cq) {
420 + /* is an xrc_srq */
421 + mcq = to_mcq(ibsrq->xrc_cq);
422 + mlx4_cq_clean(mcq, 0, srq);
423 + pthread_spin_lock(&mcq->lock);
424 + mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn);
425 + pthread_spin_unlock(&mcq->lock);
428 + ret = ibv_cmd_destroy_srq(ibsrq);
430 + if (ibsrq->xrc_cq) {
431 + pthread_spin_lock(&mcq->lock);
432 + mlx4_store_xrc_srq(to_mctx(ibsrq->context),
434 + pthread_spin_unlock(&mcq->lock);
439 - mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
440 - mlx4_free_buf(&to_msrq(srq)->buf);
441 - free(to_msrq(srq)->wrid);
442 - free(to_msrq(srq));
443 + mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db);
444 + mlx4_free_buf(&srq->buf);
450 @@ -415,7 +433,7 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
451 qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
452 qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
455 + if (attr->srq || attr->qp_type == IBV_QPT_XRC)
456 attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0;
458 if (attr->cap.max_recv_sge < 1)
459 @@ -433,7 +451,7 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
460 pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
464 + if (!attr->srq && attr->qp_type != IBV_QPT_XRC) {
465 qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
468 @@ -442,7 +460,7 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
471 cmd.buf_addr = (uintptr_t) qp->buf.buf;
473 + if (attr->srq || attr->qp_type == IBV_QPT_XRC)
476 cmd.db_addr = (uintptr_t) qp->db;
477 @@ -485,7 +503,7 @@ err_destroy:
480 pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
482 + if (!attr->srq && attr->qp_type != IBV_QPT_XRC)
483 mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db);
486 @@ -544,7 +562,7 @@ int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
487 mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
489 mlx4_init_qp_indices(to_mqp(qp));
491 + if (!qp->srq && qp->qp_type != IBV_QPT_XRC)
495 @@ -603,7 +621,7 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp)
496 mlx4_unlock_cqs(ibqp);
497 pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
500 + if (!ibqp->srq && ibqp->qp_type != IBV_QPT_XRC)
501 mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
504 @@ -661,3 +679,103 @@ int mlx4_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid)
509 +#ifdef HAVE_IBV_XRC_OPS
510 +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
511 + struct ibv_xrc_domain *xrc_domain,
512 + struct ibv_cq *xrc_cq,
513 + struct ibv_srq_init_attr *attr)
515 + struct mlx4_create_xrc_srq cmd;
516 + struct mlx4_create_srq_resp resp;
517 + struct mlx4_srq *srq;
520 + /* Sanity check SRQ size before proceeding */
521 + if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
524 + srq = malloc(sizeof *srq);
528 + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
531 + srq->max = align_queue_size(attr->attr.max_wr + 1);
532 + srq->max_gs = attr->attr.max_sge;
535 + if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
538 + srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
544 + cmd.buf_addr = (uintptr_t) srq->buf.buf;
545 + cmd.db_addr = (uintptr_t) srq->db;
547 + ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr,
548 + xrc_domain->handle,
550 + &cmd.ibv_cmd, sizeof cmd,
551 + &resp.ibv_resp, sizeof resp);
555 + srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn;
557 + ret = mlx4_store_xrc_srq(to_mctx(pd->context), srq->ibv_srq.xrc_srq_num, srq);
561 + return &srq->ibv_srq;
564 + ibv_cmd_destroy_srq(&srq->ibv_srq);
567 + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
571 + mlx4_free_buf(&srq->buf);
579 +struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
583 + struct mlx4_open_xrc_domain_resp resp;
584 + struct mlx4_xrc_domain *xrcd;
586 + xrcd = malloc(sizeof *xrcd);
590 + ret = ibv_cmd_open_xrc_domain(context, fd, oflag, &xrcd->ibv_xrcd,
591 + &resp.ibv_resp, sizeof resp);
597 + xrcd->xrcdn = resp.xrcdn;
598 + return &xrcd->ibv_xrcd;
601 +int mlx4_close_xrc_domain(struct ibv_xrc_domain *d)
603 + ibv_cmd_close_xrc_domain(d);
608 diff --git a/src/wqe.h b/src/wqe.h
609 index 6f7f309..fa2f8ac 100644
612 @@ -65,7 +65,7 @@ struct mlx4_wqe_ctrl_seg {
613 * [1] SE (solicited event)
614 * [0] FL (force loopback)
616 - uint32_t srcrb_flags;
617 + uint32_t xrcrb_flags;
619 * imm is immediate data for send/RDMA write w/ immediate;
620 * also invalidation key for send with invalidate; input