2 * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
45 #define MLX5_ATOMIC_SIZE 8
47 static const uint32_t mlx5_ib_opcode[] = {
48 [IBV_WR_SEND] = MLX5_OPCODE_SEND,
49 [IBV_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL,
50 [IBV_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM,
51 [IBV_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE,
52 [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM,
53 [IBV_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ,
54 [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS,
55 [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA,
56 [IBV_WR_BIND_MW] = MLX5_OPCODE_UMR,
57 [IBV_WR_LOCAL_INV] = MLX5_OPCODE_UMR,
58 [IBV_WR_TSO] = MLX5_OPCODE_TSO,
61 static void *get_recv_wqe(struct mlx5_qp *qp, int n)
63 return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
66 static void *get_wq_recv_wqe(struct mlx5_rwq *rwq, int n)
68 return rwq->pbuff + (n << rwq->rq.wqe_shift);
71 static int copy_to_scat(struct mlx5_wqe_data_seg *scat, void *buf, int *size,
77 if (unlikely(!(*size)))
78 return IBV_WC_SUCCESS;
80 for (i = 0; i < max; ++i) {
81 copy = min_t(long, *size, be32toh(scat->byte_count));
82 memcpy((void *)(unsigned long)be64toh(scat->addr), buf, copy);
85 return IBV_WC_SUCCESS;
90 return IBV_WC_LOC_LEN_ERR;
93 int mlx5_copy_to_recv_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
95 struct mlx5_wqe_data_seg *scat;
96 int max = 1 << (qp->rq.wqe_shift - 4);
98 scat = get_recv_wqe(qp, idx);
99 if (unlikely(qp->wq_sig))
102 return copy_to_scat(scat, buf, &size, max);
105 int mlx5_copy_to_send_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
107 struct mlx5_wqe_ctrl_seg *ctrl;
108 struct mlx5_wqe_data_seg *scat;
112 idx &= (qp->sq.wqe_cnt - 1);
113 ctrl = mlx5_get_send_wqe(qp, idx);
114 if (qp->ibv_qp->qp_type != IBV_QPT_RC) {
115 fprintf(stderr, "scatter to CQE is supported only for RC QPs\n");
116 return IBV_WC_GENERAL_ERR;
120 switch (be32toh(ctrl->opmod_idx_opcode) & 0xff) {
121 case MLX5_OPCODE_RDMA_READ:
122 p = p + sizeof(struct mlx5_wqe_raddr_seg);
125 case MLX5_OPCODE_ATOMIC_CS:
126 case MLX5_OPCODE_ATOMIC_FA:
127 p = p + sizeof(struct mlx5_wqe_raddr_seg) +
128 sizeof(struct mlx5_wqe_atomic_seg);
132 fprintf(stderr, "scatter to CQE for opcode %d\n",
133 be32toh(ctrl->opmod_idx_opcode) & 0xff);
134 return IBV_WC_REM_INV_REQ_ERR;
138 max = (be32toh(ctrl->qpn_ds) & 0x3F) - (((void *)scat - (void *)ctrl) >> 4);
139 if (unlikely((void *)(scat + max) > qp->sq.qend)) {
140 int tmp = ((void *)qp->sq.qend - (void *)scat) >> 4;
141 int orig_size = size;
143 if (copy_to_scat(scat, buf, &size, tmp) == IBV_WC_SUCCESS)
144 return IBV_WC_SUCCESS;
146 buf += orig_size - size;
147 scat = mlx5_get_send_wqe(qp, 0);
150 return copy_to_scat(scat, buf, &size, max);
153 void *mlx5_get_send_wqe(struct mlx5_qp *qp, int n)
155 return qp->sq_start + (n << MLX5_SEND_WQE_SHIFT);
158 void mlx5_init_rwq_indices(struct mlx5_rwq *rwq)
164 void mlx5_init_qp_indices(struct mlx5_qp *qp)
173 static int mlx5_wq_overflow(struct mlx5_wq *wq, int nreq, struct mlx5_cq *cq)
177 cur = wq->head - wq->tail;
178 if (cur + nreq < wq->max_post)
181 mlx5_spin_lock(&cq->lock);
182 cur = wq->head - wq->tail;
183 mlx5_spin_unlock(&cq->lock);
185 return cur + nreq >= wq->max_post;
188 static inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
189 uint64_t remote_addr, uint32_t rkey)
191 rseg->raddr = htobe64(remote_addr);
192 rseg->rkey = htobe32(rkey);
196 static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg,
197 enum ibv_wr_opcode opcode,
199 uint64_t compare_add)
201 if (opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
202 aseg->swap_add = htobe64(swap);
203 aseg->compare = htobe64(compare_add);
205 aseg->swap_add = htobe64(compare_add);
209 static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
210 struct ibv_send_wr *wr)
212 memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof dseg->av);
213 dseg->av.dqp_dct = htobe32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
214 dseg->av.key.qkey.qkey = htobe32(wr->wr.ud.remote_qkey);
217 static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ibv_sge *sg,
220 dseg->byte_count = htobe32(sg->length - offset);
221 dseg->lkey = htobe32(sg->lkey);
222 dseg->addr = htobe64(sg->addr + offset);
225 static void set_data_ptr_seg_atomic(struct mlx5_wqe_data_seg *dseg,
228 dseg->byte_count = htobe32(MLX5_ATOMIC_SIZE);
229 dseg->lkey = htobe32(sg->lkey);
230 dseg->addr = htobe64(sg->addr);
234 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
235 * implementations may use move-string-buffer assembler instructions,
236 * which do not guarantee order of copying.
238 static void mlx5_bf_copy(unsigned long long *dst, unsigned long long *src,
239 unsigned bytecnt, struct mlx5_qp *qp)
241 while (bytecnt > 0) {
250 bytecnt -= 8 * sizeof(unsigned long long);
251 if (unlikely(src == qp->sq.qend))
256 static uint32_t send_ieth(struct ibv_send_wr *wr)
258 switch (wr->opcode) {
259 case IBV_WR_SEND_WITH_IMM:
260 case IBV_WR_RDMA_WRITE_WITH_IMM:
262 case IBV_WR_SEND_WITH_INV:
263 return htobe32(wr->imm_data);
269 static int set_data_inl_seg(struct mlx5_qp *qp, struct ibv_send_wr *wr,
271 struct mlx5_sg_copy_ptr *sg_copy_ptr)
273 struct mlx5_wqe_inline_seg *seg;
278 void *qend = qp->sq.qend;
280 int offset = sg_copy_ptr->offset;
284 for (i = sg_copy_ptr->index; i < wr->num_sge; ++i) {
285 addr = (void *) (unsigned long)(wr->sg_list[i].addr + offset);
286 len = wr->sg_list[i].length - offset;
290 if (unlikely(inl > qp->max_inline_data))
293 if (unlikely(wqe + len > qend)) {
295 memcpy(wqe, addr, copy);
298 wqe = mlx5_get_send_wqe(qp, 0);
300 memcpy(wqe, addr, len);
305 seg->byte_count = htobe32(inl | MLX5_INLINE_SEG);
306 *sz = align(inl + sizeof seg->byte_count, 16) / 16;
313 static uint8_t wq_sig(struct mlx5_wqe_ctrl_seg *ctrl)
315 return calc_sig(ctrl, be32toh(ctrl->qpn_ds));
319 static void dump_wqe(FILE *fp, int idx, int size_16, struct mlx5_qp *qp)
325 fprintf(fp, "dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
326 for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
327 if ((i & 0xf) == 0) {
328 void *buf = mlx5_get_send_wqe(qp, tidx);
329 tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
333 fprintf(fp, "%08x %08x %08x %08x\n", be32toh(p[j]), be32toh(p[j + 1]),
334 be32toh(p[j + 2]), be32toh(p[j + 3]));
337 #endif /* MLX5_DEBUG */
340 void *mlx5_get_atomic_laddr(struct mlx5_qp *qp, uint16_t idx, int *byte_count)
342 struct mlx5_wqe_data_seg *dpseg;
345 dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
346 sizeof(struct mlx5_wqe_raddr_seg) +
347 sizeof(struct mlx5_wqe_atomic_seg);
348 addr = (void *)(unsigned long)be64toh(dpseg->addr);
351 * Currently byte count is always 8 bytes. Fix this when
352 * we support variable size of atomics
358 static inline int copy_eth_inline_headers(struct ibv_qp *ibqp,
359 struct ibv_send_wr *wr,
360 struct mlx5_wqe_eth_seg *eseg,
361 struct mlx5_sg_copy_ptr *sg_copy_ptr)
363 uint32_t inl_hdr_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;
364 int inl_hdr_copy_size = 0;
366 FILE *fp = to_mctx(ibqp->context)->dbg_fp;
368 if (unlikely(wr->num_sge < 1)) {
369 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "illegal num_sge: %d, minimum is 1\n",
374 if (likely(wr->sg_list[0].length >= MLX5_ETH_L2_INLINE_HEADER_SIZE)) {
375 inl_hdr_copy_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;
376 memcpy(eseg->inline_hdr_start,
377 (void *)(uintptr_t)wr->sg_list[0].addr,
380 for (j = 0; j < wr->num_sge && inl_hdr_size > 0; ++j) {
381 inl_hdr_copy_size = min(wr->sg_list[j].length,
383 memcpy(eseg->inline_hdr_start +
384 (MLX5_ETH_L2_INLINE_HEADER_SIZE - inl_hdr_size),
385 (void *)(uintptr_t)wr->sg_list[j].addr,
387 inl_hdr_size -= inl_hdr_copy_size;
389 if (unlikely(inl_hdr_size)) {
390 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "Ethernet headers < 16 bytes\n");
397 eseg->inline_hdr_sz = htobe16(MLX5_ETH_L2_INLINE_HEADER_SIZE);
399 /* If we copied all the sge into the inline-headers, then we need to
400 * start copying from the next sge into the data-segment.
402 if (unlikely(wr->sg_list[j].length == inl_hdr_copy_size)) {
404 inl_hdr_copy_size = 0;
407 sg_copy_ptr->index = j;
408 sg_copy_ptr->offset = inl_hdr_copy_size;
/* Round x up to the next multiple of 2^log_a. */
#define ALIGN(x, log_a) ((((x) + (1 << (log_a)) - 1)) & ~((1 << (log_a)) - 1))

/*
 * Round nentries up to a multiple of 8, halve the result and return it
 * as a big-endian 16-bit value.
 */
static inline uint16_t get_klm_octo(int nentries)
{
	int padded = ALIGN(nentries, 3);

	return htobe16(padded / 2);
}
421 static void set_umr_data_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
422 int32_t rkey, struct ibv_mw_bind_info *bind_info,
423 uint32_t qpn, void **seg, int *size)
426 struct mlx5_wqe_umr_klm_seg klm;
427 uint8_t reserved[64];
430 data->klm.byte_count = htobe32(bind_info->length);
431 data->klm.mkey = htobe32(bind_info->mr->lkey);
432 data->klm.address = htobe64(bind_info->addr);
434 memset(&data->klm + 1, 0, sizeof(data->reserved) -
437 *seg += sizeof(*data);
438 *size += (sizeof(*data) / 16);
441 static void set_umr_mkey_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
442 int32_t rkey, struct ibv_mw_bind_info *bind_info,
443 uint32_t qpn, void **seg, int *size)
445 struct mlx5_wqe_mkey_context_seg *mkey = *seg;
447 mkey->qpn_mkey = htobe32((rkey & 0xFF) |
448 ((type == IBV_MW_TYPE_1 || !bind_info->length) ?
449 0xFFFFFF00 : qpn << 8));
450 if (bind_info->length) {
451 /* Local read is set in kernel */
452 mkey->access_flags = 0;
454 if (bind_info->mw_access_flags & IBV_ACCESS_LOCAL_WRITE)
455 mkey->access_flags |=
456 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE;
457 if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_WRITE)
458 mkey->access_flags |=
459 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_WRITE;
460 if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_READ)
461 mkey->access_flags |=
462 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_READ;
463 if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_ATOMIC)
464 mkey->access_flags |=
465 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_ATOMIC;
466 if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED)
467 mkey->start_addr = 0;
469 mkey->start_addr = htobe64(bind_info->addr);
470 mkey->len = htobe64(bind_info->length);
472 mkey->free = MLX5_WQE_MKEY_CONTEXT_FREE;
475 *seg += sizeof(struct mlx5_wqe_mkey_context_seg);
476 *size += (sizeof(struct mlx5_wqe_mkey_context_seg) / 16);
479 static inline void set_umr_control_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
480 int32_t rkey, struct ibv_mw_bind_info *bind_info,
481 uint32_t qpn, void **seg, int *size)
483 struct mlx5_wqe_umr_ctrl_seg *ctrl = *seg;
485 ctrl->flags = MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET |
486 MLX5_WQE_UMR_CTRL_FLAG_INLINE;
487 ctrl->mkey_mask = htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_FREE |
488 MLX5_WQE_UMR_CTRL_MKEY_MASK_MKEY);
489 ctrl->translation_offset = 0;
490 memset(ctrl->rsvd0, 0, sizeof(ctrl->rsvd0));
491 memset(ctrl->rsvd1, 0, sizeof(ctrl->rsvd1));
493 if (type == IBV_MW_TYPE_2)
494 ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_QPN);
496 if (bind_info->length) {
497 ctrl->klm_octowords = get_klm_octo(1);
498 if (type == IBV_MW_TYPE_2)
499 ctrl->flags |= MLX5_WQE_UMR_CTRL_FLAG_CHECK_FREE;
500 ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN |
501 MLX5_WQE_UMR_CTRL_MKEY_MASK_START_ADDR |
502 MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE |
503 MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_READ |
504 MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_WRITE |
505 MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_ATOMIC);
507 ctrl->klm_octowords = get_klm_octo(0);
508 if (type == IBV_MW_TYPE_2)
509 ctrl->flags |= MLX5_WQE_UMR_CTRL_FLAG_CHECK_QPN;
512 *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
513 *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
516 static inline int set_bind_wr(struct mlx5_qp *qp, enum ibv_mw_type type,
517 int32_t rkey, struct ibv_mw_bind_info *bind_info,
518 uint32_t qpn, void **seg, int *size)
520 void *qend = qp->sq.qend;
523 if (bind_info->mw_access_flags &
524 ~(IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ |
525 IBV_ACCESS_REMOTE_WRITE))
529 (bind_info->mr->addr > (void *)bind_info->addr ||
530 bind_info->mr->addr + bind_info->mr->length <
531 (void *)bind_info->addr + bind_info->length ||
532 !(to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_MW_BIND) ||
533 (bind_info->mw_access_flags &
534 (IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_WRITE) &&
535 !(to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_LOCAL_WRITE))))
540 /* check that len > 2GB because KLM support only 2GB */
541 if (bind_info->length > 1UL << 31)
544 set_umr_control_seg(qp, type, rkey, bind_info, qpn, seg, size);
545 if (unlikely((*seg == qend)))
546 *seg = mlx5_get_send_wqe(qp, 0);
548 set_umr_mkey_seg(qp, type, rkey, bind_info, qpn, seg, size);
549 if (!bind_info->length)
552 if (unlikely((seg == qend)))
553 *seg = mlx5_get_send_wqe(qp, 0);
555 set_umr_data_seg(qp, type, rkey, bind_info, qpn, seg, size);
559 /* Copy the TSO header into the eth segment, taking padding and WQE
560 * wrap-around in the WQ buffer into account.
562 static inline int set_tso_eth_seg(void **seg, struct ibv_send_wr *wr,
563 void *qend, struct mlx5_qp *qp, int *size)
565 struct mlx5_wqe_eth_seg *eseg = *seg;
566 int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
567 uint64_t left, left_len, copy_sz;
568 void *pdata = wr->tso.hdr;
569 FILE *fp = to_mctx(qp->ibv_qp->context)->dbg_fp;
571 if (unlikely(wr->tso.hdr_sz < MLX5_ETH_L2_MIN_HEADER_SIZE ||
572 wr->tso.hdr_sz > qp->max_tso_header)) {
573 mlx5_dbg(fp, MLX5_DBG_QP_SEND,
574 "TSO header size should be at least %d and at most %d\n",
575 MLX5_ETH_L2_MIN_HEADER_SIZE,
580 left = wr->tso.hdr_sz;
581 eseg->mss = htobe16(wr->tso.mss);
582 eseg->inline_hdr_sz = htobe16(wr->tso.hdr_sz);
584 /* Check if there is space till the end of the queue. If so,
585 * copy everything in one shot; otherwise copy up to the end of the
586 * queue, wrap around to its start and then copy the remainder.
588 left_len = qend - (void *)eseg->inline_hdr_start;
589 copy_sz = min(left_len, left);
591 memcpy(eseg->inline_hdr_start, pdata, copy_sz);
593 /* The -1 is because there are already 16 bytes included in
594 * eseg->inline_hdr[16]
596 *seg += align(copy_sz - size_of_inl_hdr_start, 16) - 16;
597 *size += align(copy_sz - size_of_inl_hdr_start, 16) / 16 - 1;
599 /* The last wqe in the queue */
600 if (unlikely(copy_sz < left)) {
601 *seg = mlx5_get_send_wqe(qp, 0);
604 memcpy(*seg, pdata, left);
605 *seg += align(left, 16);
606 *size += align(left, 16) / 16;
612 static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
613 struct ibv_send_wr **bad_wr)
615 struct mlx5_context *ctx;
616 struct mlx5_qp *qp = to_mqp(ibqp);
618 struct mlx5_wqe_eth_seg *eseg;
619 struct mlx5_wqe_ctrl_seg *ctrl = NULL;
620 struct mlx5_wqe_data_seg *dpseg;
621 struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0};
629 struct mlx5_bf *bf = qp->bf;
630 void *qend = qp->sq.qend;
631 uint32_t mlx5_opcode;
632 struct mlx5_wqe_xrc_seg *xrc;
635 uint32_t max_tso = 0;
636 FILE *fp = to_mctx(ibqp->context)->dbg_fp; /* The compiler ignores in non-debug mode */
638 mlx5_spin_lock(&qp->sq.lock);
640 next_fence = qp->fm_cache;
642 for (nreq = 0; wr; ++nreq, wr = wr->next) {
643 if (unlikely(wr->opcode < 0 ||
644 wr->opcode >= sizeof mlx5_ib_opcode / sizeof mlx5_ib_opcode[0])) {
645 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "bad opcode %d\n", wr->opcode);
651 if (unlikely(mlx5_wq_overflow(&qp->sq, nreq,
652 to_mcq(qp->ibv_qp->send_cq)))) {
653 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "work queue overflow\n");
659 if (unlikely(wr->num_sge > qp->sq.max_gs)) {
660 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "max gs exceeded %d (max = %d)\n",
661 wr->num_sge, qp->sq.max_gs);
667 if (wr->send_flags & IBV_SEND_FENCE)
668 fence = MLX5_WQE_CTRL_FENCE;
672 idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
673 ctrl = seg = mlx5_get_send_wqe(qp, idx);
674 *(uint32_t *)(seg + 8) = 0;
675 ctrl->imm = send_ieth(wr);
676 ctrl->fm_ce_se = qp->sq_signal_bits | fence |
677 (wr->send_flags & IBV_SEND_SIGNALED ?
678 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
679 (wr->send_flags & IBV_SEND_SOLICITED ?
680 MLX5_WQE_CTRL_SOLICITED : 0);
683 size = sizeof *ctrl / 16;
685 switch (ibqp->qp_type) {
686 case IBV_QPT_XRC_SEND:
687 if (unlikely(wr->opcode != IBV_WR_BIND_MW &&
688 wr->opcode != IBV_WR_LOCAL_INV)) {
690 xrc->xrc_srqn = htobe32(wr->qp_type.xrc.remote_srqn);
692 size += sizeof(*xrc) / 16;
696 switch (wr->opcode) {
697 case IBV_WR_RDMA_READ:
698 case IBV_WR_RDMA_WRITE:
699 case IBV_WR_RDMA_WRITE_WITH_IMM:
700 set_raddr_seg(seg, wr->wr.rdma.remote_addr,
702 seg += sizeof(struct mlx5_wqe_raddr_seg);
703 size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
706 case IBV_WR_ATOMIC_CMP_AND_SWP:
707 case IBV_WR_ATOMIC_FETCH_AND_ADD:
708 if (unlikely(!qp->atomics_enabled)) {
709 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "atomic operations are not supported\n");
714 set_raddr_seg(seg, wr->wr.atomic.remote_addr,
716 seg += sizeof(struct mlx5_wqe_raddr_seg);
718 set_atomic_seg(seg, wr->opcode,
720 wr->wr.atomic.compare_add);
721 seg += sizeof(struct mlx5_wqe_atomic_seg);
723 size += (sizeof(struct mlx5_wqe_raddr_seg) +
724 sizeof(struct mlx5_wqe_atomic_seg)) / 16;
728 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
729 ctrl->imm = htobe32(wr->bind_mw.mw->rkey);
730 err = set_bind_wr(qp, wr->bind_mw.mw->type,
732 &wr->bind_mw.bind_info,
733 ibqp->qp_num, &seg, &size);
739 qp->sq.wr_data[idx] = IBV_WC_BIND_MW;
741 case IBV_WR_LOCAL_INV: {
742 struct ibv_mw_bind_info bind_info = {};
744 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
745 ctrl->imm = htobe32(wr->imm_data);
746 err = set_bind_wr(qp, IBV_MW_TYPE_2, 0,
747 &bind_info, ibqp->qp_num,
754 qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV;
764 switch (wr->opcode) {
765 case IBV_WR_RDMA_WRITE:
766 case IBV_WR_RDMA_WRITE_WITH_IMM:
767 set_raddr_seg(seg, wr->wr.rdma.remote_addr,
769 seg += sizeof(struct mlx5_wqe_raddr_seg);
770 size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
773 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
774 ctrl->imm = htobe32(wr->bind_mw.mw->rkey);
775 err = set_bind_wr(qp, wr->bind_mw.mw->type,
777 &wr->bind_mw.bind_info,
778 ibqp->qp_num, &seg, &size);
784 qp->sq.wr_data[idx] = IBV_WC_BIND_MW;
786 case IBV_WR_LOCAL_INV: {
787 struct ibv_mw_bind_info bind_info = {};
789 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
790 ctrl->imm = htobe32(wr->imm_data);
791 err = set_bind_wr(qp, IBV_MW_TYPE_2, 0,
792 &bind_info, ibqp->qp_num,
799 qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV;
809 set_datagram_seg(seg, wr);
810 seg += sizeof(struct mlx5_wqe_datagram_seg);
811 size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
812 if (unlikely((seg == qend)))
813 seg = mlx5_get_send_wqe(qp, 0);
816 case IBV_QPT_RAW_PACKET:
817 memset(seg, 0, sizeof(struct mlx5_wqe_eth_seg));
820 if (wr->send_flags & IBV_SEND_IP_CSUM) {
821 if (!(qp->qp_cap_cache & MLX5_CSUM_SUPPORT_RAW_OVER_ETH)) {
827 eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
830 if (wr->opcode == IBV_WR_TSO) {
831 max_tso = qp->max_tso;
832 err = set_tso_eth_seg(&seg, wr, qend, qp, &size);
838 err = copy_eth_inline_headers(ibqp, wr, seg, &sg_copy_ptr);
841 mlx5_dbg(fp, MLX5_DBG_QP_SEND,
842 "copy_eth_inline_headers failed, err: %d\n",
848 seg += sizeof(struct mlx5_wqe_eth_seg);
849 size += sizeof(struct mlx5_wqe_eth_seg) / 16;
856 if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
859 err = set_data_inl_seg(qp, wr, seg, &sz, &sg_copy_ptr);
862 mlx5_dbg(fp, MLX5_DBG_QP_SEND,
863 "inline layout failed, err %d\n", err);
870 for (i = sg_copy_ptr.index; i < wr->num_sge; ++i) {
871 if (unlikely(dpseg == qend)) {
872 seg = mlx5_get_send_wqe(qp, 0);
875 if (likely(wr->sg_list[i].length)) {
876 if (unlikely(wr->opcode ==
877 IBV_WR_ATOMIC_CMP_AND_SWP ||
879 IBV_WR_ATOMIC_FETCH_AND_ADD))
880 set_data_ptr_seg_atomic(dpseg, wr->sg_list + i);
882 if (unlikely(wr->opcode == IBV_WR_TSO)) {
883 if (max_tso < wr->sg_list[i].length) {
888 max_tso -= wr->sg_list[i].length;
890 set_data_ptr_seg(dpseg, wr->sg_list + i,
893 sg_copy_ptr.offset = 0;
895 size += sizeof(struct mlx5_wqe_data_seg) / 16;
900 mlx5_opcode = mlx5_ib_opcode[wr->opcode];
901 ctrl->opmod_idx_opcode = htobe32(((qp->sq.cur_post & 0xffff) << 8) |
904 ctrl->qpn_ds = htobe32(size | (ibqp->qp_num << 8));
906 if (unlikely(qp->wq_sig))
907 ctrl->signature = wq_sig(ctrl);
909 qp->sq.wrid[idx] = wr->wr_id;
910 qp->sq.wqe_head[idx] = qp->sq.head + nreq;
911 qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
914 if (mlx5_debug_mask & MLX5_DBG_QP_SEND)
915 dump_wqe(to_mctx(ibqp->context)->dbg_fp, idx, size, qp);
922 qp->fm_cache = next_fence;
925 * Make sure that descriptors are written before
926 * updating doorbell record and ringing the doorbell
928 udma_to_device_barrier();
929 qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff);
931 /* Make sure that the doorbell write happens before the memcpy
932 * to WC memory below */
933 ctx = to_mctx(ibqp->context);
935 mmio_wc_spinlock(&bf->lock.lock);
939 if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn &&
940 (inl || ctx->prefer_bf) && size > 1 &&
941 size <= bf->buf_size / 16)
942 mlx5_bf_copy(bf->reg + bf->offset, (unsigned long long *)ctrl,
943 align(size * 16, 64), qp);
945 mlx5_write64((__be32 *)ctrl, bf->reg + bf->offset,
949 * use mmio_flush_writes() to ensure write combining buffers are flushed out
950 * of the running CPU. This must be carried inside the spinlock.
951 * Otherwise, there is a potential race. In the race, CPU A
952 * writes doorbell 1, which is waiting in the WC buffer. CPU B
953 * writes doorbell 2, and its write is flushed earlier. Since
954 * the mmio_flush_writes is CPU local, this will result in the HCA seeing
955 * doorbell 2, followed by doorbell 1.
956 * Flush before toggling bf_offset to be latency oriented.
959 bf->offset ^= bf->buf_size;
961 mlx5_spin_unlock(&bf->lock);
964 mlx5_spin_unlock(&qp->sq.lock);
969 int mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
970 struct ibv_send_wr **bad_wr)
973 if (wr->opcode == IBV_WR_BIND_MW) {
974 if (wr->bind_mw.mw->type == IBV_MW_TYPE_1)
977 if (!wr->bind_mw.bind_info.mr ||
978 !wr->bind_mw.bind_info.addr ||
979 !wr->bind_mw.bind_info.length)
982 if (wr->bind_mw.bind_info.mr->pd != wr->bind_mw.mw->pd)
987 return _mlx5_post_send(ibqp, wr, bad_wr);
990 int mlx5_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
991 struct ibv_mw_bind *mw_bind)
993 struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
994 struct ibv_send_wr wr = {};
995 struct ibv_send_wr *bad_wr = NULL;
998 if (!bind_info->mr && (bind_info->addr || bind_info->length)) {
1003 if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) {
1008 if (bind_info->mr) {
1009 if (to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_ZERO_BASED) {
1014 if (mw->pd != bind_info->mr->pd) {
1020 wr.opcode = IBV_WR_BIND_MW;
1022 wr.wr_id = mw_bind->wr_id;
1023 wr.send_flags = mw_bind->send_flags;
1024 wr.bind_mw.bind_info = mw_bind->bind_info;
1026 wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
1028 ret = _mlx5_post_send(qp, &wr, &bad_wr);
1032 mw->rkey = wr.bind_mw.rkey;
1037 static void set_sig_seg(struct mlx5_qp *qp, struct mlx5_rwqe_sig *sig,
1038 int size, uint16_t idx)
1041 uint32_t qpn = qp->ibv_qp->qp_num;
1043 sign = calc_sig(sig, size);
1044 sign ^= calc_sig(&qpn, 4);
1045 sign ^= calc_sig(&idx, 2);
1046 sig->signature = sign;
1049 static void set_wq_sig_seg(struct mlx5_rwq *rwq, struct mlx5_rwqe_sig *sig,
1050 int size, uint16_t idx)
1053 uint32_t qpn = rwq->wq.wq_num;
1055 sign = calc_sig(sig, size);
1056 sign ^= calc_sig(&qpn, 4);
1057 sign ^= calc_sig(&idx, 2);
1058 sig->signature = sign;
1061 int mlx5_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr,
1062 struct ibv_recv_wr **bad_wr)
1064 struct mlx5_rwq *rwq = to_mrwq(ibwq);
1065 struct mlx5_wqe_data_seg *scat;
1070 struct mlx5_rwqe_sig *sig;
1072 mlx5_spin_lock(&rwq->rq.lock);
1074 ind = rwq->rq.head & (rwq->rq.wqe_cnt - 1);
1076 for (nreq = 0; wr; ++nreq, wr = wr->next) {
1077 if (unlikely(mlx5_wq_overflow(&rwq->rq, nreq,
1078 to_mcq(rwq->wq.cq)))) {
1084 if (unlikely(wr->num_sge > rwq->rq.max_gs)) {
1090 scat = get_wq_recv_wqe(rwq, ind);
1091 sig = (struct mlx5_rwqe_sig *)scat;
1092 if (unlikely(rwq->wq_sig)) {
1093 memset(sig, 0, 1 << rwq->rq.wqe_shift);
1097 for (i = 0, j = 0; i < wr->num_sge; ++i) {
1098 if (unlikely(!wr->sg_list[i].length))
1100 set_data_ptr_seg(scat + j++, wr->sg_list + i, 0);
1103 if (j < rwq->rq.max_gs) {
1104 scat[j].byte_count = 0;
1105 scat[j].lkey = htobe32(MLX5_INVALID_LKEY);
1109 if (unlikely(rwq->wq_sig))
1110 set_wq_sig_seg(rwq, sig, (wr->num_sge + 1) << 4,
1111 rwq->rq.head & 0xffff);
1113 rwq->rq.wrid[ind] = wr->wr_id;
1115 ind = (ind + 1) & (rwq->rq.wqe_cnt - 1);
1120 rwq->rq.head += nreq;
1122 * Make sure that descriptors are written before
1125 udma_to_device_barrier();
1126 *(rwq->recv_db) = htobe32(rwq->rq.head & 0xffff);
1129 mlx5_spin_unlock(&rwq->rq.lock);
1134 int mlx5_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
1135 struct ibv_recv_wr **bad_wr)
1137 struct mlx5_qp *qp = to_mqp(ibqp);
1138 struct mlx5_wqe_data_seg *scat;
1143 struct mlx5_rwqe_sig *sig;
1145 mlx5_spin_lock(&qp->rq.lock);
1147 ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
1149 for (nreq = 0; wr; ++nreq, wr = wr->next) {
1150 if (unlikely(mlx5_wq_overflow(&qp->rq, nreq,
1151 to_mcq(qp->ibv_qp->recv_cq)))) {
1157 if (unlikely(wr->num_sge > qp->rq.max_gs)) {
1163 scat = get_recv_wqe(qp, ind);
1164 sig = (struct mlx5_rwqe_sig *)scat;
1165 if (unlikely(qp->wq_sig)) {
1166 memset(sig, 0, 1 << qp->rq.wqe_shift);
1170 for (i = 0, j = 0; i < wr->num_sge; ++i) {
1171 if (unlikely(!wr->sg_list[i].length))
1173 set_data_ptr_seg(scat + j++, wr->sg_list + i, 0);
1176 if (j < qp->rq.max_gs) {
1177 scat[j].byte_count = 0;
1178 scat[j].lkey = htobe32(MLX5_INVALID_LKEY);
1182 if (unlikely(qp->wq_sig))
1183 set_sig_seg(qp, sig, (wr->num_sge + 1) << 4,
1184 qp->rq.head & 0xffff);
1186 qp->rq.wrid[ind] = wr->wr_id;
1188 ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
1193 qp->rq.head += nreq;
1196 * Make sure that descriptors are written before
1199 udma_to_device_barrier();
1202 * For Raw Packet QP, avoid updating the doorbell record
1203 * as long as the QP isn't in RTR state, to avoid receiving
1204 * packets in illegal states.
1205 * This is only for Raw Packet QPs since they are represented
1206 * differently in the hardware.
1208 if (likely(!(ibqp->qp_type == IBV_QPT_RAW_PACKET &&
1209 ibqp->state < IBV_QPS_RTR)))
1210 qp->db[MLX5_RCV_DBR] = htobe32(qp->rq.head & 0xffff);
1213 mlx5_spin_unlock(&qp->rq.lock);
1218 int mlx5_use_huge(const char *key)
1222 if (e && !strcmp(e, "y"))
1228 struct mlx5_qp *mlx5_find_qp(struct mlx5_context *ctx, uint32_t qpn)
1230 int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1232 if (ctx->qp_table[tind].refcnt)
1233 return ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK];
1238 int mlx5_store_qp(struct mlx5_context *ctx, uint32_t qpn, struct mlx5_qp *qp)
1240 int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1242 if (!ctx->qp_table[tind].refcnt) {
1243 ctx->qp_table[tind].table = calloc(MLX5_QP_TABLE_MASK + 1,
1244 sizeof(struct mlx5_qp *));
1245 if (!ctx->qp_table[tind].table)
1249 ++ctx->qp_table[tind].refcnt;
1250 ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = qp;
1254 void mlx5_clear_qp(struct mlx5_context *ctx, uint32_t qpn)
1256 int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1258 if (!--ctx->qp_table[tind].refcnt)
1259 free(ctx->qp_table[tind].table);
1261 ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = NULL;