/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
46 static const uint32_t mlx4_ib_opcode[] = {
47 [IBV_WR_SEND] = MLX4_OPCODE_SEND,
48 [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM,
49 [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE,
50 [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM,
51 [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ,
52 [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS,
53 [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA,
54 [IBV_WR_LOCAL_INV] = MLX4_OPCODE_LOCAL_INVAL,
55 [IBV_WR_BIND_MW] = MLX4_OPCODE_BIND_MW,
56 [IBV_WR_SEND_WITH_INV] = MLX4_OPCODE_SEND_INVAL,
59 static void *get_recv_wqe(struct mlx4_qp *qp, int n)
61 return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
64 static void *get_send_wqe(struct mlx4_qp *qp, int n)
66 return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
70 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
71 * first four bytes of every 64 byte chunk with 0xffffffff, except for
72 * the very first chunk of the WQE.
74 static void stamp_send_wqe(struct mlx4_qp *qp, int n)
76 uint32_t *wqe = get_send_wqe(qp, n);
78 int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;
80 for (i = 16; i < ds; i += 16)
84 void mlx4_init_qp_indices(struct mlx4_qp *qp)
92 void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
94 struct mlx4_wqe_ctrl_seg *ctrl;
97 for (i = 0; i < qp->sq.wqe_cnt; ++i) {
98 ctrl = get_send_wqe(qp, i);
99 ctrl->owner_opcode = htobe32(1 << 31);
100 ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
102 stamp_send_wqe(qp, i);
106 static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
110 cur = wq->head - wq->tail;
111 if (cur + nreq < wq->max_post)
114 pthread_spin_lock(&cq->lock);
115 cur = wq->head - wq->tail;
116 pthread_spin_unlock(&cq->lock);
118 return cur + nreq >= wq->max_post;
121 static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
123 int acc = wr->bind_mw.bind_info.mw_access_flags;
125 if (acc & IBV_ACCESS_REMOTE_ATOMIC)
126 bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
127 if (acc & IBV_ACCESS_REMOTE_WRITE)
128 bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
129 if (acc & IBV_ACCESS_REMOTE_READ)
130 bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);
133 if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
134 bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
135 if (acc & IBV_ACCESS_ZERO_BASED)
136 bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);
138 bseg->new_rkey = htobe32(wr->bind_mw.rkey);
139 bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
140 bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
141 bseg->length = htobe64(wr->bind_mw.bind_info.length);
144 static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
147 iseg->mem_key = htobe32(rkey);
151 iseg->reserved3[0] = 0;
152 iseg->reserved3[1] = 0;
155 static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
156 uint64_t remote_addr, uint32_t rkey)
158 rseg->raddr = htobe64(remote_addr);
159 rseg->rkey = htobe32(rkey);
163 static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
165 if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
166 aseg->swap_add = htobe64(wr->wr.atomic.swap);
167 aseg->compare = htobe64(wr->wr.atomic.compare_add);
169 aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
175 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
176 struct ibv_send_wr *wr)
178 memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
179 dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
180 dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
181 dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
182 memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
185 static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
187 dseg->byte_count = htobe32(sg->length);
188 dseg->lkey = htobe32(sg->lkey);
189 dseg->addr = htobe64(sg->addr);
192 static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
194 dseg->lkey = htobe32(sg->lkey);
195 dseg->addr = htobe64(sg->addr);
198 * Need a barrier here before writing the byte_count field to
199 * make sure that all the data is visible before the
200 * byte_count field is set. Otherwise, if the segment begins
201 * a new cacheline, the HCA prefetcher could grab the 64-byte
202 * chunk and get a valid (!= * 0xffffffff) byte count but
203 * stale data, and end up sending the wrong data.
205 udma_to_device_barrier();
207 if (likely(sg->length))
208 dseg->byte_count = htobe32(sg->length);
210 dseg->byte_count = htobe32(0x80000000);
213 int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
214 struct ibv_send_wr **bad_wr)
216 struct mlx4_context *ctx;
217 struct mlx4_qp *qp = to_mqp(ibqp);
219 struct mlx4_wqe_ctrl_seg *ctrl = NULL;
227 pthread_spin_lock(&qp->sq.lock);
229 /* XXX check that state is OK to post send */
233 for (nreq = 0; wr; ++nreq, wr = wr->next) {
234 if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
240 if (wr->num_sge > qp->sq.max_gs) {
246 if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
252 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
253 qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
256 (wr->send_flags & IBV_SEND_SIGNALED ?
257 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
258 (wr->send_flags & IBV_SEND_SOLICITED ?
259 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) |
262 if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
263 wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
264 ctrl->imm = wr->imm_data;
269 size = sizeof *ctrl / 16;
271 switch (ibqp->qp_type) {
272 case IBV_QPT_XRC_SEND:
273 ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
277 switch (wr->opcode) {
278 case IBV_WR_ATOMIC_CMP_AND_SWP:
279 case IBV_WR_ATOMIC_FETCH_AND_ADD:
280 set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
282 wqe += sizeof (struct mlx4_wqe_raddr_seg);
284 set_atomic_seg(wqe, wr);
285 wqe += sizeof (struct mlx4_wqe_atomic_seg);
286 size += (sizeof (struct mlx4_wqe_raddr_seg) +
287 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
291 case IBV_WR_RDMA_READ:
294 case IBV_WR_RDMA_WRITE:
295 case IBV_WR_RDMA_WRITE_WITH_IMM:
298 set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
300 wqe += sizeof (struct mlx4_wqe_raddr_seg);
301 size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
304 case IBV_WR_LOCAL_INV:
306 htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
307 set_local_inv_seg(wqe, wr->imm_data);
309 (struct mlx4_wqe_local_inval_seg);
311 (struct mlx4_wqe_local_inval_seg) / 16;
315 htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
316 set_bind_seg(wqe, wr);
318 (struct mlx4_wqe_bind_seg);
320 (struct mlx4_wqe_bind_seg) / 16;
322 case IBV_WR_SEND_WITH_INV:
323 ctrl->imm = htobe32(wr->imm_data);
327 /* No extra segments required for sends */
333 set_datagram_seg(wqe, wr);
334 wqe += sizeof (struct mlx4_wqe_datagram_seg);
335 size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
337 if (wr->send_flags & IBV_SEND_IP_CSUM) {
338 if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
343 ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
344 MLX4_WQE_CTRL_TCP_UDP_CSUM);
348 case IBV_QPT_RAW_PACKET:
349 /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
350 * to indicate that no icrc should be calculated */
351 ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
352 if (wr->send_flags & IBV_SEND_IP_CSUM) {
353 if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
358 ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
359 MLX4_WQE_CTRL_TCP_UDP_CSUM);
367 if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
368 struct mlx4_wqe_inline_seg *seg;
378 off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
382 for (i = 0; i < wr->num_sge; ++i) {
383 addr = (void *) (uintptr_t) wr->sg_list[i].addr;
384 len = wr->sg_list[i].length;
387 if (inl > qp->max_inline_data) {
394 while (len >= MLX4_INLINE_ALIGN - off) {
395 to_copy = MLX4_INLINE_ALIGN - off;
396 memcpy(wqe, addr, to_copy);
401 udma_to_device_barrier(); /* see comment below */
402 seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
410 memcpy(wqe, addr, len);
419 * Need a barrier here to make sure
420 * all the data is visible before the
421 * byte_count field is set. Otherwise
422 * the HCA prefetcher could grab the
423 * 64-byte chunk with this inline
424 * segment and get a valid (!=
425 * 0xffffffff) byte count but stale
426 * data, and end up sending the wrong
429 udma_to_device_barrier();
430 seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
433 size += (inl + num_seg * sizeof * seg + 15) / 16;
435 struct mlx4_wqe_data_seg *seg = wqe;
437 for (i = wr->num_sge - 1; i >= 0 ; --i)
438 set_data_seg(seg + i, wr->sg_list + i);
440 size += wr->num_sge * (sizeof *seg / 16);
443 ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
444 MLX4_WQE_CTRL_FENCE : 0) | size;
447 * Make sure descriptor is fully written before
448 * setting ownership bit (because HW can start
449 * executing as soon as we do).
451 udma_to_device_barrier();
453 ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
454 (ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);
457 * We can improve latency by not stamping the last
458 * send queue WQE until after ringing the doorbell, so
459 * only stamp here if there are still more WQEs to post.
462 stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
463 (qp->sq.wqe_cnt - 1));
469 ctx = to_mctx(ibqp->context);
471 if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
472 ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
474 ctrl->bf_qpn |= qp->doorbell_qpn;
477 * Make sure that descriptor is written to memory
478 * before writing to BlueFlame page.
480 mmio_wc_spinlock(&ctx->bf_lock);
482 mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
483 align(size * 16, 64));
484 /* Flush before toggling bf_offset to be latency oriented */
487 ctx->bf_offset ^= ctx->bf_buf_size;
489 pthread_spin_unlock(&ctx->bf_lock);
494 * Make sure that descriptors are written before
497 udma_to_device_barrier();
499 mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
504 stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
505 (qp->sq.wqe_cnt - 1));
507 pthread_spin_unlock(&qp->sq.lock);
512 int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
513 struct ibv_recv_wr **bad_wr)
515 struct mlx4_qp *qp = to_mqp(ibqp);
516 struct mlx4_wqe_data_seg *scat;
522 pthread_spin_lock(&qp->rq.lock);
524 /* XXX check that state is OK to post receive */
526 ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
528 for (nreq = 0; wr; ++nreq, wr = wr->next) {
529 if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
535 if (wr->num_sge > qp->rq.max_gs) {
541 scat = get_recv_wqe(qp, ind);
543 for (i = 0; i < wr->num_sge; ++i)
544 __set_data_seg(scat + i, wr->sg_list + i);
546 if (i < qp->rq.max_gs) {
547 scat[i].byte_count = 0;
548 scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
552 qp->rq.wrid[ind] = wr->wr_id;
554 ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
562 * Make sure that descriptors are written before
565 udma_to_device_barrier();
567 *qp->db = htobe32(qp->rq.head & 0xffff);
570 pthread_spin_unlock(&qp->rq.lock);
575 static int num_inline_segs(int data, enum ibv_qp_type type)
578 * Inline data segments are not allowed to cross 64 byte
579 * boundaries. For UD QPs, the data segments always start
580 * aligned to 64 bytes (16 byte control segment + 48 byte
581 * datagram segment); for other QPs, there will be a 16 byte
582 * control segment and possibly a 16 byte remote address
583 * segment, so in the worst case there will be only 32 bytes
584 * available for the first data segment.
586 if (type == IBV_QPT_UD)
587 data += (sizeof (struct mlx4_wqe_ctrl_seg) +
588 sizeof (struct mlx4_wqe_datagram_seg)) %
591 data += (sizeof (struct mlx4_wqe_ctrl_seg) +
592 sizeof (struct mlx4_wqe_raddr_seg)) %
595 return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
596 (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
599 void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
605 max_sq_sge = align(cap->max_inline_data +
606 num_inline_segs(cap->max_inline_data, type) *
607 sizeof (struct mlx4_wqe_inline_seg),
608 sizeof (struct mlx4_wqe_data_seg)) /
609 sizeof (struct mlx4_wqe_data_seg);
610 if (max_sq_sge < cap->max_send_sge)
611 max_sq_sge = cap->max_send_sge;
613 size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
616 size += sizeof (struct mlx4_wqe_datagram_seg);
620 size += sizeof (struct mlx4_wqe_raddr_seg);
623 case IBV_QPT_XRC_SEND:
625 size += sizeof (struct mlx4_wqe_raddr_seg);
627 * An atomic op will require an atomic segment, a
628 * remote address segment and one scatter entry.
630 if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
631 sizeof (struct mlx4_wqe_raddr_seg) +
632 sizeof (struct mlx4_wqe_data_seg)))
633 size = (sizeof (struct mlx4_wqe_atomic_seg) +
634 sizeof (struct mlx4_wqe_raddr_seg) +
635 sizeof (struct mlx4_wqe_data_seg));
642 /* Make sure that we have enough space for a bind request */
643 if (size < sizeof (struct mlx4_wqe_bind_seg))
644 size = sizeof (struct mlx4_wqe_bind_seg);
646 size += sizeof (struct mlx4_wqe_ctrl_seg);
648 for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
653 int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
654 enum ibv_qp_type type, struct mlx4_qp *qp)
656 qp->rq.max_gs = cap->max_recv_sge;
658 if (qp->sq.wqe_cnt) {
659 qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
664 if (qp->rq.wqe_cnt) {
665 qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
672 for (qp->rq.wqe_shift = 4;
673 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
677 qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
678 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
679 if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
681 qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
683 qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
688 if (mlx4_alloc_buf(&qp->buf,
689 align(qp->buf_size, to_mdev(context->device)->page_size),
690 to_mdev(context->device)->page_size)) {
696 memset(qp->buf.buf, 0, qp->buf_size);
704 void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
705 enum ibv_qp_type type)
709 wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
712 wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
715 case IBV_QPT_XRC_SEND:
718 wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
725 qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg);
726 cap->max_send_sge = qp->sq.max_gs;
727 qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
728 cap->max_send_wr = qp->sq.max_post;
731 * Inline data segments can't cross a 64 byte boundary. So
732 * subtract off one segment header for each 64-byte chunk,
733 * taking into account the fact that wqe_size will be 32 mod
736 qp->max_inline_data = wqe_size -
737 sizeof (struct mlx4_wqe_inline_seg) *
738 (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
739 cap->max_inline_data = qp->max_inline_data;
742 struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
744 int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
746 if (ctx->qp_table[tind].refcnt)
747 return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
752 int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
754 int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
756 if (!ctx->qp_table[tind].refcnt) {
757 ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
758 sizeof (struct mlx4_qp *));
759 if (!ctx->qp_table[tind].table)
763 ++ctx->qp_table[tind].refcnt;
764 ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
768 void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
770 int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
772 if (!--ctx->qp_table[tind].refcnt)
773 free(ctx->qp_table[tind].table);
775 ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;