/*
 * Copyright (c) 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2007 Cisco, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

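/*
 * Translation from libibverbs work request opcodes to the opcodes the
 * hardware expects in the low byte of the control segment's
 * owner_opcode word.
 */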
static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
};

static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}

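/*
 * Note on the arithmetic above: the low six bits of fence_size hold
 * the WQE size in 16-byte units, so (fence_size & 0x3f) << 2 converts
 * it to a count of 32-bit words, and stepping i by 16 words advances
 * exactly one 64-byte chunk per iteration.
 */
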
void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head = 0;
	qp->sq.tail = 0;
	qp->rq.head = 0;
	qp->rq.tail = 0;
}

void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

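/*
 * The head and tail counters below increase without bound and are only
 * reduced modulo the queue size when indexing into the queue buffer,
 * so head - tail is the number of outstanding WQEs even after the
 * counters wrap (unsigned arithmetic).  For example, head = 0x10002
 * and tail = 0xffff8 gives head - tail = 10 posted WQEs.  The CQ lock
 * is taken for the second check to synchronize with the poll path,
 * which advances wq->tail as completions are reaped.
 */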
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htonll(remote_addr);
	rseg->rkey     = htonl(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htonll(wr->wr.atomic.swap);
		aseg->compare  = htonll(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htonll(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htonl(wr->wr.ud.remote_qpn);
	dseg->qkey = htonl(wr->wr.ud.remote_qkey);
	dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htonl(sg->length);
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = htonl(sg->length);
}

/*
 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
 * implementations may use move-string-buffer assembler instructions,
 * which do not guarantee order of copying.
 */
static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
{
	while (bytecnt > 0) {
		*dst++ = *src++;
		*dst++ = *src++;
		bytecnt -= 2 * sizeof (long);
	}
}

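/*
 * mlx4_bf_copy() assumes bytecnt is a multiple of 2 * sizeof (long);
 * the caller satisfies this by rounding the WQE size up to a 64 byte
 * boundary before copying, so the loop never underflows.
 */
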
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall through */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}

			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

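		/*
		 * The non-inline path above fills the gather list in
		 * reverse so that, within each 64-byte chunk, the
		 * stamp word is overwritten last and a prefetching
		 * HCA never sees a valid byte_count ahead of complete
		 * segment data.
		 */
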
		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}

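/*
 * Doorbell note: small single-WQE posts go out through the BlueFlame
 * page above, which hands the HCA the whole descriptor in one
 * write-combining MMIO burst and saves it the DMA read of the WQE;
 * everything else falls back to a plain 32-bit doorbell write that
 * tells the HCA to fetch the descriptors from memory.
 */
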
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htonl(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db = htonl(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}

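/*
 * Unlike the send side, the receive queue has no BlueFlame fast path:
 * qp->db points at a doorbell record in host memory that the HCA
 * reads, so a single ordered store of the new head is enough.
 */
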
int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

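/*
 * Worked example (assuming MLX4_INLINE_ALIGN == 64 and a 4 byte
 * inline segment header): each 64-byte chunk holds at most 60 bytes
 * of payload.  For a UD QP, (16 + 48) % 64 == 0, so 128 bytes of
 * inline data cost (128 + 59) / 60 = 3 inline segments.
 */
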
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge = align(cap->max_inline_data +
			   num_inline_segs(cap->max_inline_data, type) *
			   sizeof (struct mlx4_wqe_inline_seg),
			   sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		      enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs = cap->max_recv_sge;

	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
	if (!qp->sq.wrid)
		return -1;

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (mlx4_alloc_buf(&qp->buf,
			   align(qp->buf_size, to_mdev(pd->context->device)->page_size),
			   to_mdev(pd->context->device)->page_size)) {
		free(qp->sq.wrid);
		free(qp->rq.wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);

	return 0;
}

void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;
	struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);

	wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
		sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC:
	case IBV_QPT_UC:
	case IBV_QPT_RC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge    = min(ctx->max_sge, qp->sq.max_gs);
	qp->sq.max_post	     = min(ctx->max_qp_wr,
				   qp->sq.wqe_cnt - qp->sq_spare_wqes);
	cap->max_send_wr     = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data  = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}

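/*
 * Worked example for the computation above (RC QP, 128 byte WQEs,
 * assuming a 4 byte inline segment header): wqe_size = 128 - 16
 * (control) - 16 (remote address) = 96; align(96, 64) / 64 = 2
 * chunks, so max_inline_data = 96 - 2 * 4 = 88 bytes.
 */
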
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

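/*
 * The QP table is two-level: the high bits of the QPN (tind) select a
 * lazily allocated second-level array, and the low qp_table_mask bits
 * index into it.  refcnt counts how many QPs live in each second-level
 * array so that the array can be freed when its last QP is cleared.
 */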
int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}

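/*
 * Usage sketch (hypothetical caller, going through the libibverbs API
 * that dispatches to mlx4_post_send): post one signaled SEND of a
 * registered buffer `buf` of length `len` with local key `lkey`.
 *
 *	struct ibv_sge sge = {
 *		.addr   = (uintptr_t) buf,
 *		.length = len,
 *		.lkey   = lkey,
 *	};
 *	struct ibv_send_wr wr = {
 *		.wr_id      = 1,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IBV_WR_SEND,
 *		.send_flags = IBV_SEND_SIGNALED,
 *	};
 *	struct ibv_send_wr *bad_wr;
 *
 *	if (ibv_post_send(qp, &wr, &bad_wr))
 *		;	// handle error; bad_wr points at the failed WR
 */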