/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
        [IBV_WR_SEND]                   = MLX4_OPCODE_SEND,
        [IBV_WR_SEND_WITH_IMM]          = MLX4_OPCODE_SEND_IMM,
        [IBV_WR_RDMA_WRITE]             = MLX4_OPCODE_RDMA_WRITE,
        [IBV_WR_RDMA_WRITE_WITH_IMM]    = MLX4_OPCODE_RDMA_WRITE_IMM,
        [IBV_WR_RDMA_READ]              = MLX4_OPCODE_RDMA_READ,
        [IBV_WR_ATOMIC_CMP_AND_SWP]     = MLX4_OPCODE_ATOMIC_CS,
        [IBV_WR_ATOMIC_FETCH_AND_ADD]   = MLX4_OPCODE_ATOMIC_FA,
        [IBV_WR_LOCAL_INV]              = MLX4_OPCODE_LOCAL_INVAL,
        [IBV_WR_BIND_MW]                = MLX4_OPCODE_BIND_MW,
        [IBV_WR_SEND_WITH_INV]          = MLX4_OPCODE_SEND_INVAL,
};

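/*
 * The send and receive rings live in one contiguous buffer (qp->buf);
 * each ring starts at its own offset and every WQE slot has a
 * power-of-two stride, so slot n is located by shifting the index by
 * the ring's wqe_shift.
 */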
static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
        return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
        return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
        uint32_t *wqe = get_send_wqe(qp, n);
        int i;
        int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

        for (i = 16; i < ds; i += 16)
                wqe[i] = 0xffffffff;
}

void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
        qp->sq.head      = 0;
        qp->sq.tail      = 0;
        qp->rq.head      = 0;
        qp->rq.tail      = 0;
}

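/*
 * Initialize every send WQE slot: set the hardware ownership bit,
 * record a fence_size covering the full slot, and stamp the slot so a
 * prefetch of a never-posted WQE cannot be mistaken for valid work
 * (see stamp_send_wqe() above).
 */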
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
        struct mlx4_wqe_ctrl_seg *ctrl;
        int i;

        for (i = 0; i < qp->sq.wqe_cnt; ++i) {
                ctrl = get_send_wqe(qp, i);
                ctrl->owner_opcode = htobe32(1 << 31);
                ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

                stamp_send_wqe(qp, i);
        }
}

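/*
 * Check whether posting nreq more WQEs would overflow the work queue.
 * The first head/tail comparison is done without a lock; only if the
 * queue looks full do we take the CQ lock and re-read the indices,
 * since the tail is advanced from the CQ polling path.
 */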
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
        unsigned cur;

        cur = wq->head - wq->tail;
        if (cur + nreq < wq->max_post)
                return 0;

        pthread_spin_lock(&cq->lock);
        cur = wq->head - wq->tail;
        pthread_spin_unlock(&cq->lock);

        return cur + nreq >= wq->max_post;
}

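/*
 * The helpers below fill in the individual WQE segments (memory window
 * bind, local invalidate, remote address, atomic, datagram and
 * scatter/gather data).  All multi-byte fields are written in
 * big-endian byte order, as the HCA expects.
 */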
static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
{
        int acc = wr->bind_mw.bind_info.mw_access_flags;
        bseg->flags1 = 0;
        if (acc & IBV_ACCESS_REMOTE_ATOMIC)
                bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
        if (acc & IBV_ACCESS_REMOTE_WRITE)
                bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
        if (acc & IBV_ACCESS_REMOTE_READ)
                bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);

        bseg->flags2 = 0;
        if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
                bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
        if (acc & IBV_ACCESS_ZERO_BASED)
                bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);

        bseg->new_rkey = htobe32(wr->bind_mw.rkey);
        bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
        bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
        bseg->length = htobe64(wr->bind_mw.bind_info.length);
}

static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
                uint32_t rkey)
{
        iseg->mem_key   = htobe32(rkey);

        iseg->reserved1    = 0;
        iseg->reserved2    = 0;
        iseg->reserved3[0] = 0;
        iseg->reserved3[1] = 0;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
                                 uint64_t remote_addr, uint32_t rkey)
{
        rseg->raddr    = htobe64(remote_addr);
        rseg->rkey     = htobe32(rkey);
        rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
        if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
                aseg->swap_add = htobe64(wr->wr.atomic.swap);
                aseg->compare  = htobe64(wr->wr.atomic.compare_add);
        } else {
                aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
                aseg->compare  = 0;
        }
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
                             struct ibv_send_wr *wr)
{
        memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
        dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
        dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
        dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
        memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

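/*
 * __set_data_seg() is used on the receive path and writes byte_count
 * directly.  set_data_seg() is the send-path variant: it fills in lkey
 * and addr first, writes byte_count last behind a DMA barrier (see the
 * comment in the function), and uses the special value 0x80000000 for
 * a zero-length SGE.
 */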
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
        dseg->byte_count = htobe32(sg->length);
        dseg->lkey       = htobe32(sg->lkey);
        dseg->addr       = htobe64(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
        dseg->lkey       = htobe32(sg->lkey);
        dseg->addr       = htobe64(sg->addr);

        /*
         * Need a barrier here before writing the byte_count field to
         * make sure that all the data is visible before the
         * byte_count field is set.  Otherwise, if the segment begins
         * a new cacheline, the HCA prefetcher could grab the 64-byte
         * chunk and get a valid (!= 0xffffffff) byte count but
         * stale data, and end up sending the wrong data.
         */
        udma_to_device_barrier();

        if (likely(sg->length))
                dseg->byte_count = htobe32(sg->length);
        else
                dseg->byte_count = htobe32(0x80000000);
}

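/*
 * Post a chain of send work requests to the send queue.  For each WR
 * we build a control segment plus the transport-specific segments,
 * copy or point to the payload, and finally write the ownership/opcode
 * word behind a barrier.  The doorbell (or a BlueFlame copy, for a
 * single small inline-capable WQE) is rung once for the whole chain
 * after the loop.
 *
 * Callers normally reach this through the generic verbs entry point.
 * Illustrative sketch only; the qp, mr, buf and len names are made up:
 *
 *      struct ibv_sge sge = {
 *              .addr   = (uintptr_t) buf,
 *              .length = len,
 *              .lkey   = mr->lkey,
 *      };
 *      struct ibv_send_wr wr = {
 *              .wr_id      = 1,
 *              .sg_list    = &sge,
 *              .num_sge    = 1,
 *              .opcode     = IBV_WR_SEND,
 *              .send_flags = IBV_SEND_SIGNALED,
 *      }, *bad_wr;
 *
 *      if (ibv_post_send(qp, &wr, &bad_wr))
 *              perror("ibv_post_send");
 */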
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
                          struct ibv_send_wr **bad_wr)
{
        struct mlx4_context *ctx;
        struct mlx4_qp *qp = to_mqp(ibqp);
        void *wqe;
        struct mlx4_wqe_ctrl_seg *ctrl = NULL;
        int ind;
        int nreq;
        int inl = 0;
        int ret = 0;
        int size = 0;
        int i;

        pthread_spin_lock(&qp->sq.lock);

        /* XXX check that state is OK to post send */

        ind = qp->sq.head;

        for (nreq = 0; wr; ++nreq, wr = wr->next) {
                if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
                        ret = ENOMEM;
                        *bad_wr = wr;
                        goto out;
                }

                if (wr->num_sge > qp->sq.max_gs) {
                        ret = ENOMEM;
                        *bad_wr = wr;
                        goto out;
                }

                if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
                        ret = EINVAL;
                        *bad_wr = wr;
                        goto out;
                }

                ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
                qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

                ctrl->srcrb_flags =
                        (wr->send_flags & IBV_SEND_SIGNALED ?
                         htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
                        (wr->send_flags & IBV_SEND_SOLICITED ?
                         htobe32(MLX4_WQE_CTRL_SOLICIT) : 0)   |
                        qp->sq_signal_bits;

                if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
                    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
                        ctrl->imm = wr->imm_data;
                else
                        ctrl->imm = 0;

                wqe += sizeof *ctrl;
                size = sizeof *ctrl / 16;

                switch (ibqp->qp_type) {
                case IBV_QPT_XRC_SEND:
                        ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
                        /* fall through */
                case IBV_QPT_RC:
                case IBV_QPT_UC:
                        switch (wr->opcode) {
                        case IBV_WR_ATOMIC_CMP_AND_SWP:
                        case IBV_WR_ATOMIC_FETCH_AND_ADD:
                                set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
                                              wr->wr.atomic.rkey);
                                wqe  += sizeof (struct mlx4_wqe_raddr_seg);

                                set_atomic_seg(wqe, wr);
                                wqe  += sizeof (struct mlx4_wqe_atomic_seg);
                                size += (sizeof (struct mlx4_wqe_raddr_seg) +
                                         sizeof (struct mlx4_wqe_atomic_seg)) / 16;

                                break;

                        case IBV_WR_RDMA_READ:
                                inl = 1;
                                /* fall through */
                        case IBV_WR_RDMA_WRITE:
                        case IBV_WR_RDMA_WRITE_WITH_IMM:
                                if (!wr->num_sge)
                                        inl = 1;
                                set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
                                              wr->wr.rdma.rkey);
                                wqe  += sizeof (struct mlx4_wqe_raddr_seg);
                                size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

                                break;
                        case IBV_WR_LOCAL_INV:
                                ctrl->srcrb_flags |=
                                        htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
                                set_local_inv_seg(wqe, wr->imm_data);
                                wqe  += sizeof
                                        (struct mlx4_wqe_local_inval_seg);
                                size += sizeof
                                        (struct mlx4_wqe_local_inval_seg) / 16;
                                break;
                        case IBV_WR_BIND_MW:
                                ctrl->srcrb_flags |=
                                        htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
                                set_bind_seg(wqe, wr);
                                wqe  += sizeof
                                        (struct mlx4_wqe_bind_seg);
                                size += sizeof
                                        (struct mlx4_wqe_bind_seg) / 16;
                                break;
                        case IBV_WR_SEND_WITH_INV:
                                ctrl->imm = htobe32(wr->imm_data);
                                break;

                        default:
                                /* No extra segments required for sends */
                                break;
                        }
                        break;

                case IBV_QPT_UD:
                        set_datagram_seg(wqe, wr);
                        wqe  += sizeof (struct mlx4_wqe_datagram_seg);
                        size += sizeof (struct mlx4_wqe_datagram_seg) / 16;

                        if (wr->send_flags & IBV_SEND_IP_CSUM) {
                                if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
                                        ret = EINVAL;
                                        *bad_wr = wr;
                                        goto out;
                                }
                                ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
                                                           MLX4_WQE_CTRL_TCP_UDP_CSUM);
                        }
                        break;

                case IBV_QPT_RAW_PACKET:
                        /* For raw Ethernet, the MLX4_WQE_CTRL_SOLICIT flag is
                         * used to indicate that no ICRC should be calculated. */
                        ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
                        if (wr->send_flags & IBV_SEND_IP_CSUM) {
                                if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
                                        ret = EINVAL;
                                        *bad_wr = wr;
                                        goto out;
                                }
                                ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
                                                           MLX4_WQE_CTRL_TCP_UDP_CSUM);
                        }
                        break;

                default:
                        break;
                }

                if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
                        struct mlx4_wqe_inline_seg *seg;
                        void *addr;
                        int len, seg_len;
                        int num_seg;
                        int off, to_copy;

                        inl = 0;

                        seg = wqe;
                        wqe += sizeof *seg;
                        off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
                        num_seg = 0;
                        seg_len = 0;

                        for (i = 0; i < wr->num_sge; ++i) {
                                addr = (void *) (uintptr_t) wr->sg_list[i].addr;
                                len  = wr->sg_list[i].length;
                                inl += len;

                                if (inl > qp->max_inline_data) {
                                        inl = 0;
                                        ret = ENOMEM;
                                        *bad_wr = wr;
                                        goto out;
                                }

                                while (len >= MLX4_INLINE_ALIGN - off) {
                                        to_copy = MLX4_INLINE_ALIGN - off;
                                        memcpy(wqe, addr, to_copy);
                                        len -= to_copy;
                                        wqe += to_copy;
                                        addr += to_copy;
                                        seg_len += to_copy;
                                        udma_to_device_barrier(); /* see comment below */
                                        seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
                                        seg_len = 0;
                                        seg = wqe;
                                        wqe += sizeof *seg;
                                        off = sizeof *seg;
                                        ++num_seg;
                                }

                                memcpy(wqe, addr, len);
                                wqe += len;
                                seg_len += len;
                                off += len;
                        }

                        if (seg_len) {
                                ++num_seg;
                                /*
                                 * Need a barrier here to make sure
                                 * all the data is visible before the
                                 * byte_count field is set.  Otherwise
                                 * the HCA prefetcher could grab the
                                 * 64-byte chunk with this inline
                                 * segment and get a valid (!=
                                 * 0xffffffff) byte count but stale
                                 * data, and end up sending the wrong
                                 * data.
                                 */
                                udma_to_device_barrier();
                                seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
                        }

                        size += (inl + num_seg * sizeof *seg + 15) / 16;
                } else {
                        struct mlx4_wqe_data_seg *seg = wqe;

                        for (i = wr->num_sge - 1; i >= 0; --i)
                                set_data_seg(seg + i, wr->sg_list + i);

                        size += wr->num_sge * (sizeof *seg / 16);
                }

                ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
                                    MLX4_WQE_CTRL_FENCE : 0) | size;

                /*
                 * Make sure descriptor is fully written before
                 * setting ownership bit (because HW can start
                 * executing as soon as we do).
                 */
                udma_to_device_barrier();

                ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
                        (ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);

                /*
                 * We can improve latency by not stamping the last
                 * send queue WQE until after ringing the doorbell, so
                 * only stamp here if there are still more WQEs to post.
                 */
                if (wr->next)
                        stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
                                       (qp->sq.wqe_cnt - 1));

                ++ind;
        }

out:
        ctx = to_mctx(ibqp->context);

        if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
                ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);

                ctrl->bf_qpn |= qp->doorbell_qpn;
                ++qp->sq.head;
                /*
                 * Make sure that descriptor is written to memory
                 * before writing to BlueFlame page.
                 */
                mmio_wc_spinlock(&ctx->bf_lock);

                mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
                             align(size * 16, 64));
                /* Flush before toggling bf_offset to be latency oriented */
                mmio_flush_writes();

                ctx->bf_offset ^= ctx->bf_buf_size;

                pthread_spin_unlock(&ctx->bf_lock);
        } else if (nreq) {
                qp->sq.head += nreq;

                /*
                 * Make sure that descriptors are written before
                 * doorbell record.
                 */
                udma_to_device_barrier();

                mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
                            qp->doorbell_qpn);
        }

        if (nreq)
                stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
                               (qp->sq.wqe_cnt - 1));

        pthread_spin_unlock(&qp->sq.lock);

        return ret;
}

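/*
 * Post a chain of receive work requests.  A receive WQE is simply a
 * list of scatter entries; when fewer than max_gs SGEs are used, a
 * terminating entry with MLX4_INVALID_LKEY is added.  The receive
 * doorbell record (*qp->db) is updated once after the loop.
 */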
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
                   struct ibv_recv_wr **bad_wr)
{
        struct mlx4_qp *qp = to_mqp(ibqp);
        struct mlx4_wqe_data_seg *scat;
        int ret = 0;
        int nreq;
        int ind;
        int i;

        pthread_spin_lock(&qp->rq.lock);

        /* XXX check that state is OK to post receive */

        ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

        for (nreq = 0; wr; ++nreq, wr = wr->next) {
                if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
                        ret = ENOMEM;
                        *bad_wr = wr;
                        goto out;
                }

                if (wr->num_sge > qp->rq.max_gs) {
                        ret = ENOMEM;
                        *bad_wr = wr;
                        goto out;
                }

                scat = get_recv_wqe(qp, ind);

                for (i = 0; i < wr->num_sge; ++i)
                        __set_data_seg(scat + i, wr->sg_list + i);

                if (i < qp->rq.max_gs) {
                        scat[i].byte_count = 0;
                        scat[i].lkey       = htobe32(MLX4_INVALID_LKEY);
                        scat[i].addr       = 0;
                }

                qp->rq.wrid[ind] = wr->wr_id;

                ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
        }

out:
        if (nreq) {
                qp->rq.head += nreq;

                /*
                 * Make sure that descriptors are written before
                 * doorbell record.
                 */
                udma_to_device_barrier();

                *qp->db = htobe32(qp->rq.head & 0xffff);
        }

        pthread_spin_unlock(&qp->rq.lock);

        return ret;
}

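/*
 * Worked example (illustrative, assuming a 4-byte inline segment
 * header as declared in wqe.h): with MLX4_INLINE_ALIGN == 64 each
 * chunk can carry at most 60 bytes of inline payload.  For an RC QP
 * asking for 128 bytes of inline data, the adjustment below adds the
 * 32 bytes of control + remote address segments, giving data = 160,
 * and the function returns (160 + 59) / 60 = 3 segment headers.
 */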
static int num_inline_segs(int data, enum ibv_qp_type type)
{
        /*
         * Inline data segments are not allowed to cross 64 byte
         * boundaries.  For UD QPs, the data segments always start
         * aligned to 64 bytes (16 byte control segment + 48 byte
         * datagram segment); for other QPs, there will be a 16 byte
         * control segment and possibly a 16 byte remote address
         * segment, so in the worst case there will be only 32 bytes
         * available for the first data segment.
         */
        if (type == IBV_QPT_UD)
                data += (sizeof (struct mlx4_wqe_ctrl_seg) +
                         sizeof (struct mlx4_wqe_datagram_seg)) %
                        MLX4_INLINE_ALIGN;
        else
                data += (sizeof (struct mlx4_wqe_ctrl_seg) +
                         sizeof (struct mlx4_wqe_raddr_seg)) %
                        MLX4_INLINE_ALIGN;

        return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
                (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

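/*
 * Compute the send WQE stride for a QP: the worst-case space needed
 * for the control segment, the transport-specific segments and the
 * scatter/gather or inline data, rounded up to the next power of two
 * (recorded as sq.wqe_shift).
 */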
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
                           struct mlx4_qp *qp)
{
        int size;
        int max_sq_sge;

        max_sq_sge       = align(cap->max_inline_data +
                                 num_inline_segs(cap->max_inline_data, type) *
                                 sizeof (struct mlx4_wqe_inline_seg),
                                 sizeof (struct mlx4_wqe_data_seg)) /
                sizeof (struct mlx4_wqe_data_seg);
        if (max_sq_sge < cap->max_send_sge)
                max_sq_sge = cap->max_send_sge;

        size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
        switch (type) {
        case IBV_QPT_UD:
                size += sizeof (struct mlx4_wqe_datagram_seg);
                break;

        case IBV_QPT_UC:
                size += sizeof (struct mlx4_wqe_raddr_seg);
                break;

        case IBV_QPT_XRC_SEND:
        case IBV_QPT_RC:
                size += sizeof (struct mlx4_wqe_raddr_seg);
                /*
                 * An atomic op will require an atomic segment, a
                 * remote address segment and one scatter entry.
                 */
                if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
                            sizeof (struct mlx4_wqe_raddr_seg) +
                            sizeof (struct mlx4_wqe_data_seg)))
                        size = (sizeof (struct mlx4_wqe_atomic_seg) +
                                sizeof (struct mlx4_wqe_raddr_seg) +
                                sizeof (struct mlx4_wqe_data_seg));
                break;

        default:
                break;
        }

        /* Make sure that we have enough space for a bind request */
        if (size < sizeof (struct mlx4_wqe_bind_seg))
                size = sizeof (struct mlx4_wqe_bind_seg);

        size += sizeof (struct mlx4_wqe_ctrl_seg);

        for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
             qp->sq.wqe_shift++)
                ; /* nothing */
}

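/*
 * Allocate the wrid tracking arrays and the page-aligned buffer that
 * holds both work queues.  The queue with the larger WQE stride is
 * placed at the start of the buffer, which keeps the second ring
 * aligned to its own (smaller or equal) stride.
 */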
int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
                       enum ibv_qp_type type, struct mlx4_qp *qp)
{
        qp->rq.max_gs    = cap->max_recv_sge;

        if (qp->sq.wqe_cnt) {
                qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
                if (!qp->sq.wrid)
                        return -1;
        }

        if (qp->rq.wqe_cnt) {
                qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
                if (!qp->rq.wrid) {
                        free(qp->sq.wrid);
                        return -1;
                }
        }

        for (qp->rq.wqe_shift = 4;
             1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
             qp->rq.wqe_shift++)
                ; /* nothing */

        qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
                (qp->sq.wqe_cnt << qp->sq.wqe_shift);
        if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
                qp->rq.offset = 0;
                qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
        } else {
                qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
                qp->sq.offset = 0;
        }

        if (qp->buf_size) {
                if (mlx4_alloc_buf(&qp->buf,
                                   align(qp->buf_size, to_mdev(context->device)->page_size),
                                   to_mdev(context->device)->page_size)) {
                        free(qp->sq.wrid);
                        free(qp->rq.wrid);
                        return -1;
                }

                memset(qp->buf.buf, 0, qp->buf_size);
        } else {
                qp->buf.buf = NULL;
        }

        return 0;
}

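/*
 * Derive the per-QP send queue limits (maximum SGEs, maximum
 * outstanding WQEs and maximum inline data) from the WQE stride chosen
 * in mlx4_calc_sq_wqe_size(), and report them back through *cap.
 */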
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
                       enum ibv_qp_type type)
{
        int wqe_size;

        wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
        switch (type) {
        case IBV_QPT_UD:
                wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
                break;

        case IBV_QPT_XRC_SEND:
        case IBV_QPT_UC:
        case IBV_QPT_RC:
                wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
                break;

        default:
                break;
        }

        qp->sq.max_gs        = wqe_size / sizeof (struct mlx4_wqe_data_seg);
        cap->max_send_sge    = qp->sq.max_gs;
        qp->sq.max_post      = qp->sq.wqe_cnt - qp->sq_spare_wqes;
        cap->max_send_wr     = qp->sq.max_post;

        /*
         * Inline data segments can't cross a 64 byte boundary.  So
         * subtract off one segment header for each 64-byte chunk,
         * taking into account the fact that wqe_size will be 32 mod
         * 64 for non-UD QPs.
         */
        qp->max_inline_data  = wqe_size -
                sizeof (struct mlx4_wqe_inline_seg) *
                (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
        cap->max_inline_data = qp->max_inline_data;
}

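/*
 * The context keeps a two-level table mapping QP numbers to mlx4_qp
 * structures so that the CQ poll path can find the QP for a
 * completion.  The top level is indexed by the high bits of the QPN
 * and its second-level tables are allocated lazily; refcnt counts the
 * QPs stored under each second-level table.
 */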
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
        int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

        if (ctx->qp_table[tind].refcnt)
                return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
        else
                return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
        int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

        if (!ctx->qp_table[tind].refcnt) {
                ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
                                                   sizeof (struct mlx4_qp *));
                if (!ctx->qp_table[tind].table)
                        return -1;
        }

        ++ctx->qp_table[tind].refcnt;
        ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
        return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
        int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

        if (!--ctx->qp_table[tind].refcnt)
                free(ctx->qp_table[tind].table);
        else
                ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}