/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mthca.h"
#include "doorbell.h"
#include "wqe.h"

enum {
        MTHCA_SEND_DOORBELL_FENCE = 1 << 5
};

static const uint8_t mthca_opcode[] = {
        [IBV_WR_SEND]                 = MTHCA_OPCODE_SEND,
        [IBV_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
        [IBV_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
        [IBV_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
        [IBV_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
        [IBV_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
        [IBV_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
};

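/*
 * A QP's work queues live in one contiguous buffer: receive WQEs
 * start at offset 0 and send WQEs start at send_wqe_offset.  Each
 * queue's WQE stride is a power of two (1 << wqe_shift), so WQE n
 * is located by shifting its index.
 */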
static void *get_recv_wqe(struct mthca_qp *qp, int n)
{
        return qp->buf.buf + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mthca_qp *qp, int n)
{
        return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
}

void mthca_init_qp_indices(struct mthca_qp *qp)
{
        qp->sq.next_ind  = 0;
        qp->sq.last_comp = qp->sq.max - 1;
        qp->sq.head      = 0;
        qp->sq.tail      = 0;
        qp->sq.last      = get_send_wqe(qp, qp->sq.max - 1);

        qp->rq.next_ind  = 0;
        qp->rq.last_comp = qp->rq.max - 1;
        qp->rq.head      = 0;
        qp->rq.tail      = 0;
        qp->rq.last      = get_recv_wqe(qp, qp->rq.max - 1);
}

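/*
 * Check whether posting nreq more WQEs would overflow the work queue.
 * head - tail is read without a lock first (unsigned wraparound is
 * harmless); only if the queue looks full do we take the CQ lock and
 * re-read, since the tail is only advanced by the CQ polling path.
 */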
static inline int wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq)
{
        unsigned cur;

        cur = wq->head - wq->tail;
        if (cur + nreq < wq->max)
                return 0;

        pthread_spin_lock(&cq->lock);
        cur = wq->head - wq->tail;
        pthread_spin_unlock(&cq->lock);

        return cur + nreq >= wq->max;
}

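/*
 * Post a chain of send work requests on a Tavor-family HCA.  Each WQE
 * is built in place, linked into the previous WQE's next segment so
 * the hardware can follow the chain, and a single MMIO doorbell for
 * the first new WQE is rung at the end.
 */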
int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
                          struct ibv_send_wr **bad_wr)
{
        struct mthca_qp *qp = to_mqp(ibqp);
        void *wqe, *prev_wqe;
        int ind;
        int nreq;
        int ret = 0;
        int size;
        int size0 = 0;
        int i;
        /*
         * f0 and op0 cannot be used unless nreq > 0, which means this
         * function makes it through the loop at least once.  So the
         * code inside the if (!size0) will be executed, and f0 and
         * op0 will be initialized.  So any gcc warning about "may be
         * used uninitialized" is bogus.
         */
        uint32_t f0;
        uint32_t op0;

        pthread_spin_lock(&qp->sq.lock);

        ind = qp->sq.next_ind;

        for (nreq = 0; wr; ++nreq, wr = wr->next) {
                if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                wqe = get_send_wqe(qp, ind);
                prev_wqe = qp->sq.last;
                qp->sq.last = wqe;

                ((struct mthca_next_seg *) wqe)->nda_op = 0;
                ((struct mthca_next_seg *) wqe)->ee_nds = 0;
                ((struct mthca_next_seg *) wqe)->flags =
                        ((wr->send_flags & IBV_SEND_SIGNALED) ?
                         htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
                        ((wr->send_flags & IBV_SEND_SOLICITED) ?
                         htonl(MTHCA_NEXT_SOLICIT) : 0)   |
                        htonl(1);
                if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
                    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
                        ((struct mthca_next_seg *) wqe)->imm = wr->imm_data;

                wqe += sizeof (struct mthca_next_seg);
                size = sizeof (struct mthca_next_seg) / 16;

                switch (ibqp->qp_type) {
                case IBV_QPT_RC:
                        switch (wr->opcode) {
                        case IBV_WR_ATOMIC_CMP_AND_SWP:
                        case IBV_WR_ATOMIC_FETCH_AND_ADD:
                                ((struct mthca_raddr_seg *) wqe)->raddr =
                                        htonll(wr->wr.atomic.remote_addr);
                                ((struct mthca_raddr_seg *) wqe)->rkey =
                                        htonl(wr->wr.atomic.rkey);
                                ((struct mthca_raddr_seg *) wqe)->reserved = 0;

                                wqe += sizeof (struct mthca_raddr_seg);

                                if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
                                        ((struct mthca_atomic_seg *) wqe)->swap_add =
                                                htonll(wr->wr.atomic.swap);
                                        ((struct mthca_atomic_seg *) wqe)->compare =
                                                htonll(wr->wr.atomic.compare_add);
                                } else {
                                        ((struct mthca_atomic_seg *) wqe)->swap_add =
                                                htonll(wr->wr.atomic.compare_add);
                                        ((struct mthca_atomic_seg *) wqe)->compare = 0;
                                }

                                wqe += sizeof (struct mthca_atomic_seg);
                                size += (sizeof (struct mthca_raddr_seg) +
                                         sizeof (struct mthca_atomic_seg)) / 16;
                                break;

                        case IBV_WR_RDMA_WRITE:
                        case IBV_WR_RDMA_WRITE_WITH_IMM:
                        case IBV_WR_RDMA_READ:
                                ((struct mthca_raddr_seg *) wqe)->raddr =
                                        htonll(wr->wr.rdma.remote_addr);
                                ((struct mthca_raddr_seg *) wqe)->rkey =
                                        htonl(wr->wr.rdma.rkey);
                                ((struct mthca_raddr_seg *) wqe)->reserved = 0;
                                wqe += sizeof (struct mthca_raddr_seg);
                                size += sizeof (struct mthca_raddr_seg) / 16;
                                break;

                        default:
                                /* No extra segments required for sends */
                                break;
                        }

                        break;

                case IBV_QPT_UC:
                        switch (wr->opcode) {
                        case IBV_WR_RDMA_WRITE:
                        case IBV_WR_RDMA_WRITE_WITH_IMM:
                                ((struct mthca_raddr_seg *) wqe)->raddr =
                                        htonll(wr->wr.rdma.remote_addr);
                                ((struct mthca_raddr_seg *) wqe)->rkey =
                                        htonl(wr->wr.rdma.rkey);
                                ((struct mthca_raddr_seg *) wqe)->reserved = 0;
                                wqe += sizeof (struct mthca_raddr_seg);
                                size += sizeof (struct mthca_raddr_seg) / 16;
                                break;

                        default:
                                /* No extra segments required for sends */
                                break;
                        }

                        break;

                case IBV_QPT_UD:
                        ((struct mthca_tavor_ud_seg *) wqe)->lkey =
                                htonl(to_mah(wr->wr.ud.ah)->key);
                        ((struct mthca_tavor_ud_seg *) wqe)->av_addr =
                                htonll((uintptr_t) to_mah(wr->wr.ud.ah)->av);
                        ((struct mthca_tavor_ud_seg *) wqe)->dqpn =
                                htonl(wr->wr.ud.remote_qpn);
                        ((struct mthca_tavor_ud_seg *) wqe)->qkey =
                                htonl(wr->wr.ud.remote_qkey);

                        wqe += sizeof (struct mthca_tavor_ud_seg);
                        size += sizeof (struct mthca_tavor_ud_seg) / 16;
                        break;

                default:
                        break;
                }

                if (wr->num_sge > qp->sq.max_gs) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

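                /*
                 * For IBV_SEND_INLINE the payload is copied directly
                 * into the WQE after a single mthca_inline_seg header,
                 * so the HCA never reads the source buffers; otherwise
                 * one 16-byte mthca_data_seg (byte count, lkey, address)
                 * is written per scatter/gather entry.
                 */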
                if (wr->send_flags & IBV_SEND_INLINE) {
                        if (wr->num_sge) {
                                struct mthca_inline_seg *seg = wqe;
                                int s = 0;

                                wqe += sizeof *seg;
                                for (i = 0; i < wr->num_sge; ++i) {
                                        struct ibv_sge *sge = &wr->sg_list[i];

                                        s += sge->length;

                                        if (s > qp->max_inline_data) {
                                                ret = -1;
                                                *bad_wr = wr;
                                                goto out;
                                        }

                                        memcpy(wqe, (void *) (intptr_t) sge->addr,
                                               sge->length);
                                        wqe += sge->length;
                                }

                                seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
                                size += align(s + sizeof *seg, 16) / 16;
                        }
                } else {
                        struct mthca_data_seg *seg;

                        for (i = 0; i < wr->num_sge; ++i) {
                                seg = wqe;
                                seg->byte_count = htonl(wr->sg_list[i].length);
                                seg->lkey = htonl(wr->sg_list[i].lkey);
                                seg->addr = htonll(wr->sg_list[i].addr);
                                wqe += sizeof *seg;
                        }

                        size += wr->num_sge * (sizeof *seg / 16);
                }

                qp->wrid[ind + qp->rq.max] = wr->wr_id;

                if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

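                /*
                 * Link this WQE into the previous one: nda_op holds the
                 * new WQE's offset within the buffer together with its
                 * opcode, while ee_nds holds its size in 16-byte chunks
                 * plus the doorbell-request and fence bits.
                 */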
                ((struct mthca_next_seg *) prev_wqe)->nda_op =
                        htonl(((ind << qp->sq.wqe_shift) +
                               qp->send_wqe_offset) |
                              mthca_opcode[wr->opcode]);
                /*
                 * Make sure that nda_op is written before setting ee_nds.
                 */
                wmb();
                ((struct mthca_next_seg *) prev_wqe)->ee_nds =
                        htonl((size0 ? 0 : MTHCA_NEXT_DBD) | size |
                              ((wr->send_flags & IBV_SEND_FENCE) ?
                               MTHCA_NEXT_FENCE : 0));

                if (!size0) {
                        size0 = size;
                        op0   = mthca_opcode[wr->opcode];
                        f0    = wr->send_flags & IBV_SEND_FENCE ?
                                MTHCA_SEND_DOORBELL_FENCE : 0;
                }

                ++ind;
                if (ind >= qp->sq.max)
                        ind -= qp->sq.max;
        }

out:
        if (nreq) {
                uint32_t doorbell[2];

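                /*
                 * Tavor send doorbell: the first word carries the offset
                 * of the first new WQE, the fence flag and its opcode;
                 * the second carries the QP number and the first WQE's
                 * size in 16-byte chunks.
                 */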
                doorbell[0] = htonl(((qp->sq.next_ind << qp->sq.wqe_shift) +
                                     qp->send_wqe_offset) | f0 | op0);
                doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

                mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
        }

        qp->sq.next_ind = ind;
        qp->sq.head    += nreq;

        pthread_spin_unlock(&qp->sq.lock);
        return ret;
}

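/*
 * Post a chain of receive work requests on a Tavor-family HCA.  The
 * receive doorbell encodes the number of new WQEs in its low byte, so
 * a doorbell is rung inside the loop after every
 * MTHCA_TAVOR_MAX_WQES_PER_RECV_DB requests and once more at the end
 * for any remainder.
 */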
int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
                          struct ibv_recv_wr **bad_wr)
{
        struct mthca_qp *qp = to_mqp(ibqp);
        uint32_t doorbell[2];
        int ret = 0;
        int nreq;
        int i;
        int size;
        int size0 = 0;
        int ind;
        void *wqe;
        void *prev_wqe;

        pthread_spin_lock(&qp->rq.lock);

        ind = qp->rq.next_ind;

        for (nreq = 0; wr; wr = wr->next) {
                if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                wqe = get_recv_wqe(qp, ind);
                prev_wqe = qp->rq.last;
                qp->rq.last = wqe;

                ((struct mthca_next_seg *) wqe)->ee_nds =
                        htonl(MTHCA_NEXT_DBD);
                ((struct mthca_next_seg *) wqe)->flags =
                        htonl(MTHCA_NEXT_CQ_UPDATE);

                wqe += sizeof (struct mthca_next_seg);
                size = sizeof (struct mthca_next_seg) / 16;

                if (wr->num_sge > qp->rq.max_gs) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                for (i = 0; i < wr->num_sge; ++i) {
                        ((struct mthca_data_seg *) wqe)->byte_count =
                                htonl(wr->sg_list[i].length);
                        ((struct mthca_data_seg *) wqe)->lkey =
                                htonl(wr->sg_list[i].lkey);
                        ((struct mthca_data_seg *) wqe)->addr =
                                htonll(wr->sg_list[i].addr);
                        wqe += sizeof (struct mthca_data_seg);
                        size += sizeof (struct mthca_data_seg) / 16;
                }

                qp->wrid[ind] = wr->wr_id;

                ((struct mthca_next_seg *) prev_wqe)->ee_nds =
                        htonl(MTHCA_NEXT_DBD | size);

                if (!size0)
                        size0 = size;

                ++ind;
                if (ind >= qp->rq.max)
                        ind -= qp->rq.max;

                ++nreq;
                if (nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) {
                        nreq = 0;

                        doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
                        doorbell[1] = htonl(ibqp->qp_num << 8);

                        /*
                         * Make sure that descriptors are written
                         * before doorbell is rung.
                         */
                        wmb();

                        mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);

                        qp->rq.next_ind = ind;
                        qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
                        size0 = 0;
                }
        }

out:
        if (nreq) {
                doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
                doorbell[1] = htonl((ibqp->qp_num << 8) | nreq);

                /*
                 * Make sure that descriptors are written before
                 * doorbell is rung.
                 */
                wmb();

                mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);
        }

        qp->rq.next_ind = ind;
        qp->rq.head    += nreq;

        pthread_spin_unlock(&qp->rq.lock);
        return ret;
}

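/*
 * Post a chain of send work requests on an Arbel (mem-free) HCA.  In
 * addition to the MMIO doorbell, the doorbell record at *qp->sq.db
 * must be updated so the HCA can read the current head from memory,
 * and at most MTHCA_ARBEL_MAX_WQES_PER_SEND_DB WQEs are posted per
 * doorbell.
 */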
int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
                          struct ibv_send_wr **bad_wr)
{
        struct mthca_qp *qp = to_mqp(ibqp);
        uint32_t doorbell[2];
        void *wqe, *prev_wqe;
        int ind;
        int nreq;
        int ret = 0;
        int size;
        int size0 = 0;
        int i;
        /*
         * f0 and op0 cannot be used unless nreq > 0, which means this
         * function makes it through the loop at least once.  So the
         * code inside the if (!size0) will be executed, and f0 and
         * op0 will be initialized.  So any gcc warning about "may be
         * used uninitialized" is bogus.
         */
        uint32_t f0;
        uint32_t op0;

        pthread_spin_lock(&qp->sq.lock);

        /* XXX check that state is OK to post send */

        ind = qp->sq.head & (qp->sq.max - 1);

        for (nreq = 0; wr; ++nreq, wr = wr->next) {
                if (nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB) {
                        nreq = 0;

                        doorbell[0] = htonl((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
                                            ((qp->sq.head & 0xffff) << 8) | f0 | op0);
                        doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

                        qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;

                        /*
                         * Make sure that descriptors are written before
                         * doorbell record.
                         */
                        wmb();
                        *qp->sq.db = htonl(qp->sq.head & 0xffff);

                        /*
                         * Make sure doorbell record is written before we
                         * write MMIO send doorbell.
                         */
                        wmb();
                        mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);

                        size0 = 0;
                }

                if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                wqe = get_send_wqe(qp, ind);
                prev_wqe = qp->sq.last;
                qp->sq.last = wqe;

                ((struct mthca_next_seg *) wqe)->flags =
                        ((wr->send_flags & IBV_SEND_SIGNALED) ?
                         htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
                        ((wr->send_flags & IBV_SEND_SOLICITED) ?
                         htonl(MTHCA_NEXT_SOLICIT) : 0)   |
                        htonl(1);
                if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
                    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
                        ((struct mthca_next_seg *) wqe)->imm = wr->imm_data;

                wqe += sizeof (struct mthca_next_seg);
                size = sizeof (struct mthca_next_seg) / 16;

                switch (ibqp->qp_type) {
                case IBV_QPT_RC:
                        switch (wr->opcode) {
                        case IBV_WR_ATOMIC_CMP_AND_SWP:
                        case IBV_WR_ATOMIC_FETCH_AND_ADD:
                                ((struct mthca_raddr_seg *) wqe)->raddr =
                                        htonll(wr->wr.atomic.remote_addr);
                                ((struct mthca_raddr_seg *) wqe)->rkey =
                                        htonl(wr->wr.atomic.rkey);
                                ((struct mthca_raddr_seg *) wqe)->reserved = 0;

                                wqe += sizeof (struct mthca_raddr_seg);

                                if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
                                        ((struct mthca_atomic_seg *) wqe)->swap_add =
                                                htonll(wr->wr.atomic.swap);
                                        ((struct mthca_atomic_seg *) wqe)->compare =
                                                htonll(wr->wr.atomic.compare_add);
                                } else {
                                        ((struct mthca_atomic_seg *) wqe)->swap_add =
                                                htonll(wr->wr.atomic.compare_add);
                                        ((struct mthca_atomic_seg *) wqe)->compare = 0;
                                }

                                wqe += sizeof (struct mthca_atomic_seg);
                                size += (sizeof (struct mthca_raddr_seg) +
                                         sizeof (struct mthca_atomic_seg)) / 16;
                                break;

                        case IBV_WR_RDMA_WRITE:
                        case IBV_WR_RDMA_WRITE_WITH_IMM:
                        case IBV_WR_RDMA_READ:
                                ((struct mthca_raddr_seg *) wqe)->raddr =
                                        htonll(wr->wr.rdma.remote_addr);
                                ((struct mthca_raddr_seg *) wqe)->rkey =
                                        htonl(wr->wr.rdma.rkey);
                                ((struct mthca_raddr_seg *) wqe)->reserved = 0;
                                wqe += sizeof (struct mthca_raddr_seg);
                                size += sizeof (struct mthca_raddr_seg) / 16;
                                break;

                        default:
                                /* No extra segments required for sends */
                                break;
                        }

                        break;

                case IBV_QPT_UC:
                        switch (wr->opcode) {
                        case IBV_WR_RDMA_WRITE:
                        case IBV_WR_RDMA_WRITE_WITH_IMM:
                                ((struct mthca_raddr_seg *) wqe)->raddr =
                                        htonll(wr->wr.rdma.remote_addr);
                                ((struct mthca_raddr_seg *) wqe)->rkey =
                                        htonl(wr->wr.rdma.rkey);
                                ((struct mthca_raddr_seg *) wqe)->reserved = 0;
                                wqe += sizeof (struct mthca_raddr_seg);
                                size += sizeof (struct mthca_raddr_seg) / 16;
                                break;

                        default:
                                /* No extra segments required for sends */
                                break;
                        }

                        break;

                case IBV_QPT_UD:
                        memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
                               to_mah(wr->wr.ud.ah)->av, sizeof (struct mthca_av));
                        ((struct mthca_arbel_ud_seg *) wqe)->dqpn =
                                htonl(wr->wr.ud.remote_qpn);
                        ((struct mthca_arbel_ud_seg *) wqe)->qkey =
                                htonl(wr->wr.ud.remote_qkey);

                        wqe += sizeof (struct mthca_arbel_ud_seg);
                        size += sizeof (struct mthca_arbel_ud_seg) / 16;
                        break;

                default:
                        break;
                }

                if (wr->num_sge > qp->sq.max_gs) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                if (wr->send_flags & IBV_SEND_INLINE) {
                        if (wr->num_sge) {
                                struct mthca_inline_seg *seg = wqe;
                                int s = 0;

                                wqe += sizeof *seg;
                                for (i = 0; i < wr->num_sge; ++i) {
                                        struct ibv_sge *sge = &wr->sg_list[i];

                                        s += sge->length;

                                        if (s > qp->max_inline_data) {
                                                ret = -1;
                                                *bad_wr = wr;
                                                goto out;
                                        }

                                        memcpy(wqe, (void *) (uintptr_t) sge->addr,
                                               sge->length);
                                        wqe += sge->length;
                                }

                                seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
                                size += align(s + sizeof *seg, 16) / 16;
                        }
                } else {
                        struct mthca_data_seg *seg;

                        for (i = 0; i < wr->num_sge; ++i) {
                                seg = wqe;
                                seg->byte_count = htonl(wr->sg_list[i].length);
                                seg->lkey = htonl(wr->sg_list[i].lkey);
                                seg->addr = htonll(wr->sg_list[i].addr);
                                wqe += sizeof *seg;
                        }

                        size += wr->num_sge * (sizeof *seg / 16);
                }

                qp->wrid[ind + qp->rq.max] = wr->wr_id;

                if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                ((struct mthca_next_seg *) prev_wqe)->nda_op =
                        htonl(((ind << qp->sq.wqe_shift) +
                               qp->send_wqe_offset) |
                              mthca_opcode[wr->opcode]);
                wmb();
                ((struct mthca_next_seg *) prev_wqe)->ee_nds =
                        htonl(MTHCA_NEXT_DBD | size |
                              ((wr->send_flags & IBV_SEND_FENCE) ?
                               MTHCA_NEXT_FENCE : 0));

                if (!size0) {
                        size0 = size;
                        op0   = mthca_opcode[wr->opcode];
                        f0    = wr->send_flags & IBV_SEND_FENCE ?
                                MTHCA_SEND_DOORBELL_FENCE : 0;
                }

                ++ind;
                if (ind >= qp->sq.max)
                        ind -= qp->sq.max;
        }

out:
        if (nreq) {
                doorbell[0] = htonl((nreq << 24)                  |
                                    ((qp->sq.head & 0xffff) << 8) |
                                    f0 | op0);
                doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

                qp->sq.head += nreq;

                /*
                 * Make sure that descriptors are written before
                 * doorbell record.
                 */
                wmb();
                *qp->sq.db = htonl(qp->sq.head & 0xffff);

                /*
                 * Make sure doorbell record is written before we
                 * write MMIO send doorbell.
                 */
                wmb();
                mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
        }

        pthread_spin_unlock(&qp->sq.lock);
        return ret;
}

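/*
 * Post a chain of receive work requests on an Arbel (mem-free) HCA.
 * No MMIO doorbell is needed: after a memory barrier the new head is
 * simply written into the receive doorbell record, which the HCA
 * reads from memory.
 */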
int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
                          struct ibv_recv_wr **bad_wr)
{
        struct mthca_qp *qp = to_mqp(ibqp);
        int ret = 0;
        int nreq;
        int ind;
        int i;
        void *wqe;

        pthread_spin_lock(&qp->rq.lock);

        /* XXX check that state is OK to post receive */

        ind = qp->rq.head & (qp->rq.max - 1);

        for (nreq = 0; wr; ++nreq, wr = wr->next) {
                if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                wqe = get_recv_wqe(qp, ind);

                ((struct mthca_next_seg *) wqe)->flags = 0;

                wqe += sizeof (struct mthca_next_seg);

                if (wr->num_sge > qp->rq.max_gs) {
                        ret = -1;
                        *bad_wr = wr;
                        goto out;
                }

                for (i = 0; i < wr->num_sge; ++i) {
                        ((struct mthca_data_seg *) wqe)->byte_count =
                                htonl(wr->sg_list[i].length);
                        ((struct mthca_data_seg *) wqe)->lkey =
                                htonl(wr->sg_list[i].lkey);
                        ((struct mthca_data_seg *) wqe)->addr =
                                htonll(wr->sg_list[i].addr);
                        wqe += sizeof (struct mthca_data_seg);
                }

                if (i < qp->rq.max_gs) {
                        ((struct mthca_data_seg *) wqe)->byte_count = 0;
                        ((struct mthca_data_seg *) wqe)->lkey = htonl(MTHCA_INVAL_LKEY);
                        ((struct mthca_data_seg *) wqe)->addr = 0;
                }

                qp->wrid[ind] = wr->wr_id;

                ++ind;
                if (ind >= qp->rq.max)
                        ind -= qp->rq.max;
        }
out:
        if (nreq) {
                qp->rq.head += nreq;

                /*
                 * Make sure that descriptors are written before
                 * doorbell record.
                 */
                wmb();
                *qp->rq.db = htonl(qp->rq.head & 0xffff);
        }

        pthread_spin_unlock(&qp->rq.lock);
        return ret;
}

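/*
 * Allocate and initialize the work queue buffer for a new QP: pick
 * power-of-two WQE strides large enough for the worst-case segments
 * each queue may need, lay out the receive queue followed by the send
 * queue in one buffer, and, on mem-free HCAs, pre-link every WQE's
 * next segment and invalidate its scatter entries.
 */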
int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
                       enum ibv_qp_type type, struct mthca_qp *qp)
{
        int size;
        int max_sq_sge;
        struct mthca_next_seg *next;
        int i;

        qp->rq.max_gs    = cap->max_recv_sge;
        qp->sq.max_gs    = cap->max_send_sge;
        max_sq_sge       = align(cap->max_inline_data + sizeof (struct mthca_inline_seg),
                                 sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg);
        if (max_sq_sge < cap->max_send_sge)
                max_sq_sge = cap->max_send_sge;

        qp->wrid = malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t));
        if (!qp->wrid)
                return -1;

        size = sizeof (struct mthca_next_seg) +
                qp->rq.max_gs * sizeof (struct mthca_data_seg);

        for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
             qp->rq.wqe_shift++)
                ; /* nothing */

        size = max_sq_sge * sizeof (struct mthca_data_seg);
        switch (type) {
        case IBV_QPT_UD:
                size += mthca_is_memfree(pd->context) ?
                        sizeof (struct mthca_arbel_ud_seg) :
                        sizeof (struct mthca_tavor_ud_seg);
                break;

        case IBV_QPT_UC:
                size += sizeof (struct mthca_raddr_seg);
                break;

        case IBV_QPT_RC:
                size += sizeof (struct mthca_raddr_seg);
                /*
                 * An atomic op will require an atomic segment, a
                 * remote address segment and one scatter entry.
                 */
                if (size < (sizeof (struct mthca_atomic_seg) +
                            sizeof (struct mthca_raddr_seg) +
                            sizeof (struct mthca_data_seg)))
                        size = (sizeof (struct mthca_atomic_seg) +
                                sizeof (struct mthca_raddr_seg) +
                                sizeof (struct mthca_data_seg));
                break;

        default:
                break;
        }

        /* Make sure that we have enough space for a bind request */
        if (size < sizeof (struct mthca_bind_seg))
                size = sizeof (struct mthca_bind_seg);

        size += sizeof (struct mthca_next_seg);

        for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
             qp->sq.wqe_shift++)
                ; /* nothing */

        qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift,
                                    1 << qp->sq.wqe_shift);

        qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);

        if (mthca_alloc_buf(&qp->buf,
                            align(qp->buf_size, to_mdev(pd->context->device)->page_size),
                            to_mdev(pd->context->device)->page_size)) {
                free(qp->wrid);
                return -1;
        }

        memset(qp->buf.buf, 0, qp->buf_size);

        if (mthca_is_memfree(pd->context)) {
                struct mthca_data_seg *scatter;
                uint32_t sz;

                sz = htonl((sizeof (struct mthca_next_seg) +
                            qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16);

                for (i = 0; i < qp->rq.max; ++i) {
                        next = get_recv_wqe(qp, i);
                        next->nda_op = htonl(((i + 1) & (qp->rq.max - 1)) <<
                                             qp->rq.wqe_shift);
                        next->ee_nds = sz;

                        for (scatter = (void *) (next + 1);
                             (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
                             ++scatter)
                                scatter->lkey = htonl(MTHCA_INVAL_LKEY);
                }

                for (i = 0; i < qp->sq.max; ++i) {
                        next = get_send_wqe(qp, i);
                        next->nda_op = htonl((((i + 1) & (qp->sq.max - 1)) <<
                                              qp->sq.wqe_shift) +
                                             qp->send_wqe_offset);
                }
        } else {
                for (i = 0; i < qp->rq.max; ++i) {
                        next = get_recv_wqe(qp, i);
                        next->nda_op = htonl((((i + 1) % qp->rq.max) <<
                                              qp->rq.wqe_shift) | 1);
                }
        }

        qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
        qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);

        return 0;
}

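/*
 * The userspace QP table is two-level: the top-level array is indexed
 * by the high bits of (qpn & (num_qps - 1)), and each entry holds a
 * reference-counted chunk of QP pointers that is allocated on first
 * use and freed when its last QP is cleared.
 */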
struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn)
{
        int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

        if (ctx->qp_table[tind].refcnt)
                return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
        else
                return NULL;
}

int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp)
{
        int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

        if (!ctx->qp_table[tind].refcnt) {
                ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
                                                   sizeof (struct mthca_qp *));
                if (!ctx->qp_table[tind].table)
                        return -1;
        }

        ++ctx->qp_table[tind].refcnt;
        ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
        return 0;
}

void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn)
{
        int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

        if (!--ctx->qp_table[tind].refcnt)
                free(ctx->qp_table[tind].table);
        else
                ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}

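/*
 * Used when a completion with error is handled: report whether the
 * failed WQE had its doorbell bit set and compute the address/size
 * word of the next WQE in the chain so the CQ code can continue from
 * there.
 */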
int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
                       int index, int *dbd, uint32_t *new_wqe)
{
        struct mthca_next_seg *next;

        /*
         * For SRQs, all receive WQEs generate a CQE, so we're always
         * at the end of the doorbell chain.
         */
        if (qp->ibv_qp.srq && !is_send) {
                *new_wqe = 0;
                return 0;
        }

        if (is_send)
                next = get_send_wqe(qp, index);
        else
                next = get_recv_wqe(qp, index);

        *dbd = !!(next->ee_nds & htonl(MTHCA_NEXT_DBD));
        if (next->ee_nds & htonl(0x3f))
                *new_wqe = (next->nda_op & htonl(~0x3f)) |
                        (next->ee_nds & htonl(0x3f));
        else
                *new_wqe = 0;

        return 0;
}