/*
 * Copyright (c) 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2007 Cisco, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

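/*
 * Translation from libibverbs work request opcodes to the opcodes the
 * hardware expects in the low byte of the control segment's
 * owner_opcode word.
 */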
static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
};

static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}

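/*
 * Note on the arithmetic above: the low six bits of fence_size hold
 * the WQE size in 16-byte units, so (fence_size & 0x3f) << 2 converts
 * it to a count of 32-bit words, and stepping i by 16 words advances
 * exactly one 64-byte chunk per iteration.
 */
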
void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head = 0;
	qp->sq.tail = 0;
	qp->rq.head = 0;
	qp->rq.tail = 0;
}

void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

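/*
 * The head and tail counters below increase without bound and are only
 * reduced modulo the queue size when indexing into the queue buffer,
 * so head - tail is the number of outstanding WQEs even after the
 * counters wrap (unsigned arithmetic).  For example, head = 0x10002
 * and tail = 0xffff8 gives head - tail = 10 posted WQEs.  The CQ lock
 * is taken for the second check to synchronize with the poll path,
 * which advances wq->tail as completions are reaped.
 */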
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htonll(remote_addr);
	rseg->rkey     = htonl(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htonll(wr->wr.atomic.swap);
		aseg->compare  = htonll(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htonll(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htonl(wr->wr.ud.remote_qpn);
	dseg->qkey = htonl(wr->wr.ud.remote_qkey);
	dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htonl(sg->length);
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = htonl(sg->length);
}

/*
 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
 * implementations may use move-string-buffer assembler instructions,
 * which do not guarantee order of copying.
 */
static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
{
	while (bytecnt > 0) {
		*dst++ = *src++;
		*dst++ = *src++;
		bytecnt -= 2 * sizeof (long);
	}
}

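/*
 * mlx4_bf_copy() assumes bytecnt is a multiple of 2 * sizeof (long);
 * the caller satisfies this by rounding the WQE size up to a 64 byte
 * boundary before copying, so the loop never underflows.
 */
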
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall through */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}

			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

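		/*
		 * The non-inline path above fills the gather list in
		 * reverse so that, within each 64-byte chunk, the
		 * stamp word is overwritten last and a prefetching
		 * HCA never sees a valid byte_count ahead of complete
		 * segment data.
		 */
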
		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}

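/*
 * Doorbell note: small single-WQE posts go out through the BlueFlame
 * page above, which hands the HCA the whole descriptor in one
 * write-combining MMIO burst and saves it the DMA read of the WQE;
 * everything else falls back to a plain 32-bit doorbell write that
 * tells the HCA to fetch the descriptors from memory.
 */
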
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htonl(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db = htonl(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}

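/*
 * Unlike the send side, the receive queue has no BlueFlame fast path:
 * qp->db points at a doorbell record in host memory that the HCA
 * reads, so a single ordered store of the new head is enough.
 */
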
int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

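/*
 * Worked example (assuming MLX4_INLINE_ALIGN == 64 and a 4 byte
 * inline segment header): each 64-byte chunk holds at most 60 bytes
 * of payload.  For a UD QP, (16 + 48) % 64 == 0, so 128 bytes of
 * inline data cost (128 + 59) / 60 = 3 inline segments.
 */
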
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge = align(cap->max_inline_data +
			   num_inline_segs(cap->max_inline_data, type) *
			   sizeof (struct mlx4_wqe_inline_seg),
			   sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		      enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs = cap->max_recv_sge;

	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
	if (!qp->sq.wrid)
		return -1;

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (mlx4_alloc_buf(&qp->buf,
			   align(qp->buf_size, to_mdev(pd->context->device)->page_size),
			   to_mdev(pd->context->device)->page_size)) {
		free(qp->sq.wrid);
		free(qp->rq.wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);

	return 0;
}

void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;
	struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);

	wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
		sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC:
	case IBV_QPT_UC:
	case IBV_QPT_RC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge    = min(ctx->max_sge, qp->sq.max_gs);
	qp->sq.max_post	     = min(ctx->max_qp_wr,
				   qp->sq.wqe_cnt - qp->sq_spare_wqes);
	cap->max_send_wr     = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data  = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}

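/*
 * Worked example for the computation above (RC QP, 128 byte WQEs,
 * assuming a 4 byte inline segment header): wqe_size = 128 - 16
 * (control) - 16 (remote address) = 96; align(96, 64) / 64 = 2
 * chunks, so max_inline_data = 96 - 2 * 4 = 88 bytes.
 */
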
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

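/*
 * The QP table is two-level: the high bits of the QPN (tind) select a
 * lazily allocated second-level array, and the low qp_table_mask bits
 * index into it.  refcnt counts how many QPs live in each second-level
 * array so that the array can be freed when its last QP is cleared.
 */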
int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}

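/*
 * Usage sketch (hypothetical caller, going through the libibverbs API
 * that dispatches to mlx4_post_send): post one signaled SEND of a
 * registered buffer `buf` of length `len` with local key `lkey`.
 *
 *	struct ibv_sge sge = {
 *		.addr   = (uintptr_t) buf,
 *		.length = len,
 *		.lkey   = lkey,
 *	};
 *	struct ibv_send_wr wr = {
 *		.wr_id      = 1,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IBV_WR_SEND,
 *		.send_flags = IBV_SEND_SIGNALED,
 *	};
 *	struct ibv_send_wr *bad_wr;
 *
 *	if (ibv_post_send(qp, &wr, &bad_wr))
 *		;	// handle error; bad_wr points at the failed WR
 */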