/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
46 static const uint32_t mlx4_ib_opcode[] = {
47 [IBV_WR_SEND] = MLX4_OPCODE_SEND,
48 [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM,
49 [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE,
50 [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM,
51 [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ,
52 [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS,
53 [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA,
54 [IBV_WR_LOCAL_INV] = MLX4_OPCODE_LOCAL_INVAL,
55 [IBV_WR_BIND_MW] = MLX4_OPCODE_BIND_MW,
56 [IBV_WR_SEND_WITH_INV] = MLX4_OPCODE_SEND_INVAL,
59 static void *get_recv_wqe(struct mlx4_qp *qp, int n)
61 return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
64 static void *get_send_wqe(struct mlx4_qp *qp, int n)
66 return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
70 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
71 * first four bytes of every 64 byte chunk with 0xffffffff, except for
72 * the very first chunk of the WQE.
74 static void stamp_send_wqe(struct mlx4_qp *qp, int n)
76 uint32_t *wqe = get_send_wqe(qp, n);
78 int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;
80 for (i = 16; i < ds; i += 16)
84 void mlx4_init_qp_indices(struct mlx4_qp *qp)
92 void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
94 struct mlx4_wqe_ctrl_seg *ctrl;
97 for (i = 0; i < qp->sq.wqe_cnt; ++i) {
98 ctrl = get_send_wqe(qp, i);
99 ctrl->owner_opcode = htobe32(1 << 31);
100 ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
102 stamp_send_wqe(qp, i);
106 static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
110 cur = wq->head - wq->tail;
111 if (cur + nreq < wq->max_post)
114 pthread_spin_lock(&cq->lock);
115 cur = wq->head - wq->tail;
116 pthread_spin_unlock(&cq->lock);
118 return cur + nreq >= wq->max_post;
121 static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
123 int acc = wr->bind_mw.bind_info.mw_access_flags;
125 if (acc & IBV_ACCESS_REMOTE_ATOMIC)
126 bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
127 if (acc & IBV_ACCESS_REMOTE_WRITE)
128 bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
129 if (acc & IBV_ACCESS_REMOTE_READ)
130 bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);
133 if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
134 bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
135 if (acc & IBV_ACCESS_ZERO_BASED)
136 bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);
138 bseg->new_rkey = htobe32(wr->bind_mw.rkey);
139 bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
140 bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
141 bseg->length = htobe64(wr->bind_mw.bind_info.length);
144 static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
147 iseg->mem_key = htobe32(rkey);
151 iseg->reserved3[0] = 0;
152 iseg->reserved3[1] = 0;
155 static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
156 uint64_t remote_addr, uint32_t rkey)
158 rseg->raddr = htobe64(remote_addr);
159 rseg->rkey = htobe32(rkey);
163 static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
165 if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
166 aseg->swap_add = htobe64(wr->wr.atomic.swap);
167 aseg->compare = htobe64(wr->wr.atomic.compare_add);
169 aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
175 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
176 struct ibv_send_wr *wr)
178 memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
179 dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
180 dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
181 dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
182 memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
185 static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
187 dseg->byte_count = htobe32(sg->length);
188 dseg->lkey = htobe32(sg->lkey);
189 dseg->addr = htobe64(sg->addr);
192 static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
194 dseg->lkey = htobe32(sg->lkey);
195 dseg->addr = htobe64(sg->addr);
198 * Need a barrier here before writing the byte_count field to
199 * make sure that all the data is visible before the
200 * byte_count field is set. Otherwise, if the segment begins
201 * a new cacheline, the HCA prefetcher could grab the 64-byte
202 * chunk and get a valid (!= * 0xffffffff) byte count but
203 * stale data, and end up sending the wrong data.
205 udma_to_device_barrier();
207 if (likely(sg->length))
208 dseg->byte_count = htobe32(sg->length);
210 dseg->byte_count = htobe32(0x80000000);
213 int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
214 struct ibv_send_wr **bad_wr)
216 struct mlx4_context *ctx;
217 struct mlx4_qp *qp = to_mqp(ibqp);
219 struct mlx4_wqe_ctrl_seg *ctrl = NULL;
227 pthread_spin_lock(&qp->sq.lock);
229 /* XXX check that state is OK to post send */
233 for (nreq = 0; wr; ++nreq, wr = wr->next) {
234 if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
240 if (wr->num_sge > qp->sq.max_gs) {
246 if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
252 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
253 qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
256 (wr->send_flags & IBV_SEND_SIGNALED ?
257 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
258 (wr->send_flags & IBV_SEND_SOLICITED ?
259 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) |
262 if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
263 wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
264 ctrl->imm = wr->imm_data;
269 size = sizeof *ctrl / 16;
271 switch (ibqp->qp_type) {
272 case IBV_QPT_XRC_SEND:
273 ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
277 switch (wr->opcode) {
278 case IBV_WR_ATOMIC_CMP_AND_SWP:
279 case IBV_WR_ATOMIC_FETCH_AND_ADD:
280 set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
282 wqe += sizeof (struct mlx4_wqe_raddr_seg);
284 set_atomic_seg(wqe, wr);
285 wqe += sizeof (struct mlx4_wqe_atomic_seg);
286 size += (sizeof (struct mlx4_wqe_raddr_seg) +
287 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
291 case IBV_WR_RDMA_READ:
294 case IBV_WR_RDMA_WRITE:
295 case IBV_WR_RDMA_WRITE_WITH_IMM:
298 set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
300 wqe += sizeof (struct mlx4_wqe_raddr_seg);
301 size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
304 case IBV_WR_LOCAL_INV:
306 htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
307 set_local_inv_seg(wqe, wr->imm_data);
309 (struct mlx4_wqe_local_inval_seg);
311 (struct mlx4_wqe_local_inval_seg) / 16;
315 htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
316 set_bind_seg(wqe, wr);
318 (struct mlx4_wqe_bind_seg);
320 (struct mlx4_wqe_bind_seg) / 16;
322 case IBV_WR_SEND_WITH_INV:
323 ctrl->imm = htobe32(wr->imm_data);
327 /* No extra segments required for sends */
333 set_datagram_seg(wqe, wr);
334 wqe += sizeof (struct mlx4_wqe_datagram_seg);
335 size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
337 if (wr->send_flags & IBV_SEND_IP_CSUM) {
338 if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
343 ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
344 MLX4_WQE_CTRL_TCP_UDP_CSUM);
348 case IBV_QPT_RAW_PACKET:
349 /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
350 * to indicate that no icrc should be calculated */
351 ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
352 if (wr->send_flags & IBV_SEND_IP_CSUM) {
353 if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
358 ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
359 MLX4_WQE_CTRL_TCP_UDP_CSUM);
367 if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
368 struct mlx4_wqe_inline_seg *seg;
378 off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
382 for (i = 0; i < wr->num_sge; ++i) {
383 addr = (void *) (uintptr_t) wr->sg_list[i].addr;
384 len = wr->sg_list[i].length;
387 if (inl > qp->max_inline_data) {
394 while (len >= MLX4_INLINE_ALIGN - off) {
395 to_copy = MLX4_INLINE_ALIGN - off;
396 memcpy(wqe, addr, to_copy);
401 udma_to_device_barrier(); /* see comment below */
402 seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
410 memcpy(wqe, addr, len);
419 * Need a barrier here to make sure
420 * all the data is visible before the
421 * byte_count field is set. Otherwise
422 * the HCA prefetcher could grab the
423 * 64-byte chunk with this inline
424 * segment and get a valid (!=
425 * 0xffffffff) byte count but stale
426 * data, and end up sending the wrong
429 udma_to_device_barrier();
430 seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
433 size += (inl + num_seg * sizeof * seg + 15) / 16;
435 struct mlx4_wqe_data_seg *seg = wqe;
437 for (i = wr->num_sge - 1; i >= 0 ; --i)
438 set_data_seg(seg + i, wr->sg_list + i);
440 size += wr->num_sge * (sizeof *seg / 16);
443 ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
444 MLX4_WQE_CTRL_FENCE : 0) | size;
447 * Make sure descriptor is fully written before
448 * setting ownership bit (because HW can start
449 * executing as soon as we do).
451 udma_to_device_barrier();
453 ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
454 (ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);
457 * We can improve latency by not stamping the last
458 * send queue WQE until after ringing the doorbell, so
459 * only stamp here if there are still more WQEs to post.
462 stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
463 (qp->sq.wqe_cnt - 1));
469 ctx = to_mctx(ibqp->context);
471 if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
472 ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
474 ctrl->bf_qpn |= qp->doorbell_qpn;
477 * Make sure that descriptor is written to memory
478 * before writing to BlueFlame page.
480 mmio_wc_spinlock(&ctx->bf_lock);
482 mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
483 align(size * 16, 64));
484 /* Flush before toggling bf_offset to be latency oriented */
487 ctx->bf_offset ^= ctx->bf_buf_size;
489 pthread_spin_unlock(&ctx->bf_lock);
494 * Make sure that descriptors are written before
497 udma_to_device_barrier();
499 mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
504 stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
505 (qp->sq.wqe_cnt - 1));
507 pthread_spin_unlock(&qp->sq.lock);
512 int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
513 struct ibv_recv_wr **bad_wr)
515 struct mlx4_qp *qp = to_mqp(ibqp);
516 struct mlx4_wqe_data_seg *scat;
522 pthread_spin_lock(&qp->rq.lock);
524 /* XXX check that state is OK to post receive */
526 ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
528 for (nreq = 0; wr; ++nreq, wr = wr->next) {
529 if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
535 if (wr->num_sge > qp->rq.max_gs) {
541 scat = get_recv_wqe(qp, ind);
543 for (i = 0; i < wr->num_sge; ++i)
544 __set_data_seg(scat + i, wr->sg_list + i);
546 if (i < qp->rq.max_gs) {
547 scat[i].byte_count = 0;
548 scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
552 qp->rq.wrid[ind] = wr->wr_id;
554 ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
562 * Make sure that descriptors are written before
565 udma_to_device_barrier();
567 *qp->db = htobe32(qp->rq.head & 0xffff);
570 pthread_spin_unlock(&qp->rq.lock);
575 static int num_inline_segs(int data, enum ibv_qp_type type)
578 * Inline data segments are not allowed to cross 64 byte
579 * boundaries. For UD QPs, the data segments always start
580 * aligned to 64 bytes (16 byte control segment + 48 byte
581 * datagram segment); for other QPs, there will be a 16 byte
582 * control segment and possibly a 16 byte remote address
583 * segment, so in the worst case there will be only 32 bytes
584 * available for the first data segment.
586 if (type == IBV_QPT_UD)
587 data += (sizeof (struct mlx4_wqe_ctrl_seg) +
588 sizeof (struct mlx4_wqe_datagram_seg)) %
591 data += (sizeof (struct mlx4_wqe_ctrl_seg) +
592 sizeof (struct mlx4_wqe_raddr_seg)) %
595 return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
596 (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
599 void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
605 max_sq_sge = align(cap->max_inline_data +
606 num_inline_segs(cap->max_inline_data, type) *
607 sizeof (struct mlx4_wqe_inline_seg),
608 sizeof (struct mlx4_wqe_data_seg)) /
609 sizeof (struct mlx4_wqe_data_seg);
610 if (max_sq_sge < cap->max_send_sge)
611 max_sq_sge = cap->max_send_sge;
613 size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
616 size += sizeof (struct mlx4_wqe_datagram_seg);
620 size += sizeof (struct mlx4_wqe_raddr_seg);
623 case IBV_QPT_XRC_SEND:
625 size += sizeof (struct mlx4_wqe_raddr_seg);
627 * An atomic op will require an atomic segment, a
628 * remote address segment and one scatter entry.
630 if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
631 sizeof (struct mlx4_wqe_raddr_seg) +
632 sizeof (struct mlx4_wqe_data_seg)))
633 size = (sizeof (struct mlx4_wqe_atomic_seg) +
634 sizeof (struct mlx4_wqe_raddr_seg) +
635 sizeof (struct mlx4_wqe_data_seg));
642 /* Make sure that we have enough space for a bind request */
643 if (size < sizeof (struct mlx4_wqe_bind_seg))
644 size = sizeof (struct mlx4_wqe_bind_seg);
646 size += sizeof (struct mlx4_wqe_ctrl_seg);
648 for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
653 int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
654 enum ibv_qp_type type, struct mlx4_qp *qp)
656 qp->rq.max_gs = cap->max_recv_sge;
658 if (qp->sq.wqe_cnt) {
659 qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
664 if (qp->rq.wqe_cnt) {
665 qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
672 for (qp->rq.wqe_shift = 4;
673 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
677 qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
678 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
679 if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
681 qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
683 qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
688 if (mlx4_alloc_buf(&qp->buf,
689 align(qp->buf_size, to_mdev(context->device)->page_size),
690 to_mdev(context->device)->page_size)) {
696 memset(qp->buf.buf, 0, qp->buf_size);
704 void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
705 enum ibv_qp_type type)
709 wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
712 wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
715 case IBV_QPT_XRC_SEND:
718 wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
725 qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg);
726 cap->max_send_sge = qp->sq.max_gs;
727 qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
728 cap->max_send_wr = qp->sq.max_post;
731 * Inline data segments can't cross a 64 byte boundary. So
732 * subtract off one segment header for each 64-byte chunk,
733 * taking into account the fact that wqe_size will be 32 mod
736 qp->max_inline_data = wqe_size -
737 sizeof (struct mlx4_wqe_inline_seg) *
738 (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
739 cap->max_inline_data = qp->max_inline_data;
742 struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
744 int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
746 if (ctx->qp_table[tind].refcnt)
747 return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
752 int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
754 int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
756 if (!ctx->qp_table[tind].refcnt) {
757 ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
758 sizeof (struct mlx4_qp *));
759 if (!ctx->qp_table[tind].table)
763 ++ctx->qp_table[tind].refcnt;
764 ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
768 void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
770 int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
772 if (!--ctx->qp_table[tind].refcnt)
773 free(ctx->qp_table[tind].table);
775 ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;