1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <config.h>
34
35 #include <stdlib.h>
36 #include <pthread.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <stdio.h>
40
41 #include "mlx5.h"
42 #include "doorbell.h"
43 #include "wqe.h"
44
45 #define MLX5_ATOMIC_SIZE 8
46
47 static const uint32_t mlx5_ib_opcode[] = {
48         [IBV_WR_SEND]                   = MLX5_OPCODE_SEND,
49         [IBV_WR_SEND_WITH_INV]          = MLX5_OPCODE_SEND_INVAL,
50         [IBV_WR_SEND_WITH_IMM]          = MLX5_OPCODE_SEND_IMM,
51         [IBV_WR_RDMA_WRITE]             = MLX5_OPCODE_RDMA_WRITE,
52         [IBV_WR_RDMA_WRITE_WITH_IMM]    = MLX5_OPCODE_RDMA_WRITE_IMM,
53         [IBV_WR_RDMA_READ]              = MLX5_OPCODE_RDMA_READ,
54         [IBV_WR_ATOMIC_CMP_AND_SWP]     = MLX5_OPCODE_ATOMIC_CS,
55         [IBV_WR_ATOMIC_FETCH_AND_ADD]   = MLX5_OPCODE_ATOMIC_FA,
56         [IBV_WR_BIND_MW]                = MLX5_OPCODE_UMR,
57         [IBV_WR_LOCAL_INV]              = MLX5_OPCODE_UMR,
58         [IBV_WR_TSO]                    = MLX5_OPCODE_TSO,
59 };
60
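/*
 * WQE address helpers: rq.wqe_shift is the log2 of the receive WQE stride,
 * so index n maps to byte offset (n << wqe_shift) within the queue buffer.
 */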
61 static void *get_recv_wqe(struct mlx5_qp *qp, int n)
62 {
63         return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
64 }
65
66 static void *get_wq_recv_wqe(struct mlx5_rwq *rwq, int n)
67 {
68         return rwq->pbuff  + (n << rwq->rq.wqe_shift);
69 }
70
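/*
 * copy_to_scat() copies up to *size bytes of payload delivered in the CQE
 * into the user buffers described by a WQE scatter list, returning
 * IBV_WC_LOC_LEN_ERR when the scatter list is too short.  It is the
 * workhorse of the mlx5_copy_to_recv_wqe()/mlx5_copy_to_send_wqe()
 * scatter-to-CQE paths below.
 */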
71 static int copy_to_scat(struct mlx5_wqe_data_seg *scat, void *buf, int *size,
72                          int max)
73 {
74         int copy;
75         int i;
76
77         if (unlikely(!(*size)))
78                 return IBV_WC_SUCCESS;
79
80         for (i = 0; i < max; ++i) {
81                 copy = min_t(long, *size, be32toh(scat->byte_count));
82                 memcpy((void *)(unsigned long)be64toh(scat->addr), buf, copy);
83                 *size -= copy;
84                 if (*size == 0)
85                         return IBV_WC_SUCCESS;
86
87                 buf += copy;
88                 ++scat;
89         }
90         return IBV_WC_LOC_LEN_ERR;
91 }
92
93 int mlx5_copy_to_recv_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
94 {
95         struct mlx5_wqe_data_seg *scat;
96         int max = 1 << (qp->rq.wqe_shift - 4);
97
98         scat = get_recv_wqe(qp, idx);
99         if (unlikely(qp->wq_sig))
100                 ++scat;
101
102         return copy_to_scat(scat, buf, &size, max);
103 }
104
105 int mlx5_copy_to_send_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
106 {
107         struct mlx5_wqe_ctrl_seg *ctrl;
108         struct mlx5_wqe_data_seg *scat;
109         void *p;
110         int max;
111
112         idx &= (qp->sq.wqe_cnt - 1);
113         ctrl = mlx5_get_send_wqe(qp, idx);
114         if (qp->ibv_qp->qp_type != IBV_QPT_RC) {
115                 fprintf(stderr, "scatter to CQE is supported only for RC QPs\n");
116                 return IBV_WC_GENERAL_ERR;
117         }
118         p = ctrl + 1;
119
120         switch (be32toh(ctrl->opmod_idx_opcode) & 0xff) {
121         case MLX5_OPCODE_RDMA_READ:
122                 p = p + sizeof(struct mlx5_wqe_raddr_seg);
123                 break;
124
125         case MLX5_OPCODE_ATOMIC_CS:
126         case MLX5_OPCODE_ATOMIC_FA:
127                 p = p + sizeof(struct mlx5_wqe_raddr_seg) +
128                         sizeof(struct mlx5_wqe_atomic_seg);
129                 break;
130
131         default:
132                 fprintf(stderr, "scatter to CQE for opcode %d\n",
133                         be32toh(ctrl->opmod_idx_opcode) & 0xff);
134                 return IBV_WC_REM_INV_REQ_ERR;
135         }
136
137         scat = p;
138         max = (be32toh(ctrl->qpn_ds) & 0x3F) - (((void *)scat - (void *)ctrl) >> 4);
139         if (unlikely((void *)(scat + max) > qp->sq.qend)) {
140                 int tmp = ((void *)qp->sq.qend - (void *)scat) >> 4;
141                 int orig_size = size;
142
143                 if (copy_to_scat(scat, buf, &size, tmp) == IBV_WC_SUCCESS)
144                         return IBV_WC_SUCCESS;
145                 max = max - tmp;
146                 buf += orig_size - size;
147                 scat = mlx5_get_send_wqe(qp, 0);
148         }
149
150         return copy_to_scat(scat, buf, &size, max);
151 }
152
153 void *mlx5_get_send_wqe(struct mlx5_qp *qp, int n)
154 {
155         return qp->sq_start + (n << MLX5_SEND_WQE_SHIFT);
156 }
157
158 void mlx5_init_rwq_indices(struct mlx5_rwq *rwq)
159 {
160         rwq->rq.head     = 0;
161         rwq->rq.tail     = 0;
162 }
163
164 void mlx5_init_qp_indices(struct mlx5_qp *qp)
165 {
166         qp->sq.head      = 0;
167         qp->sq.tail      = 0;
168         qp->rq.head      = 0;
169         qp->rq.tail      = 0;
170         qp->sq.cur_post  = 0;
171 }
172
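/*
 * wq->head and wq->tail are free-running counters, so their unsigned
 * difference is the number of outstanding WQEs even after wrap-around.
 * The CQ lock is taken only when the lock-free check fails, to re-read a
 * stable tail before declaring overflow.
 */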
173 static int mlx5_wq_overflow(struct mlx5_wq *wq, int nreq, struct mlx5_cq *cq)
174 {
175         unsigned cur;
176
177         cur = wq->head - wq->tail;
178         if (cur + nreq < wq->max_post)
179                 return 0;
180
181         mlx5_spin_lock(&cq->lock);
182         cur = wq->head - wq->tail;
183         mlx5_spin_unlock(&cq->lock);
184
185         return cur + nreq >= wq->max_post;
186 }
187
188 static inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
189                                  uint64_t remote_addr, uint32_t rkey)
190 {
191         rseg->raddr    = htobe64(remote_addr);
192         rseg->rkey     = htobe32(rkey);
193         rseg->reserved = 0;
194 }
195
196 static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg,
197                            enum ibv_wr_opcode   opcode,
198                            uint64_t swap,
199                            uint64_t compare_add)
200 {
201         if (opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
202                 aseg->swap_add = htobe64(swap);
203                 aseg->compare  = htobe64(compare_add);
204         } else {
205                 aseg->swap_add = htobe64(compare_add);
206         }
207 }
208
209 static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
210                              struct ibv_send_wr *wr)
211 {
212         memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof dseg->av);
213         dseg->av.dqp_dct = htobe32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
214         dseg->av.key.qkey.qkey = htobe32(wr->wr.ud.remote_qkey);
215 }
216
217 static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ibv_sge *sg,
218                              int offset)
219 {
220         dseg->byte_count = htobe32(sg->length - offset);
221         dseg->lkey       = htobe32(sg->lkey);
222         dseg->addr       = htobe64(sg->addr + offset);
223 }
224
225 static void set_data_ptr_seg_atomic(struct mlx5_wqe_data_seg *dseg,
226                                     struct ibv_sge *sg)
227 {
228         dseg->byte_count = htobe32(MLX5_ATOMIC_SIZE);
229         dseg->lkey       = htobe32(sg->lkey);
230         dseg->addr       = htobe64(sg->addr);
231 }
232
233 /*
234  * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
235  * implementations may use move-string-buffer assembler instructions,
236  * which do not guarantee order of copying.
237  */
238 static void mlx5_bf_copy(unsigned long long *dst, unsigned long long *src,
239                          unsigned bytecnt, struct mlx5_qp *qp)
240 {
241         while (bytecnt > 0) {
242                 *dst++ = *src++;
243                 *dst++ = *src++;
244                 *dst++ = *src++;
245                 *dst++ = *src++;
246                 *dst++ = *src++;
247                 *dst++ = *src++;
248                 *dst++ = *src++;
249                 *dst++ = *src++;
250                 bytecnt -= 8 * sizeof(unsigned long long);
251                 if (unlikely(src == qp->sq.qend))
252                         src = qp->sq_start;
253         }
254 }
255
256 static uint32_t send_ieth(struct ibv_send_wr *wr)
257 {
258         switch (wr->opcode) {
259         case IBV_WR_SEND_WITH_IMM:
260         case IBV_WR_RDMA_WRITE_WITH_IMM:
261                 return wr->imm_data;
262         case IBV_WR_SEND_WITH_INV:
263                 return htobe32(wr->imm_data);
264         default:
265                 return 0;
266         }
267 }
268
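/*
 * set_data_inl_seg() embeds the payload itself in the WQE: an inline
 * header word carries the byte count tagged with MLX5_INLINE_SEG, followed
 * by the copied data (wrapping at the end of the SQ buffer if needed).
 * *sz returns the resulting segment size in 16-byte units.
 */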
269 static int set_data_inl_seg(struct mlx5_qp *qp, struct ibv_send_wr *wr,
270                             void *wqe, int *sz,
271                             struct mlx5_sg_copy_ptr *sg_copy_ptr)
272 {
273         struct mlx5_wqe_inline_seg *seg;
274         void *addr;
275         int len;
276         int i;
277         int inl = 0;
278         void *qend = qp->sq.qend;
279         int copy;
280         int offset = sg_copy_ptr->offset;
281
282         seg = wqe;
283         wqe += sizeof *seg;
284         for (i = sg_copy_ptr->index; i < wr->num_sge; ++i) {
285                 addr = (void *) (unsigned long)(wr->sg_list[i].addr + offset);
286                 len  = wr->sg_list[i].length - offset;
287                 inl += len;
288                 offset = 0;
289
290                 if (unlikely(inl > qp->max_inline_data))
291                         return ENOMEM;
292
293                 if (unlikely(wqe + len > qend)) {
294                         copy = qend - wqe;
295                         memcpy(wqe, addr, copy);
296                         addr += copy;
297                         len -= copy;
298                         wqe = mlx5_get_send_wqe(qp, 0);
299                 }
300                 memcpy(wqe, addr, len);
301                 wqe += len;
302         }
303
304         if (likely(inl)) {
305                 seg->byte_count = htobe32(inl | MLX5_INLINE_SEG);
306                 *sz = align(inl + sizeof seg->byte_count, 16) / 16;
307         } else
308                 *sz = 0;
309
310         return 0;
311 }
312
313 static uint8_t wq_sig(struct mlx5_wqe_ctrl_seg *ctrl)
314 {
315         return calc_sig(ctrl, be32toh(ctrl->qpn_ds));
316 }
317
318 #ifdef MLX5_DEBUG
319 static void dump_wqe(FILE *fp, int idx, int size_16, struct mlx5_qp *qp)
320 {
321         uint32_t *p = NULL;
322         int i, j;
323         int tidx = idx;
324
325         fprintf(fp, "dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
326         for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
327                 if ((i & 0xf) == 0) {
328                         void *buf = mlx5_get_send_wqe(qp, tidx);
329                         tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
330                         p = buf;
331                         j = 0;
332                 }
333                 fprintf(fp, "%08x %08x %08x %08x\n", be32toh(p[j]), be32toh(p[j + 1]),
334                         be32toh(p[j + 2]), be32toh(p[j + 3]));
335         }
336 }
337 #endif /* MLX5_DEBUG */
338
339
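/*
 * mlx5_get_atomic_laddr() walks past the ctrl, raddr and atomic segments of
 * the send WQE at idx and returns the local buffer address recorded in its
 * data segment, together with the (currently fixed) atomic byte count.
 */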
340 void *mlx5_get_atomic_laddr(struct mlx5_qp *qp, uint16_t idx, int *byte_count)
341 {
342         struct mlx5_wqe_data_seg *dpseg;
343         void *addr;
344
345         dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
346                 sizeof(struct mlx5_wqe_raddr_seg) +
347                 sizeof(struct mlx5_wqe_atomic_seg);
348         addr = (void *)(unsigned long)be64toh(dpseg->addr);
349
350         /*
351          * Currently the byte count is always 8 bytes. Update this once
352          * variable-sized atomics are supported.
353          */
354         *byte_count = 8;
355         return addr;
356 }
357
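/*
 * copy_eth_inline_headers() always inlines the first
 * MLX5_ETH_L2_INLINE_HEADER_SIZE bytes of the packet into the eth segment,
 * gathering them from several sges when the first one is short.
 * sg_copy_ptr tells the caller at which sge/offset to resume building
 * regular data segments.
 */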
358 static inline int copy_eth_inline_headers(struct ibv_qp *ibqp,
359                                           struct ibv_send_wr *wr,
360                                           struct mlx5_wqe_eth_seg *eseg,
361                                           struct mlx5_sg_copy_ptr *sg_copy_ptr)
362 {
363         uint32_t inl_hdr_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;
364         int inl_hdr_copy_size = 0;
365         int j = 0;
366         FILE *fp = to_mctx(ibqp->context)->dbg_fp;
367
368         if (unlikely(wr->num_sge < 1)) {
369                 mlx5_dbg(fp, MLX5_DBG_QP_SEND, "illegal num_sge: %d, minimum is 1\n",
370                          wr->num_sge);
371                 return EINVAL;
372         }
373
374         if (likely(wr->sg_list[0].length >= MLX5_ETH_L2_INLINE_HEADER_SIZE)) {
375                 inl_hdr_copy_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;
376                 memcpy(eseg->inline_hdr_start,
377                        (void *)(uintptr_t)wr->sg_list[0].addr,
378                        inl_hdr_copy_size);
379         } else {
380                 for (j = 0; j < wr->num_sge && inl_hdr_size > 0; ++j) {
381                         inl_hdr_copy_size = min(wr->sg_list[j].length,
382                                                 inl_hdr_size);
383                         memcpy(eseg->inline_hdr_start +
384                                (MLX5_ETH_L2_INLINE_HEADER_SIZE - inl_hdr_size),
385                                (void *)(uintptr_t)wr->sg_list[j].addr,
386                                inl_hdr_copy_size);
387                         inl_hdr_size -= inl_hdr_copy_size;
388                 }
389                 if (unlikely(inl_hdr_size)) {
390                         mlx5_dbg(fp, MLX5_DBG_QP_SEND, "sg list is shorter than the inline Ethernet header\n");
391                         return EINVAL;
392                 }
393                 --j;
394         }
395
396
397         eseg->inline_hdr_sz = htobe16(MLX5_ETH_L2_INLINE_HEADER_SIZE);
398
399         /* If the entire sge was copied into the inline headers, start
400          * copying from the next sge into the data segment.
401          */
402         if (unlikely(wr->sg_list[j].length == inl_hdr_copy_size)) {
403                 ++j;
404                 inl_hdr_copy_size = 0;
405         }
406
407         sg_copy_ptr->index = j;
408         sg_copy_ptr->offset = inl_hdr_copy_size;
409
410         return 0;
411 }
412
413 #undef  ALIGN
414 #define ALIGN(x, log_a) ((((x) + (1 << (log_a)) - 1)) & ~((1 << (log_a)) - 1))
415
416 static inline uint16_t get_klm_octo(int nentries)
417 {
418         return htobe16(ALIGN(nentries, 3) / 2);
419 }
420
421 static void set_umr_data_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
422                              int32_t rkey, struct ibv_mw_bind_info *bind_info,
423                              uint32_t qpn, void **seg, int *size)
424 {
425         union {
426                 struct mlx5_wqe_umr_klm_seg     klm;
427                 uint8_t                         reserved[64];
428         } *data = *seg;
429
430         data->klm.byte_count = htobe32(bind_info->length);
431         data->klm.mkey = htobe32(bind_info->mr->lkey);
432         data->klm.address = htobe64(bind_info->addr);
433
434         memset(&data->klm + 1, 0, sizeof(data->reserved) -
435                sizeof(data->klm));
436
437         *seg += sizeof(*data);
438         *size += (sizeof(*data) / 16);
439 }
440
441 static void set_umr_mkey_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
442                              int32_t rkey, struct ibv_mw_bind_info *bind_info,
443                              uint32_t qpn, void **seg, int *size)
444 {
445         struct mlx5_wqe_mkey_context_seg        *mkey = *seg;
446
447         mkey->qpn_mkey = htobe32((rkey & 0xFF) |
448                                    ((type == IBV_MW_TYPE_1 || !bind_info->length) ?
449                                     0xFFFFFF00 : qpn << 8));
450         if (bind_info->length) {
451                 /* Local read is set in kernel */
452                 mkey->access_flags = 0;
453                 mkey->free = 0;
454                 if (bind_info->mw_access_flags & IBV_ACCESS_LOCAL_WRITE)
455                         mkey->access_flags |=
456                                 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE;
457                 if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_WRITE)
458                         mkey->access_flags |=
459                                 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_WRITE;
460                 if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_READ)
461                         mkey->access_flags |=
462                                 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_READ;
463                 if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_ATOMIC)
464                         mkey->access_flags |=
465                                 MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_ATOMIC;
466                 if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED)
467                         mkey->start_addr = 0;
468                 else
469                         mkey->start_addr = htobe64(bind_info->addr);
470                 mkey->len = htobe64(bind_info->length);
471         } else {
472                 mkey->free = MLX5_WQE_MKEY_CONTEXT_FREE;
473         }
474
475         *seg += sizeof(struct mlx5_wqe_mkey_context_seg);
476         *size += (sizeof(struct mlx5_wqe_mkey_context_seg) / 16);
477 }
478
479 static inline void set_umr_control_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
480                                        int32_t rkey, struct ibv_mw_bind_info *bind_info,
481                                        uint32_t qpn, void **seg, int *size)
482 {
483         struct mlx5_wqe_umr_ctrl_seg            *ctrl = *seg;
484
485         ctrl->flags = MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET |
486                 MLX5_WQE_UMR_CTRL_FLAG_INLINE;
487         ctrl->mkey_mask = htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_FREE |
488                                      MLX5_WQE_UMR_CTRL_MKEY_MASK_MKEY);
489         ctrl->translation_offset = 0;
490         memset(ctrl->rsvd0, 0, sizeof(ctrl->rsvd0));
491         memset(ctrl->rsvd1, 0, sizeof(ctrl->rsvd1));
492
493         if (type == IBV_MW_TYPE_2)
494                 ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_QPN);
495
496         if (bind_info->length) {
497                 ctrl->klm_octowords = get_klm_octo(1);
498                 if (type == IBV_MW_TYPE_2)
499                         ctrl->flags |=  MLX5_WQE_UMR_CTRL_FLAG_CHECK_FREE;
500                 ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN      |
501                                               MLX5_WQE_UMR_CTRL_MKEY_MASK_START_ADDR |
502                                               MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE |
503                                               MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_READ |
504                                               MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_WRITE |
505                                               MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_ATOMIC);
506         } else {
507                 ctrl->klm_octowords = get_klm_octo(0);
508                 if (type == IBV_MW_TYPE_2)
509                         ctrl->flags |= MLX5_WQE_UMR_CTRL_FLAG_CHECK_QPN;
510         }
511
512         *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
513         *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
514 }
515
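/*
 * set_bind_wr() emits the UMR-based WQE used for memory-window bind and
 * local invalidate: a UMR control segment, a mkey context segment and, for
 * a non-zero length bind, a KLM data segment.  Each segment may wrap at the
 * end of the SQ buffer.
 */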
516 static inline int set_bind_wr(struct mlx5_qp *qp, enum ibv_mw_type type,
517                               int32_t rkey, struct ibv_mw_bind_info *bind_info,
518                               uint32_t qpn, void **seg, int *size)
519 {
520         void *qend = qp->sq.qend;
521
522 #ifdef MW_DEBUG
523         if (bind_info->mw_access_flags &
524             ~(IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ |
525              IBV_ACCESS_REMOTE_WRITE))
526                 return EINVAL;
527
528         if (bind_info->mr &&
529             (bind_info->mr->addr > (void *)bind_info->addr ||
530              bind_info->mr->addr + bind_info->mr->length <
531              (void *)bind_info->addr + bind_info->length ||
532              !(to_mmr(bind_info->mr)->alloc_flags &  IBV_ACCESS_MW_BIND) ||
533              (bind_info->mw_access_flags &
534               (IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_WRITE) &&
535               !(to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_LOCAL_WRITE))))
536                 return EINVAL;
537
538 #endif
539
540         /* lengths above 2GB are rejected since KLM supports only 2GB */
541         if (bind_info->length > 1UL << 31)
542                 return EOPNOTSUPP;
543
544         set_umr_control_seg(qp, type, rkey, bind_info, qpn, seg, size);
545         if (unlikely((*seg == qend)))
546                 *seg = mlx5_get_send_wqe(qp, 0);
547
548         set_umr_mkey_seg(qp, type, rkey, bind_info, qpn, seg, size);
549         if (!bind_info->length)
550                 return 0;
551
552         if (unlikely((*seg == qend)))
553                 *seg = mlx5_get_send_wqe(qp, 0);
554
555         set_umr_data_seg(qp, type, rkey, bind_info, qpn, seg, size);
556         return 0;
557 }
558
559 /* Copy the TSO header into the eth segment, accounting for padding and
560  * WQE wrap-around in the WQ buffer.
561  */
562 static inline int set_tso_eth_seg(void **seg, struct ibv_send_wr *wr,
563                                    void *qend, struct mlx5_qp *qp, int *size)
564 {
565         struct mlx5_wqe_eth_seg *eseg = *seg;
566         int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
567         uint64_t left, left_len, copy_sz;
568         void *pdata = wr->tso.hdr;
569         FILE *fp = to_mctx(qp->ibv_qp->context)->dbg_fp;
570
571         if (unlikely(wr->tso.hdr_sz < MLX5_ETH_L2_MIN_HEADER_SIZE ||
572                      wr->tso.hdr_sz > qp->max_tso_header)) {
573                 mlx5_dbg(fp, MLX5_DBG_QP_SEND,
574                          "TSO header size should be at least %d and at most %d\n",
575                          MLX5_ETH_L2_MIN_HEADER_SIZE,
576                          qp->max_tso_header);
577                 return EINVAL;
578         }
579
580         left = wr->tso.hdr_sz;
581         eseg->mss = htobe16(wr->tso.mss);
582         eseg->inline_hdr_sz = htobe16(wr->tso.hdr_sz);
583
584         /* If there is enough room before the end of the queue, copy the
585          * header in one shot; otherwise copy up to the end of the queue,
586          * wrap around and copy the remainder.
587          */
588         left_len = qend - (void *)eseg->inline_hdr_start;
589         copy_sz = min(left_len, left);
590
591         memcpy(eseg->inline_hdr_start, pdata, copy_sz);
592
593         /* The -1 avoids double counting: 16 bytes of header space
594          * (eseg->inline_hdr[16]) are already part of the eth segment.
595          */
596         *seg += align(copy_sz - size_of_inl_hdr_start, 16) - 16;
597         *size += align(copy_sz - size_of_inl_hdr_start, 16) / 16 - 1;
598
599         /* The header hit the end of the queue; wrap and copy the rest */
600         if (unlikely(copy_sz < left)) {
601                 *seg = mlx5_get_send_wqe(qp, 0);
602                 left -= copy_sz;
603                 pdata += copy_sz;
604                 memcpy(*seg, pdata, left);
605                 *seg += align(left, 16);
606                 *size += align(left, 16) / 16;
607         }
608
609         return 0;
610 }
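
/*
 * Illustrative caller-side sketch (not part of the original file): posting
 * a TSO work request on a raw packet QP through the standard libibverbs
 * API.  "hdrs", "hdr_len", "payload_sge" and "qp" are placeholders and
 * error handling is omitted; hdr_sz must satisfy the bounds checked in
 * set_tso_eth_seg() above.
 *
 *	struct ibv_send_wr wr = {
 *		.opcode     = IBV_WR_TSO,
 *		.sg_list    = &payload_sge,
 *		.num_sge    = 1,
 *		.send_flags = IBV_SEND_SIGNALED,
 *		.tso = {
 *			.hdr    = hdrs,
 *			.hdr_sz = hdr_len,
 *			.mss    = 1460,
 *		},
 *	};
 *	struct ibv_send_wr *bad_wr;
 *	int ret = ibv_post_send(qp, &wr, &bad_wr);
 */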
611
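/*
 * _mlx5_post_send() builds one WQE per work request: a control segment,
 * transport-specific segments (XRC, remote address, atomic, UMR, datagram
 * or eth), then either an inline data segment or data pointer segments.
 * After the loop it updates the doorbell record and either copies the WQE
 * to the BlueFlame buffer (single, small request) or rings the doorbell
 * via mlx5_write64().
 */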
612 static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
613                                   struct ibv_send_wr **bad_wr)
614 {
615         struct mlx5_context *ctx;
616         struct mlx5_qp *qp = to_mqp(ibqp);
617         void *seg;
618         struct mlx5_wqe_eth_seg *eseg;
619         struct mlx5_wqe_ctrl_seg *ctrl = NULL;
620         struct mlx5_wqe_data_seg *dpseg;
621         struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0};
622         int nreq;
623         int inl = 0;
624         int err = 0;
625         int size = 0;
626         int i;
627         unsigned idx;
628         uint8_t opmod = 0;
629         struct mlx5_bf *bf = qp->bf;
630         void *qend = qp->sq.qend;
631         uint32_t mlx5_opcode;
632         struct mlx5_wqe_xrc_seg *xrc;
633         uint8_t fence;
634         uint8_t next_fence;
635         uint32_t max_tso = 0;
636         FILE *fp = to_mctx(ibqp->context)->dbg_fp; /* only used by mlx5_dbg(); optimized away in non-debug builds */
637
638         mlx5_spin_lock(&qp->sq.lock);
639
640         next_fence = qp->fm_cache;
641
642         for (nreq = 0; wr; ++nreq, wr = wr->next) {
643                 if (unlikely(wr->opcode < 0 ||
644                     wr->opcode >= sizeof mlx5_ib_opcode / sizeof mlx5_ib_opcode[0])) {
645                         mlx5_dbg(fp, MLX5_DBG_QP_SEND, "bad opcode %d\n", wr->opcode);
646                         err = EINVAL;
647                         *bad_wr = wr;
648                         goto out;
649                 }
650
651                 if (unlikely(mlx5_wq_overflow(&qp->sq, nreq,
652                                               to_mcq(qp->ibv_qp->send_cq)))) {
653                         mlx5_dbg(fp, MLX5_DBG_QP_SEND, "work queue overflow\n");
654                         err = ENOMEM;
655                         *bad_wr = wr;
656                         goto out;
657                 }
658
659                 if (unlikely(wr->num_sge > qp->sq.max_gs)) {
660                         mlx5_dbg(fp, MLX5_DBG_QP_SEND, "max gs exceeded %d (max = %d)\n",
661                                  wr->num_sge, qp->sq.max_gs);
662                         err = ENOMEM;
663                         *bad_wr = wr;
664                         goto out;
665                 }
666
667                 if (wr->send_flags & IBV_SEND_FENCE)
668                         fence = MLX5_WQE_CTRL_FENCE;
669                 else
670                         fence = next_fence;
671                 next_fence = 0;
672                 idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
673                 ctrl = seg = mlx5_get_send_wqe(qp, idx);
674                 *(uint32_t *)(seg + 8) = 0;
675                 ctrl->imm = send_ieth(wr);
676                 ctrl->fm_ce_se = qp->sq_signal_bits | fence |
677                         (wr->send_flags & IBV_SEND_SIGNALED ?
678                          MLX5_WQE_CTRL_CQ_UPDATE : 0) |
679                         (wr->send_flags & IBV_SEND_SOLICITED ?
680                          MLX5_WQE_CTRL_SOLICITED : 0);
681
682                 seg += sizeof *ctrl;
683                 size = sizeof *ctrl / 16;
684
685                 switch (ibqp->qp_type) {
686                 case IBV_QPT_XRC_SEND:
687                         if (unlikely(wr->opcode != IBV_WR_BIND_MW &&
688                                      wr->opcode != IBV_WR_LOCAL_INV)) {
689                                 xrc = seg;
690                                 xrc->xrc_srqn = htobe32(wr->qp_type.xrc.remote_srqn);
691                                 seg += sizeof(*xrc);
692                                 size += sizeof(*xrc) / 16;
693                         }
694                         /* fall through */
695                 case IBV_QPT_RC:
696                         switch (wr->opcode) {
697                         case IBV_WR_RDMA_READ:
698                         case IBV_WR_RDMA_WRITE:
699                         case IBV_WR_RDMA_WRITE_WITH_IMM:
700                                 set_raddr_seg(seg, wr->wr.rdma.remote_addr,
701                                               wr->wr.rdma.rkey);
702                                 seg  += sizeof(struct mlx5_wqe_raddr_seg);
703                                 size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
704                                 break;
705
706                         case IBV_WR_ATOMIC_CMP_AND_SWP:
707                         case IBV_WR_ATOMIC_FETCH_AND_ADD:
708                                 if (unlikely(!qp->atomics_enabled)) {
709                                         mlx5_dbg(fp, MLX5_DBG_QP_SEND, "atomic operations are not supported\n");
710                                         err = ENOSYS;
711                                         *bad_wr = wr;
712                                         goto out;
713                                 }
714                                 set_raddr_seg(seg, wr->wr.atomic.remote_addr,
715                                               wr->wr.atomic.rkey);
716                                 seg  += sizeof(struct mlx5_wqe_raddr_seg);
717
718                                 set_atomic_seg(seg, wr->opcode,
719                                                wr->wr.atomic.swap,
720                                                wr->wr.atomic.compare_add);
721                                 seg  += sizeof(struct mlx5_wqe_atomic_seg);
722
723                                 size += (sizeof(struct mlx5_wqe_raddr_seg) +
724                                          sizeof(struct mlx5_wqe_atomic_seg)) / 16;
725                                 break;
726
727                         case IBV_WR_BIND_MW:
728                                 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
729                                 ctrl->imm = htobe32(wr->bind_mw.mw->rkey);
730                                 err = set_bind_wr(qp, wr->bind_mw.mw->type,
731                                                   wr->bind_mw.rkey,
732                                                   &wr->bind_mw.bind_info,
733                                                   ibqp->qp_num, &seg, &size);
734                                 if (err) {
735                                         *bad_wr = wr;
736                                         goto out;
737                                 }
738
739                                 qp->sq.wr_data[idx] = IBV_WC_BIND_MW;
740                                 break;
741                         case IBV_WR_LOCAL_INV: {
742                                 struct ibv_mw_bind_info bind_info = {};
743
744                                 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
745                                 ctrl->imm = htobe32(wr->imm_data);
746                                 err = set_bind_wr(qp, IBV_MW_TYPE_2, 0,
747                                                   &bind_info, ibqp->qp_num,
748                                                   &seg, &size);
749                                 if (err) {
750                                         *bad_wr = wr;
751                                         goto out;
752                                 }
753
754                                 qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV;
755                                 break;
756                         }
757
758                         default:
759                                 break;
760                         }
761                         break;
762
763                 case IBV_QPT_UC:
764                         switch (wr->opcode) {
765                         case IBV_WR_RDMA_WRITE:
766                         case IBV_WR_RDMA_WRITE_WITH_IMM:
767                                 set_raddr_seg(seg, wr->wr.rdma.remote_addr,
768                                               wr->wr.rdma.rkey);
769                                 seg  += sizeof(struct mlx5_wqe_raddr_seg);
770                                 size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
771                                 break;
772                         case IBV_WR_BIND_MW:
773                                 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
774                                 ctrl->imm = htobe32(wr->bind_mw.mw->rkey);
775                                 err = set_bind_wr(qp, wr->bind_mw.mw->type,
776                                                   wr->bind_mw.rkey,
777                                                   &wr->bind_mw.bind_info,
778                                                   ibqp->qp_num, &seg, &size);
779                                 if (err) {
780                                         *bad_wr = wr;
781                                         goto out;
782                                 }
783
784                                 qp->sq.wr_data[idx] = IBV_WC_BIND_MW;
785                                 break;
786                         case IBV_WR_LOCAL_INV: {
787                                 struct ibv_mw_bind_info bind_info = {};
788
789                                 next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
790                                 ctrl->imm = htobe32(wr->imm_data);
791                                 err = set_bind_wr(qp, IBV_MW_TYPE_2, 0,
792                                                   &bind_info, ibqp->qp_num,
793                                                   &seg, &size);
794                                 if (err) {
795                                         *bad_wr = wr;
796                                         goto out;
797                                 }
798
799                                 qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV;
800                                 break;
801                         }
802
803                         default:
804                                 break;
805                         }
806                         break;
807
808                 case IBV_QPT_UD:
809                         set_datagram_seg(seg, wr);
810                         seg  += sizeof(struct mlx5_wqe_datagram_seg);
811                         size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
812                         if (unlikely((seg == qend)))
813                                 seg = mlx5_get_send_wqe(qp, 0);
814                         break;
815
816                 case IBV_QPT_RAW_PACKET:
817                         memset(seg, 0, sizeof(struct mlx5_wqe_eth_seg));
818                         eseg = seg;
819
820                         if (wr->send_flags & IBV_SEND_IP_CSUM) {
821                                 if (!(qp->qp_cap_cache & MLX5_CSUM_SUPPORT_RAW_OVER_ETH)) {
822                                         err = EINVAL;
823                                         *bad_wr = wr;
824                                         goto out;
825                                 }
826
827                                 eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
828                         }
829
830                         if (wr->opcode == IBV_WR_TSO) {
831                                 max_tso = qp->max_tso;
832                                 err = set_tso_eth_seg(&seg, wr, qend, qp, &size);
833                                 if (unlikely(err)) {
834                                         *bad_wr = wr;
835                                         goto out;
836                                 }
837                         } else {
838                                 err = copy_eth_inline_headers(ibqp, wr, seg, &sg_copy_ptr);
839                                 if (unlikely(err)) {
840                                         *bad_wr = wr;
841                                         mlx5_dbg(fp, MLX5_DBG_QP_SEND,
842                                                  "copy_eth_inline_headers failed, err: %d\n",
843                                                  err);
844                                         goto out;
845                                 }
846                         }
847
848                         seg += sizeof(struct mlx5_wqe_eth_seg);
849                         size += sizeof(struct mlx5_wqe_eth_seg) / 16;
850                         break;
851
852                 default:
853                         break;
854                 }
855
856                 if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
857                         int sz = 0;
858
859                         err = set_data_inl_seg(qp, wr, seg, &sz, &sg_copy_ptr);
860                         if (unlikely(err)) {
861                                 *bad_wr = wr;
862                                 mlx5_dbg(fp, MLX5_DBG_QP_SEND,
863                                          "inline layout failed, err %d\n", err);
864                                 goto out;
865                         }
866                         inl = 1;
867                         size += sz;
868                 } else {
869                         dpseg = seg;
870                         for (i = sg_copy_ptr.index; i < wr->num_sge; ++i) {
871                                 if (unlikely(dpseg == qend)) {
872                                         seg = mlx5_get_send_wqe(qp, 0);
873                                         dpseg = seg;
874                                 }
875                                 if (likely(wr->sg_list[i].length)) {
876                                         if (unlikely(wr->opcode ==
877                                                    IBV_WR_ATOMIC_CMP_AND_SWP ||
878                                                    wr->opcode ==
879                                                    IBV_WR_ATOMIC_FETCH_AND_ADD))
880                                                 set_data_ptr_seg_atomic(dpseg, wr->sg_list + i);
881                                         else {
882                                                 if (unlikely(wr->opcode == IBV_WR_TSO)) {
883                                                         if (max_tso < wr->sg_list[i].length) {
884                                                                 err = EINVAL;
885                                                                 *bad_wr = wr;
886                                                                 goto out;
887                                                         }
888                                                         max_tso -= wr->sg_list[i].length;
889                                                 }
890                                                 set_data_ptr_seg(dpseg, wr->sg_list + i,
891                                                                  sg_copy_ptr.offset);
892                                         }
893                                         sg_copy_ptr.offset = 0;
894                                         ++dpseg;
895                                         size += sizeof(struct mlx5_wqe_data_seg) / 16;
896                                 }
897                         }
898                 }
899
900                 mlx5_opcode = mlx5_ib_opcode[wr->opcode];
901                 ctrl->opmod_idx_opcode = htobe32(((qp->sq.cur_post & 0xffff) << 8) |
902                                                mlx5_opcode                       |
903                                                (opmod << 24));
904                 ctrl->qpn_ds = htobe32(size | (ibqp->qp_num << 8));
905
906                 if (unlikely(qp->wq_sig))
907                         ctrl->signature = wq_sig(ctrl);
908
909                 qp->sq.wrid[idx] = wr->wr_id;
910                 qp->sq.wqe_head[idx] = qp->sq.head + nreq;
911                 qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
912
913 #ifdef MLX5_DEBUG
914                 if (mlx5_debug_mask & MLX5_DBG_QP_SEND)
915                         dump_wqe(to_mctx(ibqp->context)->dbg_fp, idx, size, qp);
916 #endif
917         }
918
919 out:
920         if (likely(nreq)) {
921                 qp->sq.head += nreq;
922                 qp->fm_cache = next_fence;
923
924                 /*
925                  * Make sure that descriptors are written before
926                  * updating doorbell record and ringing the doorbell
927                  */
928                 udma_to_device_barrier();
929                 qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff);
930
931                 /* Make sure that the doorbell write happens before the memcpy
932                  * to WC memory below */
933                 ctx = to_mctx(ibqp->context);
934                 if (bf->need_lock)
935                         mmio_wc_spinlock(&bf->lock.lock);
936                 else
937                         mmio_wc_start();
938
939                 if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn &&
940                     (inl || ctx->prefer_bf) && size > 1 &&
941                     size <= bf->buf_size / 16)
942                         mlx5_bf_copy(bf->reg + bf->offset, (unsigned long long *)ctrl,
943                                      align(size * 16, 64), qp);
944                 else
945                         mlx5_write64((__be32 *)ctrl, bf->reg + bf->offset,
946                                      &ctx->lock32);
947
948         /*
949          * Use mmio_flush_writes() to ensure the write-combining buffers are
950          * flushed out of the running CPU. This must be done while holding
951          * the spinlock; otherwise there is a potential race: CPU A writes
952          * doorbell 1, which waits in the WC buffer, CPU B then writes
953          * doorbell 2, and its write is flushed earlier. Since
954          * mmio_flush_writes() is CPU local, the HCA would see doorbell 2
955          * followed by doorbell 1.
956          * Flush before toggling bf_offset to stay latency oriented.
957          */
958                 mmio_flush_writes();
959                 bf->offset ^= bf->buf_size;
960                 if (bf->need_lock)
961                         mlx5_spin_unlock(&bf->lock);
962         }
963
964         mlx5_spin_unlock(&qp->sq.lock);
965
966         return err;
967 }
968
969 int mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
970                    struct ibv_send_wr **bad_wr)
971 {
972 #ifdef MW_DEBUG
973         if (wr->opcode == IBV_WR_BIND_MW) {
974                 if (wr->bind_mw.mw->type == IBV_MW_TYPE_1)
975                         return EINVAL;
976
977                 if (!wr->bind_mw.bind_info.mr ||
978                     !wr->bind_mw.bind_info.addr ||
979                     !wr->bind_mw.bind_info.length)
980                         return EINVAL;
981
982                 if (wr->bind_mw.bind_info.mr->pd != wr->bind_mw.mw->pd)
983                         return EINVAL;
984         }
985 #endif
986
987         return _mlx5_post_send(ibqp, wr, bad_wr);
988 }
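
/*
 * Illustrative caller-side sketch (not part of the original file): posting
 * a signaled RDMA WRITE, which ends up in _mlx5_post_send() above.  "mr",
 * "qp", "local_buf", "len", "remote_addr" and "remote_rkey" are
 * placeholders; error handling is omitted.
 *
 *	struct ibv_sge sge = {
 *		.addr   = (uintptr_t)local_buf,
 *		.length = len,
 *		.lkey   = mr->lkey,
 *	};
 *	struct ibv_send_wr wr = {
 *		.wr_id      = 1,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IBV_WR_RDMA_WRITE,
 *		.send_flags = IBV_SEND_SIGNALED,
 *	};
 *	struct ibv_send_wr *bad_wr;
 *
 *	wr.wr.rdma.remote_addr = remote_addr;
 *	wr.wr.rdma.rkey        = remote_rkey;
 *	int ret = ibv_post_send(qp, &wr, &bad_wr);
 */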
989
990 int mlx5_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
991                  struct ibv_mw_bind *mw_bind)
992 {
993         struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
994         struct ibv_send_wr wr = {};
995         struct ibv_send_wr *bad_wr = NULL;
996         int ret;
997
998         if (!bind_info->mr && (bind_info->addr || bind_info->length)) {
999                 errno = EINVAL;
1000                 return errno;
1001         }
1002
1003         if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) {
1004                 errno = EINVAL;
1005                 return errno;
1006         }
1007
1008         if (bind_info->mr) {
1009                 if (to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_ZERO_BASED) {
1010                         errno = EINVAL;
1011                         return errno;
1012                 }
1013
1014                 if (mw->pd != bind_info->mr->pd) {
1015                         errno = EPERM;
1016                         return errno;
1017                 }
1018         }
1019
1020         wr.opcode = IBV_WR_BIND_MW;
1021         wr.next = NULL;
1022         wr.wr_id = mw_bind->wr_id;
1023         wr.send_flags = mw_bind->send_flags;
1024         wr.bind_mw.bind_info = mw_bind->bind_info;
1025         wr.bind_mw.mw = mw;
1026         wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
1027
1028         ret = _mlx5_post_send(qp, &wr, &bad_wr);
1029         if (ret)
1030                 return ret;
1031
1032         mw->rkey = wr.bind_mw.rkey;
1033
1034         return 0;
1035 }
1036
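/*
 * Receive WQE signatures: when wq_sig is enabled, the signature byte is the
 * XOR of calc_sig() values computed over the WQE contents, the QP (or WQ)
 * number and the WQE index, as done by the two helpers below.
 */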
1037 static void set_sig_seg(struct mlx5_qp *qp, struct mlx5_rwqe_sig *sig,
1038                         int size, uint16_t idx)
1039 {
1040         uint8_t  sign;
1041         uint32_t qpn = qp->ibv_qp->qp_num;
1042
1043         sign = calc_sig(sig, size);
1044         sign ^= calc_sig(&qpn, 4);
1045         sign ^= calc_sig(&idx, 2);
1046         sig->signature = sign;
1047 }
1048
1049 static void set_wq_sig_seg(struct mlx5_rwq *rwq, struct mlx5_rwqe_sig *sig,
1050                            int size, uint16_t idx)
1051 {
1052         uint8_t  sign;
1053         uint32_t qpn = rwq->wq.wq_num;
1054
1055         sign = calc_sig(sig, size);
1056         sign ^= calc_sig(&qpn, 4);
1057         sign ^= calc_sig(&idx, 2);
1058         sig->signature = sign;
1059 }
1060
1061 int mlx5_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr,
1062                       struct ibv_recv_wr **bad_wr)
1063 {
1064         struct mlx5_rwq *rwq = to_mrwq(ibwq);
1065         struct mlx5_wqe_data_seg *scat;
1066         int err = 0;
1067         int nreq;
1068         int ind;
1069         int i, j;
1070         struct mlx5_rwqe_sig *sig;
1071
1072         mlx5_spin_lock(&rwq->rq.lock);
1073
1074         ind = rwq->rq.head & (rwq->rq.wqe_cnt - 1);
1075
1076         for (nreq = 0; wr; ++nreq, wr = wr->next) {
1077                 if (unlikely(mlx5_wq_overflow(&rwq->rq, nreq,
1078                                               to_mcq(rwq->wq.cq)))) {
1079                         err = ENOMEM;
1080                         *bad_wr = wr;
1081                         goto out;
1082                 }
1083
1084                 if (unlikely(wr->num_sge > rwq->rq.max_gs)) {
1085                         err = EINVAL;
1086                         *bad_wr = wr;
1087                         goto out;
1088                 }
1089
1090                 scat = get_wq_recv_wqe(rwq, ind);
1091                 sig = (struct mlx5_rwqe_sig *)scat;
1092                 if (unlikely(rwq->wq_sig)) {
1093                         memset(sig, 0, 1 << rwq->rq.wqe_shift);
1094                         ++scat;
1095                 }
1096
1097                 for (i = 0, j = 0; i < wr->num_sge; ++i) {
1098                         if (unlikely(!wr->sg_list[i].length))
1099                                 continue;
1100                         set_data_ptr_seg(scat + j++, wr->sg_list + i, 0);
1101                 }
1102
1103                 if (j < rwq->rq.max_gs) {
1104                         scat[j].byte_count = 0;
1105                         scat[j].lkey       = htobe32(MLX5_INVALID_LKEY);
1106                         scat[j].addr       = 0;
1107                 }
1108
1109                 if (unlikely(rwq->wq_sig))
1110                         set_wq_sig_seg(rwq, sig, (wr->num_sge + 1) << 4,
1111                                        rwq->rq.head & 0xffff);
1112
1113                 rwq->rq.wrid[ind] = wr->wr_id;
1114
1115                 ind = (ind + 1) & (rwq->rq.wqe_cnt - 1);
1116         }
1117
1118 out:
1119         if (likely(nreq)) {
1120                 rwq->rq.head += nreq;
1121                 /*
1122                  * Make sure that descriptors are written before
1123                  * doorbell record.
1124                  */
1125                 udma_to_device_barrier();
1126                 *(rwq->recv_db) = htobe32(rwq->rq.head & 0xffff);
1127         }
1128
1129         mlx5_spin_unlock(&rwq->rq.lock);
1130
1131         return err;
1132 }
1133
1134 int mlx5_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
1135                    struct ibv_recv_wr **bad_wr)
1136 {
1137         struct mlx5_qp *qp = to_mqp(ibqp);
1138         struct mlx5_wqe_data_seg *scat;
1139         int err = 0;
1140         int nreq;
1141         int ind;
1142         int i, j;
1143         struct mlx5_rwqe_sig *sig;
1144
1145         mlx5_spin_lock(&qp->rq.lock);
1146
1147         ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
1148
1149         for (nreq = 0; wr; ++nreq, wr = wr->next) {
1150                 if (unlikely(mlx5_wq_overflow(&qp->rq, nreq,
1151                                               to_mcq(qp->ibv_qp->recv_cq)))) {
1152                         err = ENOMEM;
1153                         *bad_wr = wr;
1154                         goto out;
1155                 }
1156
1157                 if (unlikely(wr->num_sge > qp->rq.max_gs)) {
1158                         err = EINVAL;
1159                         *bad_wr = wr;
1160                         goto out;
1161                 }
1162
1163                 scat = get_recv_wqe(qp, ind);
1164                 sig = (struct mlx5_rwqe_sig *)scat;
1165                 if (unlikely(qp->wq_sig)) {
1166                         memset(sig, 0, 1 << qp->rq.wqe_shift);
1167                         ++scat;
1168                 }
1169
1170                 for (i = 0, j = 0; i < wr->num_sge; ++i) {
1171                         if (unlikely(!wr->sg_list[i].length))
1172                                 continue;
1173                         set_data_ptr_seg(scat + j++, wr->sg_list + i, 0);
1174                 }
1175
1176                 if (j < qp->rq.max_gs) {
1177                         scat[j].byte_count = 0;
1178                         scat[j].lkey       = htobe32(MLX5_INVALID_LKEY);
1179                         scat[j].addr       = 0;
1180                 }
1181
1182                 if (unlikely(qp->wq_sig))
1183                         set_sig_seg(qp, sig, (wr->num_sge + 1) << 4,
1184                                     qp->rq.head & 0xffff);
1185
1186                 qp->rq.wrid[ind] = wr->wr_id;
1187
1188                 ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
1189         }
1190
1191 out:
1192         if (likely(nreq)) {
1193                 qp->rq.head += nreq;
1194
1195                 /*
1196                  * Make sure that descriptors are written before
1197                  * doorbell record.
1198                  */
1199                 udma_to_device_barrier();
1200
1201                 /*
1202                  * For a Raw Packet QP, avoid updating the doorbell record
1203                  * until the QP has reached the RTR state, so that no packets
1204                  * are received while the QP is in an illegal state.
1205                  * This applies only to Raw Packet QPs because they are
1206                  * represented differently in the hardware.
1207                  */
1208                 if (likely(!(ibqp->qp_type == IBV_QPT_RAW_PACKET &&
1209                              ibqp->state < IBV_QPS_RTR)))
1210                         qp->db[MLX5_RCV_DBR] = htobe32(qp->rq.head & 0xffff);
1211         }
1212
1213         mlx5_spin_unlock(&qp->rq.lock);
1214
1215         return err;
1216 }
1217
1218 int mlx5_use_huge(const char *key)
1219 {
1220         char *e;
1221         e = getenv(key);
1222         if (e && !strcmp(e, "y"))
1223                 return 1;
1224
1225         return 0;
1226 }
1227
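/*
 * The QP table is a two-level lookup: qpn >> MLX5_QP_TABLE_SHIFT selects a
 * chunk that is allocated on first use and reference-counted, and
 * qpn & MLX5_QP_TABLE_MASK indexes the QP within that chunk.
 */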
1228 struct mlx5_qp *mlx5_find_qp(struct mlx5_context *ctx, uint32_t qpn)
1229 {
1230         int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1231
1232         if (ctx->qp_table[tind].refcnt)
1233                 return ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK];
1234         else
1235                 return NULL;
1236 }
1237
1238 int mlx5_store_qp(struct mlx5_context *ctx, uint32_t qpn, struct mlx5_qp *qp)
1239 {
1240         int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1241
1242         if (!ctx->qp_table[tind].refcnt) {
1243                 ctx->qp_table[tind].table = calloc(MLX5_QP_TABLE_MASK + 1,
1244                                                    sizeof(struct mlx5_qp *));
1245                 if (!ctx->qp_table[tind].table)
1246                         return -1;
1247         }
1248
1249         ++ctx->qp_table[tind].refcnt;
1250         ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = qp;
1251         return 0;
1252 }
1253
1254 void mlx5_clear_qp(struct mlx5_context *ctx, uint32_t qpn)
1255 {
1256         int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1257
1258         if (!--ctx->qp_table[tind].refcnt)
1259                 free(ctx->qp_table[tind].table);
1260         else
1261                 ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = NULL;
1262 }