2 * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
/* Mellanox PCI vendor ID, used to match supported HCAs in hca_table. */
49 #ifndef PCI_VENDOR_ID_MELLANOX
50 #define PCI_VENDOR_ID_MELLANOX 0x15b3
/* Fallback no-op stubs — presumably for platforms whose cpuset API lacks
 * CPU_OR()/CPU_EQUAL(); with these stubs the Sandy Bridge stall detection
 * below degrades gracefully (CPU_EQUAL always "equal"). TODO confirm the
 * guarding #if conditions, which are not visible in this extract. */
54 #define CPU_OR(x, y, z) do {} while (0)
58 #define CPU_EQUAL(x, y) 1
/* HCA(v, d) expands to one PCI vendor/device table entry; the table lists
 * every PCI device ID this provider claims in mlx5_driver_init(). */
63 { .vendor = PCI_VENDOR_ID_##v, \
70 HCA(MELLANOX, 4113), /* MT4113 Connect-IB */
71 HCA(MELLANOX, 4114), /* Connect-IB Virtual Function */
72 HCA(MELLANOX, 4115), /* ConnectX-4 */
73 HCA(MELLANOX, 4116), /* ConnectX-4 Virtual Function */
74 HCA(MELLANOX, 4117), /* ConnectX-4LX */
75 HCA(MELLANOX, 4118), /* ConnectX-4LX Virtual Function */
76 HCA(MELLANOX, 4119), /* ConnectX-5, PCIe 3.0 */
77 HCA(MELLANOX, 4120), /* ConnectX-5 Virtual Function */
78 HCA(MELLANOX, 4121), /* ConnectX-5 Ex */
79 HCA(MELLANOX, 4122), /* ConnectX-5 Ex VF */
80 HCA(MELLANOX, 4123), /* ConnectX-6 */
81 HCA(MELLANOX, 4124), /* ConnectX-6 VF */
82 HCA(MELLANOX, 41682), /* BlueField integrated ConnectX-5 network controller */
83 HCA(MELLANOX, 41683), /* BlueField integrated ConnectX-5 network controller VF */
/* Runtime debug controls, set from the environment at context init:
 * mlx5_debug_mask is read from MLX5_DEBUG_MASK (see set_debug_mask()),
 * mlx5_freeze_on_error_cqe from MLX5_FREEZE_ON_ERROR_CQE. */
86 uint32_t mlx5_debug_mask = 0;
87 int mlx5_freeze_on_error_cqe;
/* Verbs dispatch table installed into ibv_ctx.ops by mlx5_init_context().
 * Not const: poll_cq is patched to mlx5_poll_cq_v1 at init time when the
 * kernel reports CQE version 1. */
89 static struct ibv_context_ops mlx5_ctx_ops = {
90 .query_device = mlx5_query_device,
91 .query_port = mlx5_query_port,
92 .alloc_pd = mlx5_alloc_pd,
93 .dealloc_pd = mlx5_free_pd,
94 .reg_mr = mlx5_reg_mr,
95 .rereg_mr = mlx5_rereg_mr,
96 .dereg_mr = mlx5_dereg_mr,
97 .alloc_mw = mlx5_alloc_mw,
98 .dealloc_mw = mlx5_dealloc_mw,
99 .bind_mw = mlx5_bind_mw,
100 .create_cq = mlx5_create_cq,
101 .poll_cq = mlx5_poll_cq,
102 .req_notify_cq = mlx5_arm_cq,
103 .cq_event = mlx5_cq_event,
104 .resize_cq = mlx5_resize_cq,
105 .destroy_cq = mlx5_destroy_cq,
106 .create_srq = mlx5_create_srq,
107 .modify_srq = mlx5_modify_srq,
108 .query_srq = mlx5_query_srq,
109 .destroy_srq = mlx5_destroy_srq,
110 .post_srq_recv = mlx5_post_srq_recv,
111 .create_qp = mlx5_create_qp,
112 .query_qp = mlx5_query_qp,
113 .modify_qp = mlx5_modify_qp,
114 .destroy_qp = mlx5_destroy_qp,
115 .post_send = mlx5_post_send,
116 .post_recv = mlx5_post_recv,
117 .create_ah = mlx5_create_ah,
118 .destroy_ah = mlx5_destroy_ah,
119 .attach_mcast = mlx5_attach_mcast,
120 .detach_mcast = mlx5_detach_mcast
/* Parse the integer that follows the ':' in a "key : value" line from
 * /proc/cpuinfo, storing it in *value. Return convention (0 on success,
 * nonzero on failure) is inferred from the callers in mlx5_is_sandy_bridge();
 * the body after the strchr() is not visible in this extract. */
123 static int read_number_from_line(const char *line, int *value)
127 ptr = strchr(line, ':');
137 * The function looks for the first free user-index in all the
138 * user-index tables. If all are used, returns -1, otherwise
139 * a valid user-index.
140 * In case the reference count of the table is zero, it means the
141 * table is not in use and wasn't allocated yet, therefore the
142 * mlx5_store_uidx allocates the table, and increments the reference
143 * count on the table.
/* Find the first free user-index. A uidx is split into a table index
 * (high bits, >> MLX5_UIDX_TABLE_SHIFT) and a slot within that table
 * (low bits, & MLX5_UIDX_TABLE_MASK). Returns a free uidx, or -1 when
 * all tables are full (return path not visible in this extract).
 * Caller must hold ctx->uidx_table_mutex. */
145 static int32_t get_free_uidx(struct mlx5_context *ctx)
/* First table with refcnt below the per-table capacity has a free slot. */
150 for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
151 if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
155 if (tind == MLX5_UIDX_TABLE_SIZE)
/* refcnt == 0 means the table was never allocated: slot 0 is free. */
158 if (!ctx->uidx_table[tind].refcnt)
159 return tind << MLX5_UIDX_TABLE_SHIFT;
/* Otherwise scan the allocated table for a NULL slot. */
161 for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
162 if (!ctx->uidx_table[tind].table[i])
166 return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
/* Store resource @rsc under a freshly allocated user-index and return it
 * (negative on failure — error paths are not visible in this extract).
 * Lazily allocates the second-level table on first use and bumps its
 * refcnt for each stored entry; mlx5_clear_uidx() is the inverse. */
169 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
175 pthread_mutex_lock(&ctx->uidx_table_mutex);
176 uidx = get_free_uidx(ctx);
180 tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
/* refcnt == 0: table not yet allocated; create it zero-filled. */
182 if (!ctx->uidx_table[tind].refcnt) {
183 ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
184 sizeof(struct mlx5_resource *));
185 if (!ctx->uidx_table[tind].table)
189 ++ctx->uidx_table[tind].refcnt;
190 ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
194 pthread_mutex_unlock(&ctx->uidx_table_mutex);
/* Release user-index @uidx: drop the table refcnt and free the whole
 * second-level table when it reaches zero, otherwise just NULL the slot.
 * NOTE(review): as extracted this reads as "free then store", but the
 * original line numbering shows a missing line (orig 206) between the
 * free and the store — presumably an `else` — so the store only happens
 * when the table was NOT freed. Confirm against the full source. */
198 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
200 int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
202 pthread_mutex_lock(&ctx->uidx_table_mutex);
204 if (!--ctx->uidx_table[tind].refcnt)
205 free(ctx->uidx_table[tind].table);
207 ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
209 pthread_mutex_unlock(&ctx->uidx_table_mutex);
/* Detect an Intel Sandy Bridge CPU by parsing /proc/cpuinfo.
 * Matches family 6, model 0x2A (client) or 0x2D (server/E5).
 * Presumably also counts "processor" lines into *num_cores — the
 * counting statement is not visible in this extract; confirm.
 * Used to decide whether the CQ-poll stall workaround is needed. */
212 static int mlx5_is_sandy_bridge(int *num_cores)
217 int cur_cpu_family = -1;
218 int cur_cpu_model = -1;
220 fd = fopen("/proc/cpuinfo", "r");
226 while (fgets(line, 128, fd)) {
229 /* if this is information on new processor */
230 if (!strncmp(line, "processor", 9)) {
235 } else if (!strncmp(line, "cpu family", 10)) {
/* Only latch the first valid value for each key per processor block. */
236 if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
237 cur_cpu_family = value;
238 } else if (!strncmp(line, "model", 5)) {
239 if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
240 cur_cpu_model = value;
243 /* if this is a Sandy Bridge CPU */
244 if ((cur_cpu_family == 6) &&
245 (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) ))
256 This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
257 are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
258 words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
259 within a word are also in big-endian order.
261 The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
262 the size of the bitmask.
264 Examples of the Mask Format:
266 00000001 # just bit 0 set
267 40000000,00000000,00000000 # just bit 94 set
268 000000ff,00000000 # bits 32-39 set
269 00000000,000E3862 # 1,5,6,11-13,17-19 set
271 A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
273 00000001,00000001,00010117
275 The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
276 bit 4, and the "7" is for bits 2, 1, and 0.
/* Fill *cpu_set with the CPUs local to @ibdev's NUMA node, parsed from
 * the hex bitmask format documented in the comment above (comma-separated
 * 32-bit words, most significant word first). The mask comes from the
 * MLX5_LOCAL_CPUS env override when set, otherwise from sysfs
 * device/local_cpus. Words are consumed from the END of the string
 * (strrchr ',') so bit numbering starts at the least significant word. */
278 static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
285 env_value = getenv("MLX5_LOCAL_CPUS");
/* NOTE(review): strncpy does not guarantee NUL termination when
 * env_value >= sizeof(buf); confirm buf handling in the full source. */
287 strncpy(buf, env_value, sizeof(buf));
289 char fname[MAXPATHLEN];
291 snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
292 ibv_get_device_name(ibdev));
294 if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
295 fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
/* p points at the last (least significant) word of the mask. */
300 p = strrchr(buf, ',');
311 word = strtoul(p, NULL, 16);
/* Set one CPU bit per set bit in this 32-bit word; i is presumably the
 * bit offset of the current word — advance logic not visible here. */
313 for (k = 0; word; ++k, word >>= 1)
315 CPU_SET(k+i, cpu_set);
320 p = strrchr(buf, ',');
325 } while (i < CPU_SETSIZE);
/* Decide whether to enable the CQ-poll stall workaround for Sandy Bridge.
 * Returns nonzero to enable stalling. Stalling is enabled by default on
 * Sandy Bridge, but disabled when the process affinity is entirely inside
 * the device's local CPU set (no cross-socket traffic to mitigate). */
328 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
330 cpuset_t my_cpus, dev_local_cpus, result_set;
335 if (!mlx5_is_sandy_bridge(&num_cores))
338 /* by default enable stall on sandy bridge arch */
342 * check if app is bound to cpu set that is inside
343 * of device local cpu set. Disable stalling if true
346 /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
348 CPU_ZERO(&dev_local_cpus);
349 CPU_ZERO(&result_set);
/* FreeBSD-style affinity query for the current process (-1 = self). */
350 ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
351 sizeof(my_cpus), &my_cpus);
354 fprintf(stderr, PFX "Warning: my cpu set is too small\n");
356 fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
360 /* get device local cpu set */
361 mlx5_local_cpu_set(ibdev, &dev_local_cpus);
/* result_set = my_cpus | dev_local_cpus; if that equals dev_local_cpus,
 * the process runs only on device-local CPUs -> disable stalling. */
363 /* check if my cpu set is in dev cpu */
364 CPU_OR(&result_set, &my_cpus);
365 CPU_OR(&result_set, &dev_local_cpus);
366 stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
/* Read CQ-stall tuning knobs from the environment into the context and
 * module globals. MLX5_STALL_CQ_POLL forces stalling on/off; when unset,
 * stalling is autodetected via the Sandy Bridge heuristic. A negative
 * MLX5_STALL_NUM_LOOP selects the adaptive stall mode. */
372 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
376 env_value = getenv("MLX5_STALL_CQ_POLL")
378 /* check if cq stall is enforced by user */
379 ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
381 /* autodetect if we need to do cq polling */
382 ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
384 env_value = getenv("MLX5_STALL_NUM_LOOP");
386 mlx5_stall_num_loop = atoi(env_value);
388 env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
390 mlx5_stall_cq_poll_min = atoi(env_value);
392 env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
394 mlx5_stall_cq_poll_max = atoi(env_value);
396 env_value = getenv("MLX5_STALL_CQ_INC_STEP");
398 mlx5_stall_cq_inc_step = atoi(env_value);
400 env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
402 mlx5_stall_cq_dec_step = atoi(env_value);
404 ctx->stall_adaptive_enable = 0;
405 ctx->stall_cycles = 0;
/* Negative loop count = adaptive stalling starting at the minimum. */
407 if (mlx5_stall_num_loop < 0) {
408 ctx->stall_adaptive_enable = 1;
409 ctx->stall_cycles = mlx5_stall_cq_poll_min;
/* Compute the total number of UUARs (micro UARs / blue-flame registers)
 * to request from the kernel. Starts from MLX5_DEF_TOT_UUARS, may be
 * overridden via MLX5_TOTAL_UUARS (override handling not fully visible),
 * is raised to at least one system page worth of bfregs, aligned to the
 * per-UAR count, and capped at MLX5_MAX_BFREGS. */
414 static int get_total_uuars(int page_size)
416 int size = MLX5_DEF_TOT_UUARS;
420 env = getenv("MLX5_TOTAL_UUARS");
427 uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
428 size = max(uuars_in_page, size);
429 size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
430 if (size > MLX5_MAX_BFREGS)
/* Open the debug output stream: the file named by MLX5_DEBUG_FILE when
 * set and openable, otherwise stderr. Never fails; always leaves
 * ctx->dbg_fp usable. close_debug_file() is the counterpart. */
436 static void open_debug_file(struct mlx5_context *ctx)
440 env = getenv("MLX5_DEBUG_FILE");
442 ctx->dbg_fp = stderr;
446 ctx->dbg_fp = fopen(env, "aw+");
448 fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
449 ctx->dbg_fp = stderr;
/* Close the debug stream opened by open_debug_file(), but never close
 * stderr (used as the fallback stream). */
454 static void close_debug_file(struct mlx5_context *ctx)
456 if (ctx->dbg_fp && ctx->dbg_fp != stderr)
/* Load mlx5_debug_mask from MLX5_DEBUG_MASK (any strtol base-0 format:
 * decimal, 0x hex, or octal). */
460 static void set_debug_mask(void)
464 env = getenv("MLX5_DEBUG_MASK");
466 mlx5_debug_mask = strtol(env, NULL, 0);
/* Load mlx5_freeze_on_error_cqe from MLX5_FREEZE_ON_ERROR_CQE
 * (strtol base 0); a debugging aid consumed by the CQ poll path. */
469 static void set_freeze_on_error(void)
473 env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
475 mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
/* Return 1 when MLX5_POST_SEND_PREFER_BF is set to anything but "0",
 * 0 when it is "0" (default path for unset not visible here). */
478 static int get_always_bf(void)
482 env = getenv("MLX5_POST_SEND_PREFER_BF");
486 return strcmp(env, "0") ? 1 : 0;
/* Return 1 when MLX5_SHUT_UP_BF is set to anything but "0" (disables
 * blue-flame writes), 0 when it is "0". */
489 static int get_shut_up_bf(void)
493 env = getenv("MLX5_SHUT_UP_BF");
497 return strcmp(env, "0") ? 1 : 0;
/* Number of UUARs reserved for low-latency (lockless, one-QP) use.
 * Optionally overridden by MLX5_NUM_LOW_LAT_UUARS; raised so that at
 * most MLX5_MED_BFREGS_TSHOLD uuars remain in the shared/medium pool. */
500 static int get_num_low_lat_uuars(int tot_uuars)
505 env = getenv("MLX5_NUM_LOW_LAT_UUARS");
512 num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
516 /* The library allocates an array of uuar contexts. The one in index zero does
517 * not exercise the odd/even policy so it can avoid a lock but it may not use
518 * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
519 * since they are assigned to one QP only. The rest can use blue flame but since
520 * they are shared they need a lock
/* Whether bfreg @uuarn needs a spinlock (see the policy comment above):
 * index 0 and single-threaded apps never lock; the top low-latency
 * uuars (beyond the shared range, 2 bfregs per uuar) are per-QP and
 * also lock-free. The remaining shared ones return "lock needed"
 * (that return is outside this extract). */
522 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
524 if (uuarn == 0 || mlx5_single_threaded)
527 if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
/* Return 1 only when MLX5_SINGLE_THREADED is exactly "1", letting the
 * app promise it is single-threaded so bfreg locking can be skipped. */
533 static int single_threaded_app(void)
538 env = getenv("MLX5_SINGLE_THREADED");
540 return strcmp(env, "1") ? 0 : 1;
/* Issue the ALLOC_UCONTEXT command, retrying with progressively shorter
 * request lengths for older kernels that reject unknown trailing fields:
 * full length first, then truncated at lib_caps, then at the next older
 * feature boundary (field name cut off in this extract). Returns 0 on
 * success or the final ibv_cmd_get_context() error. */
545 static int mlx5_cmd_get_context(struct mlx5_context *context,
546 struct mlx5_alloc_ucontext *req,
548 struct mlx5_alloc_ucontext_resp *resp,
551 if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
552 req_len, &resp->ibv_resp, resp_len))
555 /* The ibv_cmd_get_context fails in older kernels when passing
556 * a request length that the kernel doesn't know.
557 * To avoid breaking compatibility of new libmlx5 and older
558 * kernels, when ibv_cmd_get_context fails with the full
559 * request length, we try once again with the legacy length.
560 * We repeat this process while reducing requested size based
561 * on the feature input size. To avoid this in the future, we
562 * will remove the check in kernel that requires fields unknown
563 * to the kernel to be cleared. This will require that any new
564 * feature that involves extending struct mlx5_alloc_ucontext
565 * will be accompanied by an indication in the form of one or
566 * more fields in struct mlx5_alloc_ucontext_resp. If the
567 * response value can be interpreted as feature not supported
568 * when the returned value is zero, this will suffice to
569 * indicate to the library that the request was ignored by the
570 * kernel, either because it is unaware or because it decided
571 * to do so. If zero is a valid response, we will add a new
572 * field that indicates whether the request was handled.
573 */
574 if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
575 offsetof(struct mlx5_alloc_ucontext, lib_caps),
576 &resp->ibv_resp, resp_len))
579 return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
580 offsetof(struct mlx5_alloc_ucontext,
582 &resp->ibv_resp, resp_len);
/* mmap the HCA free-running core clock page (read-only) for userspace
 * timestamping and store the pointer, adjusted by the in-page offset
 * reported by the kernel, in context->hca_core_clock. mmap failure is
 * non-fatal: a warning is printed and timestamping stays unavailable. */
585 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
586 struct ibv_context *ibv_ctx)
588 struct mlx5_context *context = to_mctx(ibv_ctx);
589 void *hca_clock_page;
/* Encode the core-clock mmap command in the file offset cookie. */
592 set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
593 hca_clock_page = mmap(NULL, mdev->page_size,
594 PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
595 mdev->page_size * offset);
597 if (hca_clock_page == MAP_FAILED) {
599 "Warning: Timestamp available,\n"
600 "but failed to mmap() hca core clock page.\n");
/* The clock register may not start at the page boundary. */
604 context->hca_core_clock = hca_clock_page +
605 (context->core_clock.offset & (mdev->page_size - 1));
/* Direct-verbs: report mlx5-specific capabilities of @ctx_in.
 * Fills version/flags, and for each comp_mask bit the caller requested
 * and we support, copies the capability and echoes the bit back in
 * comp_mask so the caller can tell which fields are valid. */
609 int mlx5dv_query_device(struct ibv_context *ctx_in,
610 struct mlx5dv_context *attrs_out)
612 struct mlx5_context *mctx = to_mctx(ctx_in);
613 uint64_t comp_mask_out = 0;
615 attrs_out->version = 0;
616 attrs_out->flags = 0;
618 if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
619 attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
621 if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
622 attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
/* Optional: CQE compression caps, only when requested by the caller. */
624 if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
625 attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
626 comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
629 attrs_out->comp_mask = comp_mask_out;
/* Direct-verbs: expose the raw mlx5 QP layout (doorbell record, SQ/RQ
 * buffers, WQE counts/strides, blue-flame register) to the caller. */
634 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
635 struct mlx5dv_qp *qp_out)
637 struct mlx5_qp *mqp = to_mqp(qp_in);
639 qp_out->comp_mask = 0;
640 qp_out->dbrec = mqp->db;
/* Raw-packet QPs keep the SQ in a separate buffer; otherwise the SQ
 * lives inside the main QP buffer at sq.offset. */
642 if (mqp->sq_buf_size)
643 /* IBV_QPT_RAW_PACKET */
644 qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
646 qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
647 qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
648 qp_out->sq.stride = 1 << mqp->sq.wqe_shift;
650 qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
651 qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
652 qp_out->rq.stride = 1 << mqp->rq.wqe_shift;
654 qp_out->bf.reg = mqp->bf->reg;
/* uuarn 0 has no usable blue flame; report its size only for uuarn > 0
 * (the else branch setting size is outside this extract). */
656 if (mqp->bf->uuarn > 0)
657 qp_out->bf.size = mqp->bf->buf_size;
/* Direct-verbs: expose the raw mlx5 CQ layout (CQN, CQE count/size,
 * buffer, doorbell record, UAR) and mark the CQ as DV-owned so the
 * regular poll path knows userspace consumes CQEs directly. */
664 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
665 struct mlx5dv_cq *cq_out)
667 struct mlx5_cq *mcq = to_mcq(cq_in);
668 struct mlx5_context *mctx = to_mctx(cq_in->context);
670 cq_out->comp_mask = 0;
671 cq_out->cqn = mcq->cqn;
/* ibv_cq.cqe is the usable count (size - 1); report the full count. */
672 cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
673 cq_out->cqe_size = mcq->cqe_sz;
674 cq_out->buf = mcq->active_buf->buf;
675 cq_out->dbrec = mcq->dbrec;
676 cq_out->uar = mctx->uar;
678 mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;
/* Direct-verbs: expose the raw mlx5 receive work queue layout
 * (buffer, doorbell record, WQE count and stride). */
683 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
684 struct mlx5dv_rwq *rwq_out)
686 struct mlx5_rwq *mrwq = to_mrwq(wq_in);
688 rwq_out->comp_mask = 0;
689 rwq_out->buf = mrwq->pbuff;
690 rwq_out->dbrec = mrwq->recv_db;
691 rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
692 rwq_out->stride = 1 << mrwq->rq.wqe_shift;
/* Direct-verbs: expose the raw mlx5 SRQ layout (buffer, doorbell
 * record, stride, head/tail indices). */
697 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
698 struct mlx5dv_srq *srq_out)
700 struct mlx5_srq *msrq;
/* SRQs are embedded in a verbs_srq wrapper; recover the mlx5 struct. */
702 msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
704 srq_out->comp_mask = 0;
705 srq_out->buf = msrq->buf.buf;
706 srq_out->dbrec = msrq->db;
707 srq_out->stride = 1 << msrq->wqe_shift;
708 srq_out->head = msrq->head;
709 srq_out->tail = msrq->tail;
/* Direct-verbs entry point: fill the requested dv views in @obj for each
 * object type bit set in @obj_type (QP, CQ, SRQ, RWQ). Stops at the
 * first failure and returns that error; 0 when all requested succeed. */
714 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
718 if (obj_type & MLX5DV_OBJ_QP)
719 ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
720 if (!ret && (obj_type & MLX5DV_OBJ_CQ))
721 ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
722 if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
723 ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
724 if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
725 ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
/* Derive UAR geometry from the ALLOC_UCONTEXT response. Older kernels
 * report neither log_uar_size nor num_uars_per_page; fall back to one
 * system-page-sized UAR per page for them. */
730 static void adjust_uar_info(struct mlx5_device *mdev,
731 struct mlx5_context *context,
732 struct mlx5_alloc_ucontext_resp resp)
734 if (!resp.log_uar_size && !resp.num_uars_per_page) {
/* Legacy kernel: assume one full-page UAR. */
736 context->uar_size = mdev->page_size;
737 context->num_uars_per_page = 1;
741 context->uar_size = 1 << resp.log_uar_size;
742 context->num_uars_per_page = resp.num_uars_per_page;
/* Provider init_context callback: negotiate a user context with the
 * kernel, cache device limits, map the UAR/blue-flame pages, build the
 * bfreg table, map the core clock, install the verbs op tables, and
 * cache device/port attributes. Returns 0 on success; the error paths
 * (unmapping UARs and closing the debug file) are at the bottom. */
745 static int mlx5_init_context(struct verbs_device *vdev,
746 struct ibv_context *ctx, int cmd_fd)
748 struct mlx5_context *context;
749 struct mlx5_alloc_ucontext req;
750 struct mlx5_alloc_ucontext_resp resp;
758 struct mlx5_device *mdev;
759 struct verbs_context *v_ctx;
760 struct ibv_port_attr port_attr;
761 struct ibv_device_attr_ex device_attr;
764 int num_sys_page_map;
766 mdev = to_mdev(&vdev->device);
767 v_ctx = verbs_get_ctx(ctx);
768 page_size = mdev->page_size;
769 mlx5_single_threaded = single_threaded_app();
771 context = to_mctx(ctx);
772 context->ibv_ctx.cmd_fd = cmd_fd;
774 open_debug_file(context);
776 set_freeze_on_error();
777 if (gethostname(context->hostname, sizeof(context->hostname)))
778 strcpy(context->hostname, "host_unknown");
/* Decide how many uuars to ask for and how many are low-latency. */
780 tot_uuars = get_total_uuars(page_size);
786 low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
787 if (low_lat_uuars < 0) {
788 errno = -low_lat_uuars;
/* At least one uuar must remain in the shared pool. */
792 if (low_lat_uuars > tot_uuars - 1) {
797 memset(&req, 0, sizeof(req));
798 memset(&resp, 0, sizeof(resp));
800 req.total_num_uuars = tot_uuars;
801 req.num_low_latency_uuars = low_lat_uuars;
802 req.cqe_version = MLX5_CQE_VERSION_V1;
803 req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
805 if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
/* Cache the limits the kernel granted (may differ from the request). */
809 context->max_num_qps = resp.qp_tab_size;
810 context->bf_reg_size = resp.bf_reg_size;
811 context->tot_uuars = resp.tot_uuars;
812 context->low_lat_uuars = low_lat_uuars;
813 context->cache_line_size = resp.cache_line_size;
814 context->max_sq_desc_sz = resp.max_sq_desc_sz;
815 context->max_rq_desc_sz = resp.max_rq_desc_sz;
816 context->max_send_wqebb = resp.max_send_wqebb;
817 context->num_ports = resp.num_ports;
818 context->max_recv_wr = resp.max_recv_wr;
819 context->max_srq_recv_wr = resp.max_srq_recv_wr;
821 context->cqe_version = resp.cqe_version;
/* CQE v1 needs its own poll routine; patch the shared ops table. */
822 if (context->cqe_version) {
823 if (context->cqe_version == MLX5_CQE_VERSION_V1)
824 mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
829 adjust_uar_info(mdev, context, resp);
/* gross count includes the fast-path bfregs interleaved per UAR. */
831 gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
832 context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
838 context->cmds_supp_uhw = resp.cmds_supp_uhw;
839 context->vendor_cap_flags = 0;
841 pthread_mutex_init(&context->qp_table_mutex, NULL);
842 pthread_mutex_init(&context->srq_table_mutex, NULL);
843 pthread_mutex_init(&context->uidx_table_mutex, NULL);
844 for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
845 context->qp_table[i].refcnt = 0;
/* NOTE(review): bound is MLX5_QP_TABLE_SIZE but the array is
 * uidx_table — presumably the two sizes are defined equal; confirm
 * against MLX5_UIDX_TABLE_SIZE in the header. */
847 for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
848 context->uidx_table[i].refcnt = 0;
850 context->db_list = NULL;
852 pthread_mutex_init(&context->db_list_mutex, NULL);
/* Map one system page per group of UARs; the command/index are encoded
 * in the mmap offset cookie. */
854 num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
855 for (i = 0; i < num_sys_page_map; ++i) {
857 set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
858 set_index(i, &offset);
859 context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
860 cmd_fd, page_size * offset);
861 if (context->uar[i] == MAP_FAILED) {
862 context->uar[i] = NULL;
/* Populate one bfreg descriptor per blue-flame register within each
 * mapped UAR page: address, locking policy, buffer size, index. */
867 for (i = 0; i < num_sys_page_map; i++) {
868 for (j = 0; j < context->num_uars_per_page; j++) {
869 for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
870 bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
871 context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
872 MLX5_BF_OFFSET + k * context->bf_reg_size;
873 context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
874 mlx5_spinlock_init(&context->bfs[bfi].lock);
875 context->bfs[bfi].offset = 0;
877 context->bfs[bfi].buf_size = context->bf_reg_size / 2;
878 context->bfs[bfi].uuarn = bfi;
/* Map the core clock only when the kernel's response is long enough to
 * contain the offset field AND the comp_mask bit says it is valid. */
882 context->hca_core_clock = NULL;
883 if (resp.response_length + sizeof(resp.ibv_resp) >=
884 offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
885 sizeof(resp.hca_core_clock_offset) &&
886 resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
887 context->core_clock.offset = resp.hca_core_clock_offset;
888 mlx5_map_internal_clock(mdev, ctx);
891 mlx5_spinlock_init(&context->lock32);
893 context->prefer_bf = get_always_bf();
894 context->shut_up_bf = get_shut_up_bf();
895 mlx5_read_env(&vdev->device, context);
897 mlx5_spinlock_init(&context->hugetlb_lock);
898 TAILQ_INIT(&context->hugetlb_list);
/* Install the base verbs ops, then the extended (verbs_context) ops. */
900 context->ibv_ctx.ops = mlx5_ctx_ops;
902 verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
903 verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
904 verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
905 verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
906 verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
907 verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
908 verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
909 verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
910 verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
911 verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
912 verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
913 verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
914 verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
915 verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
916 verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
/* Best-effort caching of device and per-port attributes; failures here
 * are ignored rather than failing context creation. */
918 memset(&device_attr, 0, sizeof(device_attr));
919 if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
920 sizeof(struct ibv_device_attr_ex))) {
921 context->cached_device_cap_flags =
922 device_attr.orig_attr.device_cap_flags;
923 context->atomic_cap = device_attr.orig_attr.atomic_cap;
924 context->cached_tso_caps = device_attr.tso_caps;
927 for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
928 memset(&port_attr, 0, sizeof(port_attr));
929 if (!mlx5_query_port(ctx, j + 1, &port_attr))
930 context->cached_link_layer[j] = port_attr.link_layer;
/* Error path: unmap whatever UAR pages were mapped and close debug. */
939 for (i = 0; i < MLX5_MAX_UARS; ++i) {
941 munmap(context->uar[i], page_size);
943 close_debug_file(context);
/* Provider uninit_context callback: unmap all mapped UAR pages and the
 * core clock page (re-deriving its page base from the stored in-page
 * offset), then close the debug stream. Mirrors mlx5_init_context(). */
947 static void mlx5_cleanup_context(struct verbs_device *device,
948 struct ibv_context *ibctx)
950 struct mlx5_context *context = to_mctx(ibctx);
951 int page_size = to_mdev(ibctx->device)->page_size;
955 for (i = 0; i < MLX5_MAX_UARS; ++i) {
957 munmap(context->uar[i], page_size);
959 if (context->hca_core_clock)
960 munmap(context->hca_core_clock - context->core_clock.offset,
962 close_debug_file(context);
/* Per-device lifecycle hooks handed to libibverbs via mlx5_driver_init(). */
965 static struct verbs_device_ops mlx5_dev_ops = {
966 .init_context = mlx5_init_context,
967 .uninit_context = mlx5_cleanup_context,
/* libibverbs driver probe: read the PCI vendor/device IDs from sysfs,
 * match them against hca_table, validate the kernel ABI range, and
 * allocate the mlx5_device wrapper. Returns NULL when the device is not
 * ours or on any failure (early-return paths fall outside this extract). */
970 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
974 struct mlx5_device *dev;
975 unsigned vendor, device;
978 if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
979 value, sizeof value) < 0)
981 sscanf(value, "%i", &vendor);
983 if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
984 value, sizeof value) < 0)
986 sscanf(value, "%i", &device);
/* Claim the device only if its IDs appear in our supported table. */
988 for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
989 if (vendor == hca_table[i].vendor &&
990 device == hca_table[i].device)
996 if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
997 abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
998 fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
999 "(min supported %d, max supported %d)\n",
1000 abi_version, uverbs_sys_path,
1001 MLX5_UVERBS_MIN_ABI_VERSION,
1002 MLX5_UVERBS_MAX_ABI_VERSION);
1006 dev = calloc(1, sizeof *dev);
1008 fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
/* size_of_context tells libibverbs how much extra space to allocate
 * beyond ibv_context for our mlx5_context. */
1013 dev->page_size = sysconf(_SC_PAGESIZE);
1014 dev->driver_abi_ver = abi_version;
1016 dev->verbs_dev.ops = &mlx5_dev_ops;
1017 dev->verbs_dev.sz = sizeof(*dev);
1018 dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1019 sizeof(struct ibv_context);
1021 return &dev->verbs_dev;
/* Shared-library constructor: registers this provider with libibverbs
 * at load time so mlx5_driver_init() is called for each uverbs device. */
1024 static __attribute__((constructor)) void mlx5_register_driver(void)
1026 verbs_register_driver("mlx5", mlx5_driver_init);