/*
 * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX 0x15b3
#endif

/* Compat fallbacks for platforms whose <sys/cpuset.h> lacks these macros;
 * the no-op definitions keep the Sandy Bridge affinity check below harmless.
 */
#ifndef CPU_OR
#define CPU_OR(x, y) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif
#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v, \
	  .device = d }

/* PCI IDs of supported devices; e.g. HCA(MELLANOX, 4113) expands to
 * { .vendor = 0x15b3, .device = 4113 }. */
static struct {
	unsigned vendor;
	unsigned device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4 Lx */
	HCA(MELLANOX, 4118),	/* ConnectX-4 Lx Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex Virtual Function */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 Virtual Function */
	HCA(MELLANOX, 4125),	/* ConnectX-6 Dx */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};
uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;
static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr        = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};
/* Parse the integer after the ':' in a /proc/cpuinfo line such as
 * "model : 45". Returns 0 on success, nonzero if no ':' is present. */
static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;
	*value = atoi(ptr);
	return 0;
}
/*
 * get_free_uidx() looks for the first free user-index in all the
 * user-index tables. If all are used it returns -1; otherwise it
 * returns a valid user-index.
 * When the reference count of a table is zero, the table is not in
 * use and has not been allocated yet; in that case mlx5_store_uidx()
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int tind;
	int i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	if (i == MLX5_UIDX_TABLE_MASK + 1)
		return -1;

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}
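
/*
 * Illustrative note (the concrete values depend on the MLX5_UIDX_TABLE_*
 * constants defined in mlx5.h): the returned user-index packs the table
 * number in its high bits and the slot within that table in its low bits.
 * Assuming a shift of 10 and a mask of 0x3ff, uidx 0x1203 would decode as
 * table 4 (0x1203 >> 10), slot 0x203 (0x1203 & 0x3ff).
 */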
int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}
void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}
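
/*
 * Typical pairing of the two helpers above, sketched for illustration only
 * (creation paths store the resource, destruction paths clear it):
 *
 *	int32_t uidx = mlx5_store_uidx(ctx, rsc);
 *	if (uidx < 0)
 *		return ENOMEM;
 *	...
 *	mlx5_clear_uidx(ctx, uidx);
 */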
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;
			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			/* the numeric "model" line precedes "model name", so
			 * the < 0 guard keeps the numeric value */
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* family 6, models 0x2A/0x2D are the Sandy Bridge
		 * client/server parts */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}
/*
The local_cpus mask format (as in the Linux sysfs cpumask files):

This format displays each 32-bit word in hexadecimal (using ASCII characters
"0" - "9" and "a" - "f"); words are filled with leading zeros, if required.
For masks longer than one word, a comma separator is used between words.
Words are displayed in big-endian order, which has the most significant bit
first. The hex digits within a word are also in big-endian order.

The number of 32-bit words displayed is the minimum number needed to display
all bits of the bitmask, based on the size of the bitmask.

Examples of the Mask Format:

	00000001			# just bit 0 set
	40000000,00000000,00000000	# just bit 94 set
	000000ff,00000000		# bits 32-39 set
	00000000,000e3862		# bits 1, 5, 6, 11-13, 17-19 set

A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

	00000001,00000001,00010117

The first "1" is for bit 64, the second for bit 32, the third for bit 16,
the fourth for bit 8, the fifth for bit 4, and the "7" is for bits 2, 1,
and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value) {
		strncpy(buf, env_value, sizeof(buf) - 1);
		buf[sizeof(buf) - 1] = '\0';	/* strncpy may not terminate */
	} else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	/* walk the comma-separated words from least significant (rightmost)
	 * to most significant, setting CPU bits 32 at a time */
	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}
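
/*
 * Worked example for the parser above: given "000000ff,00000000", the first
 * pass reads the rightmost word (0x00000000, CPU bits 0-31) and sets
 * nothing; the second pass reads 0x000000ff with i == 32 and sets CPU bits
 * 32-39, matching the mask-format documentation above.
 */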
static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * Check if the app is bound to a cpu set that lies inside the
	 * device-local cpu set, and disable stalling if so: the union
	 * my_cpus | dev_local_cpus equals dev_local_cpus exactly when
	 * my_cpus is a subset of dev_local_cpus.
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}
static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	/* a negative loop count selects the adaptive stall mode */
	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}
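
/*
 * Example (illustrative): forcing CQ-poll stalling off for one run of an
 * application, regardless of the Sandy Bridge autodetection above:
 *
 *	$ MLX5_STALL_CQ_POLL=0 ./my_rdma_app
 *
 * Setting MLX5_STALL_NUM_LOOP to a negative value instead enables the
 * adaptive mode initialized at the end of mlx5_read_env().
 */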
static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}
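
/*
 * Worked example (assuming the usual MLX5_ADAPTER_PAGE_SIZE of 4096 and
 * MLX5_NUM_NON_FP_BFREGS_PER_UAR of 2): with a 4 KB system page,
 * uuars_in_page = 4096 / 4096 * 2 = 2, so the MLX5_DEF_TOT_UUARS default
 * dominates and is then aligned up to a multiple of 2.
 */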
static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "a+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}
static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}
static int get_always_bf(void)
{
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}
static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}
/* The library allocates an array of uuar contexts. The one at index zero
 * does not exercise the odd/even policy, so it can avoid a lock, but it may
 * not use Blue Flame. The upper low_lat_uuars entries can use Blue Flame
 * with no lock since each is assigned to exactly one QP. The rest can use
 * Blue Flame, but since they are shared they need a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}
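
/*
 * Worked example (illustrative numbers): with tot_uuars == 16 and
 * low_lat_uuars == 4, uuarn 0 is lock-free by construction, uuarns 1-23
 * are shared and take the lock, and uuarns from 24 up ((16 - 4) * 2) are
 * the low-latency ones dedicated to a single QP each, so they skip it.
 */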
static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}
static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* ibv_cmd_get_context fails in older kernels when passed a request
	 * length the kernel does not know about. To avoid breaking
	 * compatibility between new libmlx5 and older kernels, when
	 * ibv_cmd_get_context fails with the full request length we retry,
	 * reducing the requested size at each feature boundary. To avoid
	 * this in the future, the kernel check that requires fields unknown
	 * to it to be cleared will be removed. That in turn requires every
	 * new feature extending struct mlx5_alloc_ucontext to be accompanied
	 * by one or more fields in struct mlx5_alloc_ucontext_resp: if a
	 * zero response value can be interpreted as "feature not supported",
	 * that suffices to tell the library the kernel ignored the request,
	 * either because it is unaware of it or because it decided to do so.
	 * If zero is a valid response, a new field indicating whether the
	 * request was handled will be added.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}
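
/*
 * Per the comment above, the library detects that an older kernel ignored
 * part of the request by checking paired response fields: adjust_uar_info()
 * below does exactly this with log_uar_size/num_uars_per_page after
 * mlx5_init_context() requests MLX5_LIB_CAP_4K_UAR.
 */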
static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));

	return 0;
}
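
/*
 * Illustrative helper, not used by the driver (the name is hypothetical and
 * be64toh() from <sys/endian.h> is assumed available): once
 * mlx5_map_internal_clock() succeeds, the free-running HCA cycle counter
 * can be sampled from the mapped page, where the device exposes it as a
 * 64-bit big-endian value.
 */
static inline uint64_t mlx5_example_read_core_clock(struct mlx5_context *ctx)
{
	return be64toh(*(volatile uint64_t *)ctx->hca_core_clock);
}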
int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}
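
/*
 * Example application-side usage of the direct-verbs query (sketch; use()
 * stands in for application code):
 *
 *	struct mlx5dv_context dv = {};
 *
 *	dv.comp_mask = MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
 *	if (!mlx5dv_query_device(ibv_ctx, &dv) &&
 *	    (dv.comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION))
 *		use(dv.cqe_comp_caps);
 */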
static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}
static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}
static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}
static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}
int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}
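
/*
 * Example application-side usage (sketch; post_directly() stands in for
 * application code): exposing the raw SQ ring of an existing verbs QP
 * through the direct-verbs interface.
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_obj obj;
 *
 *	obj.qp.in = qp;
 *	obj.qp.out = &dv_qp;
 *	if (!mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP))
 *		post_directly(dv_qp.sq.buf, dv_qp.sq.wqe_cnt, dv_qp.sq.stride);
 */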
static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel: the 4K-UAR request was ignored */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}
static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context	       *context;
	struct mlx5_alloc_ucontext	req;
	struct mlx5_alloc_ucontext_resp resp;
	int				i;
	int				j;
	int				k;
	int				bfi;
	int				page_size;
	int				tot_uuars;
	int				low_lat_uuars;
	int				gross_uuars;
	off_t				offset;
	struct mlx5_device	       *mdev;
	struct verbs_context	       *v_ctx;
	struct ibv_port_attr		port_attr;
	struct ibv_device_attr_ex	device_attr;
	int				num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_UIDX_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	pthread_mutex_init(&context->db_list_mutex, NULL);

	/* map one system page of UAR registers per group of UARs */
	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_free_bf;
		}
	}

	/* hand out a Blue Flame register slice to each uuar context */
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
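
	/*
	 * Layout illustration for the loop above (hypothetical numbers):
	 * with 4 KB UARs (MLX5_ADAPTER_PAGE_SIZE) and num_uars_per_page ==
	 * 1, system page i holds UAR i; within it, bfreg k lives at
	 * MLX5_BF_OFFSET + k * bf_reg_size, and each slice is split into
	 * two halves (buf_size = bf_reg_size / 2) that the send path
	 * alternates between.
	 */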
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	mlx5_spinlock_init(&context->lock32);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	mlx5_spinlock_init(&context->hugetlb_lock);
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}
	return 0;

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}
static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}
static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};
static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char			value[8];
	struct mlx5_device     *dev;
	unsigned		vendor, device;
	int			i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}
static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}