/*
 * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: cm.c 3453 2005-09-15 21:43:21Z sean.hefty $
 */
#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdint.h>
#include <stddef.h>
#include <unistd.h>
#include <pthread.h>
#include <alloca.h>
#include <sys/socket.h>
#include <netinet/in.h>

#include <infiniband/endian.h>
#include <infiniband/byteswap.h>
#include <infiniband/driver.h>
#include <infiniband/marshall.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_cma_abi.h>
#ifdef INCLUDE_VALGRIND
#   include <valgrind/memcheck.h>
#   ifndef VALGRIND_MAKE_MEM_DEFINED
#       warning "Valgrind requested, but VALGRIND_MAKE_MEM_DEFINED undefined"
#   endif /* VALGRIND_MAKE_MEM_DEFINED */
#endif /* INCLUDE_VALGRIND */

/* Fall back to a no-op when not building with valgrind support. */
#ifndef VALGRIND_MAKE_MEM_DEFINED
#   define VALGRIND_MAKE_MEM_DEFINED(addr, len)
#endif /* VALGRIND_MAKE_MEM_DEFINED */
#define PFX "librdmacm: "
#if __BYTE_ORDER == __LITTLE_ENDIAN
static inline uint64_t htonll(uint64_t x) { return bswap_64(x); }
static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); }
#else
static inline uint64_t htonll(uint64_t x) { return x; }
static inline uint64_t ntohll(uint64_t x) { return x; }
#endif
/* Set errno and return -1 so callers can report failures uniformly. */
static inline int ERR(int err)
{
	errno = err;
	return -1;
}
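/*
 * Every request to the rdma_cm device is a ucma_abi_cmd_hdr followed
 * immediately by the command payload, issued to the event channel's
 * file descriptor in a single write().  For commands that return data,
 * cmd->response carries a user-space pointer that the kernel fills in.
 * The macros below build these messages on the stack with alloca().
 */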
#define CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, type, size) \
do {                                        \
	struct ucma_abi_cmd_hdr *hdr;       \
                                            \
	size = sizeof(*hdr) + sizeof(*cmd); \
	msg = alloca(size);                 \
	if (!msg)                           \
		return ERR(ENOMEM);         \
	hdr = msg;                          \
	cmd = msg + sizeof(*hdr);           \
	hdr->cmd = type;                    \
	hdr->in  = sizeof(*cmd);            \
	hdr->out = sizeof(*resp);           \
	memset(cmd, 0, sizeof(*cmd));       \
	resp = alloca(sizeof(*resp));       \
	if (!resp)                          \
		return ERR(ENOMEM);         \
	cmd->response = (uintptr_t) resp;   \
} while (0)
#define CMA_CREATE_MSG_CMD(msg, cmd, type, size) \
do {                                        \
	struct ucma_abi_cmd_hdr *hdr;       \
                                            \
	size = sizeof(*hdr) + sizeof(*cmd); \
	msg = alloca(size);                 \
	if (!msg)                           \
		return ERR(ENOMEM);         \
	hdr = msg;                          \
	cmd = msg + sizeof(*hdr);           \
	hdr->cmd = type;                    \
	hdr->in  = sizeof(*cmd);            \
	hdr->out = 0;                       \
	memset(cmd, 0, sizeof(*cmd));       \
} while (0)
struct cma_device {
	struct ibv_context *verbs;
	uint64_t	    guid;
	int		    port_cnt;
	uint8_t		    max_initiator_depth;
	uint8_t		    max_responder_resources;
};

struct cma_id_private {
	struct rdma_cm_id	id;
	struct cma_device	*cma_dev;
	int			events_completed;
	int			connect_error;
	pthread_cond_t		cond;
	pthread_mutex_t		mut;
	uint32_t		handle;
	struct cma_multicast	*mc_list;
};

struct cma_multicast {
	struct cma_multicast  *next;
	struct cma_id_private *id_priv;
	void		      *context;
	int		      events_completed;
	pthread_cond_t	      cond;
	uint32_t	      handle;
	union ibv_gid	      mgid;
	uint16_t	      mlid;
	struct sockaddr_storage addr;
};

struct cma_event {
	struct rdma_cm_event	event;
	uint8_t			private_data[RDMA_MAX_PRIVATE_DATA];
	struct cma_id_private	*id_priv;
	struct cma_multicast	*mc;
};
static struct cma_device *cma_dev_array;
static int cma_dev_cnt;
static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
#define container_of(ptr, type, field) \
	((type *) ((void *) ptr - offsetof(type, field)))
static void ucma_cleanup(void)
{
	if (!cma_dev_cnt)
		return;

	while (cma_dev_cnt)
		ibv_close_device(cma_dev_array[--cma_dev_cnt].verbs);
	free(cma_dev_array);
}
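/*
 * The kernel exports its rdma_cm ABI version through sysfs.  Read it
 * and make sure this library was built against a compatible range
 * before issuing any commands.
 */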
static int check_abi_version(void)
{
	char value[8];

	if ((ibv_read_sysfs_file(ibv_get_sysfs_path(),
				 "class/misc/rdma_cm/abi_version",
				 value, sizeof value) < 0) &&
	    (ibv_read_sysfs_file(ibv_get_sysfs_path(),
				 "class/infiniband_ucma/abi_version",
				 value, sizeof value) < 0)) {
		/*
		 * Older versions of Linux do not have class/misc.  To support
		 * backports, assume the most recent version of the ABI.  If
		 * we're wrong, we'll simply fail later when calling the ABI.
		 */
		fprintf(stderr, "librdmacm: couldn't read ABI version.\n");
		fprintf(stderr, "librdmacm: assuming: %d\n", abi_ver);
		return 0;
	}

	abi_ver = strtol(value, NULL, 10);
	if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION ||
	    abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) {
		fprintf(stderr, "librdmacm: kernel ABI version %d "
			"doesn't match library version %d.\n",
			abi_ver, RDMA_USER_CM_MAX_ABI_VERSION);
		return -1;
	}
	return 0;
}
static int ucma_init(void)
{
	struct ibv_device **dev_list = NULL;
	struct cma_device *cma_dev;
	struct ibv_device_attr attr;
	int i, ret, dev_cnt;

	pthread_mutex_lock(&mut);
	if (cma_dev_cnt) {
		pthread_mutex_unlock(&mut);
		return 0;
	}

	ret = check_abi_version();
	if (ret)
		goto err1;

	dev_list = ibv_get_device_list(&dev_cnt);
	if (!dev_list) {
		printf("CMA: unable to get RDMA device list\n");
		ret = ERR(ENODEV);
		goto err1;
	}

	cma_dev_array = malloc(sizeof *cma_dev * dev_cnt);
	if (!cma_dev_array) {
		ret = ERR(ENOMEM);
		goto err2;
	}

	for (i = 0; dev_list[i];) {
		cma_dev = &cma_dev_array[i];

		cma_dev->guid = ibv_get_device_guid(dev_list[i]);
		cma_dev->verbs = ibv_open_device(dev_list[i]);
		if (!cma_dev->verbs) {
			printf("CMA: unable to open RDMA device\n");
			ret = ERR(ENODEV);
			goto err3;
		}

		i++;
		ret = ibv_query_device(cma_dev->verbs, &attr);
		if (ret) {
			printf("CMA: unable to query RDMA device\n");
			goto err3;
		}

		cma_dev->port_cnt = attr.phys_port_cnt;
		cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
		cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
	}

	cma_dev_cnt = dev_cnt;
	pthread_mutex_unlock(&mut);
	ibv_free_device_list(dev_list);
	return 0;

err3:
	while (i--)
		ibv_close_device(cma_dev_array[i].verbs);
	free(cma_dev_array);
err2:
	ibv_free_device_list(dev_list);
err1:
	pthread_mutex_unlock(&mut);
	return ret;
}
struct ibv_context **rdma_get_devices(int *num_devices)
{
	struct ibv_context **devs = NULL;
	int i;

	if (!cma_dev_cnt && ucma_init())
		goto out;

	devs = malloc(sizeof *devs * (cma_dev_cnt + 1));
	if (!devs)
		goto out;

	for (i = 0; i < cma_dev_cnt; i++)
		devs[i] = cma_dev_array[i].verbs;
	devs[i] = NULL;
out:
	if (num_devices)
		*num_devices = devs ? cma_dev_cnt : 0;
	return devs;
}
void rdma_free_devices(struct ibv_context **list)
{
	free(list);
}
static void __attribute__((destructor)) rdma_cma_fini(void)
{
	ucma_cleanup();
}
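/*
 * A minimal sketch of the event-channel lifecycle this API expects
 * (illustrative only; error handling omitted):
 *
 *	struct rdma_event_channel *ec = rdma_create_event_channel();
 *	struct rdma_cm_id *id;
 *	rdma_create_id(ec, &id, NULL, RDMA_PS_TCP);
 *	...
 *	rdma_destroy_id(id);
 *	rdma_destroy_event_channel(ec);
 *
 * All asynchronous results for ids bound to the channel are retrieved
 * with rdma_get_cm_event() and released with rdma_ack_cm_event().
 */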
struct rdma_event_channel *rdma_create_event_channel(void)
{
	struct rdma_event_channel *channel;

	if (!cma_dev_cnt && ucma_init())
		return NULL;

	channel = malloc(sizeof *channel);
	if (!channel)
		return NULL;

	channel->fd = open("/dev/rdma_cm", O_RDWR);
	if (channel->fd < 0) {
		printf("CMA: unable to open /dev/rdma_cm\n");
		goto err;
	}
	return channel;
err:
	free(channel);
	return NULL;
}
void rdma_destroy_event_channel(struct rdma_event_channel *channel)
{
	close(channel->fd);
	free(channel);
}
static int ucma_get_device(struct cma_id_private *id_priv, uint64_t guid)
{
	struct cma_device *cma_dev;
	int i;

	for (i = 0; i < cma_dev_cnt; i++) {
		cma_dev = &cma_dev_array[i];
		if (cma_dev->guid == guid) {
			id_priv->cma_dev = cma_dev;
			id_priv->id.verbs = cma_dev->verbs;
			return 0;
		}
	}
	return ERR(ENODEV);
}
static void ucma_free_id(struct cma_id_private *id_priv)
{
	pthread_cond_destroy(&id_priv->cond);
	pthread_mutex_destroy(&id_priv->mut);
	if (id_priv->id.route.path_rec)
		free(id_priv->id.route.path_rec);
	free(id_priv);
}
static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
					    void *context,
					    enum rdma_port_space ps)
{
	struct cma_id_private *id_priv;

	id_priv = malloc(sizeof *id_priv);
	if (!id_priv)
		return NULL;

	memset(id_priv, 0, sizeof *id_priv);
	id_priv->id.context = context;
	id_priv->id.ps = ps;
	id_priv->id.channel = channel;
	pthread_mutex_init(&id_priv->mut, NULL);
	if (pthread_cond_init(&id_priv->cond, NULL))
		goto err;

	return id_priv;

err:	ucma_free_id(id_priv);
	return NULL;
}
int rdma_create_id(struct rdma_event_channel *channel,
		   struct rdma_cm_id **id, void *context,
		   enum rdma_port_space ps)
{
	struct ucma_abi_create_id_resp *resp;
	struct ucma_abi_create_id *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	ret = cma_dev_cnt ? 0 : ucma_init();
	if (ret)
		return ret;

	id_priv = ucma_alloc_id(channel, context, ps);
	if (!id_priv)
		return ERR(ENOMEM);

	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_CREATE_ID, size);
	cmd->uid = (uintptr_t) id_priv;
	cmd->ps = ps;

	ret = write(channel->fd, msg, size);
	if (ret != size) {
		ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1;
		goto err;
	}

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	id_priv->handle = resp->id;
	*id = &id_priv->id;
	return 0;

err:	ucma_free_id(id_priv);
	return ret;
}
static int ucma_destroy_kern_id(int fd, uint32_t handle)
{
	struct ucma_abi_destroy_id_resp *resp;
	struct ucma_abi_destroy_id *cmd;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_DESTROY_ID, size);
	cmd->id = handle;

	ret = write(fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	return resp->events_reported;
}
int rdma_destroy_id(struct rdma_cm_id *id)
{
	struct cma_id_private *id_priv;
	int ret;

	id_priv = container_of(id, struct cma_id_private, id);
	ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle);
	if (ret < 0)
		return ret;

	pthread_mutex_lock(&id_priv->mut);
	while (id_priv->events_completed < ret)
		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
	pthread_mutex_unlock(&id_priv->mut);

	ucma_free_id(id_priv);
	return 0;
}
static int ucma_addrlen(struct sockaddr *addr)
{
	if (!addr)
		return 0;

	switch (addr->sa_family) {
	case AF_INET:
		return sizeof(struct sockaddr_in);
	case AF_INET6:
		return sizeof(struct sockaddr_in6);
	default:
		return 0;
	}
}
static int ucma_query_route(struct rdma_cm_id *id)
{
	struct ucma_abi_query_route_resp *resp;
	struct ucma_abi_query_route *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size, i;

	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_QUERY_ROUTE, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	if (resp->num_paths) {
		id->route.path_rec = malloc(sizeof *id->route.path_rec *
					    resp->num_paths);
		if (!id->route.path_rec)
			return ERR(ENOMEM);

		id->route.num_paths = resp->num_paths;
		for (i = 0; i < resp->num_paths; i++)
			ibv_copy_path_rec_from_kern(&id->route.path_rec[i],
						    &resp->ib_route[i]);
	}

	memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp->ib_route[0].sgid,
	       sizeof id->route.addr.addr.ibaddr.sgid);
	memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp->ib_route[0].dgid,
	       sizeof id->route.addr.addr.ibaddr.dgid);
	id->route.addr.addr.ibaddr.pkey = resp->ib_route[0].pkey;
	memcpy(&id->route.addr.src_addr, &resp->src_addr,
	       sizeof resp->src_addr);
	memcpy(&id->route.addr.dst_addr, &resp->dst_addr,
	       sizeof resp->dst_addr);

	if (!id_priv->cma_dev && resp->node_guid) {
		ret = ucma_get_device(id_priv, resp->node_guid);
		if (ret)
			return ret;
		id_priv->id.port_num = resp->port_num;
	}

	return 0;
}
int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
	struct ucma_abi_bind_addr *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size, addrlen;

	addrlen = ucma_addrlen(addr);
	if (!addrlen)
		return ERR(EINVAL);

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_BIND_ADDR, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	memcpy(&cmd->addr, addr, addrlen);

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return ucma_query_route(id);
}
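/*
 * Typical active-side sequence (a sketch; each step completes with the
 * matching event on the id's channel):
 *
 *	rdma_resolve_addr()   -> RDMA_CM_EVENT_ADDR_RESOLVED
 *	rdma_resolve_route()  -> RDMA_CM_EVENT_ROUTE_RESOLVED
 *	rdma_create_qp()
 *	rdma_connect()        -> RDMA_CM_EVENT_ESTABLISHED
 */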
int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
		      struct sockaddr *dst_addr, int timeout_ms)
{
	struct ucma_abi_resolve_addr *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size, daddrlen;

	daddrlen = ucma_addrlen(dst_addr);
	if (!daddrlen)
		return ERR(EINVAL);

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_RESOLVE_ADDR, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	if (src_addr)
		memcpy(&cmd->src_addr, src_addr, ucma_addrlen(src_addr));
	memcpy(&cmd->dst_addr, dst_addr, daddrlen);
	cmd->timeout_ms = timeout_ms;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	memcpy(&id->route.addr.dst_addr, dst_addr, daddrlen);
	return 0;
}
int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
{
	struct ucma_abi_resolve_route *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_RESOLVE_ROUTE, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	cmd->timeout_ms = timeout_ms;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return 0;
}
static int ucma_is_ud_ps(enum rdma_port_space ps)
{
	return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
}
static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr,
			     int *qp_attr_mask)
{
	struct ucma_abi_init_qp_attr *cmd;
	struct ibv_kern_qp_attr *resp;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_INIT_QP_ATTR, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	cmd->qp_state = qp_attr->qp_state;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	ibv_copy_qp_attr_from_kern(qp_attr, resp);
	*qp_attr_mask = resp->qp_attr_mask;
	return 0;
}
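/*
 * Connected QPs must be walked through the INIT -> RTR -> RTS state
 * machine.  The attributes for each transition depend on connection
 * state the library does not track, so rdma_init_qp_attr() asks the
 * kernel to fill them in and the helpers below apply them.
 */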
static int ucma_modify_qp_rtr(struct rdma_cm_id *id,
			      struct rdma_conn_param *conn_param)
{
	struct ibv_qp_attr qp_attr;
	int qp_attr_mask, ret;

	if (!id->qp)
		return ERR(EINVAL);

	/* Need to update QP attributes from default values. */
	qp_attr.qp_state = IBV_QPS_INIT;
	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
	if (ret)
		return ret;

	ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
	if (ret)
		return ret;

	qp_attr.qp_state = IBV_QPS_RTR;
	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
	if (ret)
		return ret;

	if (conn_param)
		qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
	return ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
}
static int ucma_modify_qp_rts(struct rdma_cm_id *id)
{
	struct ibv_qp_attr qp_attr;
	int qp_attr_mask, ret;

	qp_attr.qp_state = IBV_QPS_RTS;
	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
	if (ret)
		return ret;

	return ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
}
static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
{
	struct ibv_qp_attr qp_attr;

	if (!id->qp)
		return 0;

	qp_attr.qp_state = IBV_QPS_SQD;
	return ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE);
}
static int ucma_modify_qp_err(struct rdma_cm_id *id)
{
	struct ibv_qp_attr qp_attr;

	if (!id->qp)
		return 0;

	qp_attr.qp_state = IBV_QPS_ERR;
	return ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE);
}
static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
			  uint16_t pkey, uint16_t *pkey_index)
{
	int ret, i;
	uint16_t chk_pkey;

	for (i = 0, ret = 0; !ret; i++) {
		ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey);
		if (!ret && pkey == chk_pkey) {
			*pkey_index = (uint16_t) i;
			return 0;
		}
	}
	return ERR(EINVAL);
}
static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
{
	struct ibv_qp_attr qp_attr;
	int ret;

	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
			     id_priv->id.route.addr.addr.ibaddr.pkey,
			     &qp_attr.pkey_index);
	if (ret)
		return ret;

	qp_attr.port_num = id_priv->id.port_num;
	qp_attr.qp_state = IBV_QPS_INIT;
	qp_attr.qp_access_flags = 0;

	return ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
					   IBV_QP_PKEY_INDEX | IBV_QP_PORT);
}
static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
{
	struct ibv_qp_attr qp_attr;
	int qp_attr_mask, ret;

	if (abi_ver == 3)
		return ucma_init_conn_qp3(id_priv, qp);

	qp_attr.qp_state = IBV_QPS_INIT;
	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
	if (ret)
		return ret;

	return ibv_modify_qp(qp, &qp_attr, qp_attr_mask);
}
static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
{
	struct ibv_qp_attr qp_attr;
	int ret;

	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
			     id_priv->id.route.addr.addr.ibaddr.pkey,
			     &qp_attr.pkey_index);
	if (ret)
		return ret;

	qp_attr.port_num = id_priv->id.port_num;
	qp_attr.qp_state = IBV_QPS_INIT;
	qp_attr.qkey = RDMA_UDP_QKEY;

	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY |
					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
	if (ret)
		return ret;

	qp_attr.qp_state = IBV_QPS_RTR;
	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
	if (ret)
		return ret;

	qp_attr.qp_state = IBV_QPS_RTS;
	qp_attr.sq_psn = 0;
	return ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
}
static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
{
	struct ibv_qp_attr qp_attr;
	int qp_attr_mask, ret;

	if (abi_ver == 3)
		return ucma_init_ud_qp3(id_priv, qp);

	qp_attr.qp_state = IBV_QPS_INIT;
	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
	if (ret)
		return ret;

	ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret)
		return ret;

	qp_attr.qp_state = IBV_QPS_RTR;
	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
	if (ret)
		return ret;

	qp_attr.qp_state = IBV_QPS_RTS;
	qp_attr.sq_psn = 0;
	return ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
}
int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd,
		   struct ibv_qp_init_attr *qp_init_attr)
{
	struct cma_id_private *id_priv;
	struct ibv_qp *qp;
	int ret;

	id_priv = container_of(id, struct cma_id_private, id);
	if (id->verbs != pd->context)
		return ERR(EINVAL);

	qp = ibv_create_qp(pd, qp_init_attr);
	if (!qp)
		return ERR(ENOMEM);

	if (ucma_is_ud_ps(id->ps))
		ret = ucma_init_ud_qp(id_priv, qp);
	else
		ret = ucma_init_conn_qp(id_priv, qp);
	if (ret)
		goto err;

	id->qp = qp;
	return 0;
err:
	ibv_destroy_qp(qp);
	return ret;
}
void rdma_destroy_qp(struct rdma_cm_id *id)
{
	ibv_destroy_qp(id->qp);
}
static int ucma_valid_param(struct cma_id_private *id_priv,
			    struct rdma_conn_param *conn_param)
{
	if (id_priv->id.ps != RDMA_PS_TCP)
		return 0;

	if ((conn_param->responder_resources >
	     id_priv->cma_dev->max_responder_resources) ||
	    (conn_param->initiator_depth >
	     id_priv->cma_dev->max_initiator_depth))
		return ERR(EINVAL);

	return 0;
}
static void ucma_copy_conn_param_to_kern(struct ucma_abi_conn_param *dst,
					 struct rdma_conn_param *src,
					 uint32_t qp_num, uint8_t srq)
{
	dst->qp_num = qp_num;
	dst->srq = srq;
	dst->responder_resources = src->responder_resources;
	dst->initiator_depth = src->initiator_depth;
	dst->flow_control = src->flow_control;
	dst->retry_count = src->retry_count;
	dst->rnr_retry_count = src->rnr_retry_count;
	dst->valid = 1;

	if (src->private_data && src->private_data_len) {
		memcpy(dst->private_data, src->private_data,
		       src->private_data_len);
		dst->private_data_len = src->private_data_len;
	}
}
int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
	struct ucma_abi_connect *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	id_priv = container_of(id, struct cma_id_private, id);
	ret = ucma_valid_param(id_priv, conn_param);
	if (ret)
		return ret;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_CONNECT, size);
	cmd->id = id_priv->handle;
	if (id->qp)
		ucma_copy_conn_param_to_kern(&cmd->conn_param, conn_param,
					     id->qp->qp_num,
					     (id->qp->srq != NULL));
	else
		ucma_copy_conn_param_to_kern(&cmd->conn_param, conn_param,
					     conn_param->qp_num,
					     conn_param->srq);

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return 0;
}
int rdma_listen(struct rdma_cm_id *id, int backlog)
{
	struct ucma_abi_listen *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_LISTEN, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	cmd->backlog = backlog;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return ucma_query_route(id);
}
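/*
 * Passive-side sketch: a listening id reports each incoming request as
 * RDMA_CM_EVENT_CONNECT_REQUEST carrying a newly created id; create a
 * QP on that id and call rdma_accept() (or rdma_reject()) on it.
 */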
int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
	struct ucma_abi_accept *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	id_priv = container_of(id, struct cma_id_private, id);
	ret = ucma_valid_param(id_priv, conn_param);
	if (ret)
		return ret;

	if (!ucma_is_ud_ps(id->ps)) {
		ret = ucma_modify_qp_rtr(id, conn_param);
		if (ret)
			return ret;
	}

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_ACCEPT, size);
	cmd->id = id_priv->handle;
	cmd->uid = (uintptr_t) id_priv;
	if (id->qp)
		ucma_copy_conn_param_to_kern(&cmd->conn_param, conn_param,
					     id->qp->qp_num,
					     (id->qp->srq != NULL));
	else
		ucma_copy_conn_param_to_kern(&cmd->conn_param, conn_param,
					     conn_param->qp_num,
					     conn_param->srq);

	ret = write(id->channel->fd, msg, size);
	if (ret != size) {
		ucma_modify_qp_err(id);
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;
	}

	return 0;
}
int rdma_reject(struct rdma_cm_id *id, const void *private_data,
		uint8_t private_data_len)
{
	struct ucma_abi_reject *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_REJECT, size);

	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	if (private_data && private_data_len) {
		memcpy(cmd->private_data, private_data, private_data_len);
		cmd->private_data_len = private_data_len;
	} else
		cmd->private_data_len = 0;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return 0;
}
int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event)
{
	struct ucma_abi_notify *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_NOTIFY, size);

	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	cmd->event = event;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return 0;
}
int rdma_disconnect(struct rdma_cm_id *id)
{
	struct ucma_abi_disconnect *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	switch (id->verbs->device->transport_type) {
	case IBV_TRANSPORT_IB:
		ret = ucma_modify_qp_err(id);
		break;
	case IBV_TRANSPORT_IWARP:
		ret = ucma_modify_qp_sqd(id);
		break;
	default:
		ret = ERR(EINVAL);
	}
	if (ret)
		return ret;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_DISCONNECT, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return 0;
}
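/*
 * Multicast sketch: rdma_join_multicast() completes with
 * RDMA_CM_EVENT_MULTICAST_JOIN, at which point the group's mgid/mlid
 * are known and any QP on the id is attached to the group.
 * rdma_leave_multicast() detaches the QP and blocks until all events
 * reported for the membership have been acked.
 */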
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
			void *context)
{
	struct ucma_abi_join_mcast *cmd;
	struct ucma_abi_create_id_resp *resp;
	struct cma_id_private *id_priv;
	struct cma_multicast *mc, **pos;
	void *msg;
	int ret, size, addrlen;

	id_priv = container_of(id, struct cma_id_private, id);
	addrlen = ucma_addrlen(addr);
	if (!addrlen)
		return ERR(EINVAL);

	mc = malloc(sizeof *mc);
	if (!mc)
		return ERR(ENOMEM);

	memset(mc, 0, sizeof *mc);
	mc->context = context;
	mc->id_priv = id_priv;
	memcpy(&mc->addr, addr, addrlen);
	if (pthread_cond_init(&mc->cond, NULL)) {
		ret = -1;
		goto err1;
	}

	pthread_mutex_lock(&id_priv->mut);
	mc->next = id_priv->mc_list;
	id_priv->mc_list = mc;
	pthread_mutex_unlock(&id_priv->mut);

	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_JOIN_MCAST, size);
	cmd->id = id_priv->handle;
	memcpy(&cmd->addr, addr, addrlen);
	cmd->uid = (uintptr_t) mc;

	ret = write(id->channel->fd, msg, size);
	if (ret != size) {
		ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1;
		goto err2;
	}

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	mc->handle = resp->id;
	return 0;

err2:
	pthread_mutex_lock(&id_priv->mut);
	for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next)
		;
	*pos = mc->next;
	pthread_mutex_unlock(&id_priv->mut);
err1:
	free(mc);
	return ret;
}
int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
{
	struct ucma_abi_destroy_id *cmd;
	struct ucma_abi_destroy_id_resp *resp;
	struct cma_id_private *id_priv;
	struct cma_multicast *mc, **pos;
	void *msg;
	int ret, size, addrlen;

	addrlen = ucma_addrlen(addr);
	if (!addrlen)
		return ERR(EINVAL);

	id_priv = container_of(id, struct cma_id_private, id);
	pthread_mutex_lock(&id_priv->mut);
	for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next)
		if (!memcmp(&(*pos)->addr, addr, addrlen))
			break;

	mc = *pos;
	if (*pos)
		*pos = mc->next;
	pthread_mutex_unlock(&id_priv->mut);
	if (!mc)
		return ERR(EADDRNOTAVAIL);

	if (id->qp)
		ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid);

	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_LEAVE_MCAST, size);
	cmd->id = mc->handle;

	ret = write(id->channel->fd, msg, size);
	if (ret != size) {
		ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1;
		goto free;
	}

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	pthread_mutex_lock(&id_priv->mut);
	while (mc->events_completed < resp->events_reported)
		pthread_cond_wait(&mc->cond, &id_priv->mut);
	pthread_mutex_unlock(&id_priv->mut);

	ret = 0;
free:
	free(mc);
	return ret;
}
static void ucma_complete_event(struct cma_id_private *id_priv)
{
	pthread_mutex_lock(&id_priv->mut);
	id_priv->events_completed++;
	pthread_cond_signal(&id_priv->cond);
	pthread_mutex_unlock(&id_priv->mut);
}
static void ucma_complete_mc_event(struct cma_multicast *mc)
{
	pthread_mutex_lock(&mc->id_priv->mut);
	mc->events_completed++;
	pthread_cond_signal(&mc->cond);
	mc->id_priv->events_completed++;
	pthread_cond_signal(&mc->id_priv->cond);
	pthread_mutex_unlock(&mc->id_priv->mut);
}
int rdma_ack_cm_event(struct rdma_cm_event *event)
{
	struct cma_event *evt;

	if (!event)
		return ERR(EINVAL);

	evt = container_of(event, struct cma_event, event);

	if (evt->mc)
		ucma_complete_mc_event(evt->mc);
	else
		ucma_complete_event(evt->id_priv);
	free(evt);
	return 0;
}
static int ucma_process_conn_req(struct cma_event *evt,
				 uint32_t handle)
{
	struct cma_id_private *id_priv;
	int ret;

	id_priv = ucma_alloc_id(evt->id_priv->id.channel,
				evt->id_priv->id.context, evt->id_priv->id.ps);
	if (!id_priv) {
		ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle);
		ret = ERR(ENOMEM);
		goto err;
	}

	evt->event.listen_id = &evt->id_priv->id;
	evt->event.id = &id_priv->id;
	id_priv->handle = handle;

	ret = ucma_query_route(&id_priv->id);
	if (ret) {
		rdma_destroy_id(&id_priv->id);
		goto err;
	}

	return 0;
err:
	ucma_complete_event(evt->id_priv);
	return ret;
}
static int ucma_process_conn_resp(struct cma_id_private *id_priv)
{
	struct ucma_abi_accept *cmd;
	void *msg;
	int ret, size;

	ret = ucma_modify_qp_rtr(&id_priv->id, NULL);
	if (ret)
		goto err;

	ret = ucma_modify_qp_rts(&id_priv->id);
	if (ret)
		goto err;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_ACCEPT, size);
	cmd->id = id_priv->handle;

	ret = write(id_priv->id.channel->fd, msg, size);
	if (ret != size) {
		ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1;
		goto err;
	}

	return 0;
err:
	ucma_modify_qp_err(&id_priv->id);
	return ret;
}
static int ucma_process_establish(struct rdma_cm_id *id)
{
	int ret;

	ret = ucma_modify_qp_rts(id);
	if (ret)
		ucma_modify_qp_err(id);

	return ret;
}
static int ucma_process_join(struct cma_event *evt)
{
	evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid;
	evt->mc->mlid = evt->event.param.ud.ah_attr.dlid;

	if (!evt->id_priv->id.qp)
		return 0;

	return ibv_attach_mcast(evt->id_priv->id.qp, &evt->mc->mgid,
				evt->mc->mlid);
}
static void ucma_copy_conn_event(struct cma_event *event,
				 struct ucma_abi_conn_param *src)
{
	struct rdma_conn_param *dst = &event->event.param.conn;

	dst->private_data_len = src->private_data_len;
	if (src->private_data_len) {
		dst->private_data = &event->private_data;
		memcpy(&event->private_data, src->private_data,
		       src->private_data_len);
	}

	dst->responder_resources = src->responder_resources;
	dst->initiator_depth = src->initiator_depth;
	dst->flow_control = src->flow_control;
	dst->retry_count = src->retry_count;
	dst->rnr_retry_count = src->rnr_retry_count;
	dst->srq = src->srq;
	dst->qp_num = src->qp_num;
}
static void ucma_copy_ud_event(struct cma_event *event,
			       struct ucma_abi_ud_param *src)
{
	struct rdma_ud_param *dst = &event->event.param.ud;

	dst->private_data_len = src->private_data_len;
	if (src->private_data_len) {
		dst->private_data = &event->private_data;
		memcpy(&event->private_data, src->private_data,
		       src->private_data_len);
	}

	ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr);
	dst->qp_num = src->qp_num;
	dst->qkey = src->qkey;
}
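/*
 * A minimal event loop (sketch; error handling omitted):
 *
 *	struct rdma_cm_event *event;
 *	while (!rdma_get_cm_event(channel, &event)) {
 *		switch (event->event) {
 *		... handle each rdma_cm_event_type ...
 *		}
 *		rdma_ack_cm_event(event);
 *	}
 *
 * Acking matters: rdma_destroy_id(), rdma_leave_multicast() and
 * rdma_migrate_id() block until events_completed catches up with the
 * kernel's events_reported count.
 */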
int rdma_get_cm_event(struct rdma_event_channel *channel,
		      struct rdma_cm_event **event)
{
	struct ucma_abi_event_resp *resp;
	struct ucma_abi_get_event *cmd;
	struct cma_event *evt;
	void *msg;
	int ret, size;

	ret = cma_dev_cnt ? 0 : ucma_init();
	if (ret)
		return ret;

	if (!event)
		return ERR(EINVAL);

	evt = malloc(sizeof *evt);
	if (!evt)
		return ERR(ENOMEM);

retry:
	memset(evt, 0, sizeof *evt);
	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_GET_EVENT, size);
	ret = write(channel->fd, msg, size);
	if (ret != size) {
		free(evt);
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;
	}

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	evt->event.event = resp->event;
	evt->id_priv = (void *) (uintptr_t) resp->uid;
	evt->event.id = &evt->id_priv->id;
	evt->event.status = resp->status;

	switch (resp->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		evt->event.status = ucma_query_route(&evt->id_priv->id);
		if (evt->event.status)
			evt->event.event = RDMA_CM_EVENT_ADDR_ERROR;
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		evt->event.status = ucma_query_route(&evt->id_priv->id);
		if (evt->event.status)
			evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
		break;
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		evt->id_priv = (void *) (uintptr_t) resp->uid;
		if (ucma_is_ud_ps(evt->id_priv->id.ps))
			ucma_copy_ud_event(evt, &resp->param.ud);
		else
			ucma_copy_conn_event(evt, &resp->param.conn);

		ret = ucma_process_conn_req(evt, resp->id);
		if (ret)
			goto retry;
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		ucma_copy_conn_event(evt, &resp->param.conn);
		evt->event.status = ucma_process_conn_resp(evt->id_priv);
		if (!evt->event.status)
			evt->event.event = RDMA_CM_EVENT_ESTABLISHED;
		else {
			evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
			evt->id_priv->connect_error = 1;
		}
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		if (ucma_is_ud_ps(evt->id_priv->id.ps)) {
			ucma_copy_ud_event(evt, &resp->param.ud);
			break;
		}

		ucma_copy_conn_event(evt, &resp->param.conn);
		evt->event.status = ucma_process_establish(&evt->id_priv->id);
		if (evt->event.status) {
			evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
			evt->id_priv->connect_error = 1;
		}
		break;
	case RDMA_CM_EVENT_REJECTED:
		if (evt->id_priv->connect_error) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
		ucma_copy_conn_event(evt, &resp->param.conn);
		ucma_modify_qp_err(evt->event.id);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		if (evt->id_priv->connect_error) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
		ucma_copy_conn_event(evt, &resp->param.conn);
		break;
	case RDMA_CM_EVENT_MULTICAST_JOIN:
		evt->mc = (void *) (uintptr_t) resp->uid;
		evt->id_priv = evt->mc->id_priv;
		evt->event.id = &evt->id_priv->id;
		ucma_copy_ud_event(evt, &resp->param.ud);
		evt->event.param.ud.private_data = evt->mc->context;
		evt->event.status = ucma_process_join(evt);
		if (evt->event.status)
			evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
		break;
	case RDMA_CM_EVENT_MULTICAST_ERROR:
		evt->mc = (void *) (uintptr_t) resp->uid;
		evt->id_priv = evt->mc->id_priv;
		evt->event.id = &evt->id_priv->id;
		evt->event.param.ud.private_data = evt->mc->context;
		break;
	default:
		evt->id_priv = (void *) (uintptr_t) resp->uid;
		evt->event.id = &evt->id_priv->id;
		evt->event.status = resp->status;
		if (ucma_is_ud_ps(evt->id_priv->id.ps))
			ucma_copy_ud_event(evt, &resp->param.ud);
		else
			ucma_copy_conn_event(evt, &resp->param.conn);
		break;
	}

	*event = &evt->event;
	return 0;
}
const char *rdma_event_str(enum rdma_cm_event_type event)
{
	switch (event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		return "RDMA_CM_EVENT_ADDR_RESOLVED";
	case RDMA_CM_EVENT_ADDR_ERROR:
		return "RDMA_CM_EVENT_ADDR_ERROR";
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		return "RDMA_CM_EVENT_ROUTE_RESOLVED";
	case RDMA_CM_EVENT_ROUTE_ERROR:
		return "RDMA_CM_EVENT_ROUTE_ERROR";
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		return "RDMA_CM_EVENT_CONNECT_REQUEST";
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		return "RDMA_CM_EVENT_CONNECT_RESPONSE";
	case RDMA_CM_EVENT_CONNECT_ERROR:
		return "RDMA_CM_EVENT_CONNECT_ERROR";
	case RDMA_CM_EVENT_UNREACHABLE:
		return "RDMA_CM_EVENT_UNREACHABLE";
	case RDMA_CM_EVENT_REJECTED:
		return "RDMA_CM_EVENT_REJECTED";
	case RDMA_CM_EVENT_ESTABLISHED:
		return "RDMA_CM_EVENT_ESTABLISHED";
	case RDMA_CM_EVENT_DISCONNECTED:
		return "RDMA_CM_EVENT_DISCONNECTED";
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		return "RDMA_CM_EVENT_DEVICE_REMOVAL";
	case RDMA_CM_EVENT_MULTICAST_JOIN:
		return "RDMA_CM_EVENT_MULTICAST_JOIN";
	case RDMA_CM_EVENT_MULTICAST_ERROR:
		return "RDMA_CM_EVENT_MULTICAST_ERROR";
	case RDMA_CM_EVENT_ADDR_CHANGE:
		return "RDMA_CM_EVENT_ADDR_CHANGE";
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		return "RDMA_CM_EVENT_TIMEWAIT_EXIT";
	default:
		return "UNKNOWN EVENT";
	}
}
int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
		    void *optval, size_t optlen)
{
	struct ucma_abi_set_option *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_SET_OPTION, size);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd->id = id_priv->handle;
	cmd->optval = (uintptr_t) optval;
	cmd->level = level;
	cmd->optname = optname;
	cmd->optlen = optlen;

	ret = write(id->channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	return 0;
}
int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
{
	struct ucma_abi_migrate_resp *resp;
	struct ucma_abi_migrate_id *cmd;
	struct cma_id_private *id_priv;
	void *msg;
	int ret, size;

	id_priv = container_of(id, struct cma_id_private, id);
	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_MIGRATE_ID, size);
	cmd->id = id_priv->handle;
	cmd->fd = id->channel->fd;

	ret = write(channel->fd, msg, size);
	if (ret != size)
		return (ret >= 0) ? ERR(ECONNREFUSED) : -1;

	VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp);

	/*
	 * Eventually if we want to support migrating channels while events are
	 * being processed on the current channel, we need to block here while
	 * there are any outstanding events on the current channel for this id
	 * to prevent the user from processing events for this id on the old
	 * channel after this call returns.
	 */
	pthread_mutex_lock(&id_priv->mut);
	id->channel = channel;
	while (id_priv->events_completed < resp->events_reported)
		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
	pthread_mutex_unlock(&id_priv->mut);

	return 0;
}