[FreeBSD/FreeBSD.git] contrib/ofed/librdmacm/cma.c
1 /*
2  * Copyright (c) 2005-2014 Intel Corporation.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <config.h>
34
35 #include <stdlib.h>
36 #include <string.h>
37 #include <glob.h>
38 #include <stdio.h>
39 #include <fcntl.h>
40 #include <errno.h>
41 #include <stdint.h>
42 #include <poll.h>
43 #include <unistd.h>
44 #include <pthread.h>
45 #include <infiniband/endian.h>
46 #include <stddef.h>
47 #include <netdb.h>
48 #include <syslog.h>
49 #include <limits.h>
50
51 #include "cma.h"
52 #include "indexer.h"
53 #include <infiniband/driver.h>
54 #include <infiniband/marshall.h>
55 #include <rdma/rdma_cma.h>
56 #include <rdma/rdma_cma_abi.h>
57 #include <rdma/rdma_verbs.h>
58 #include <infiniband/ib.h>
59
60 #define CMA_INIT_CMD(req, req_size, op)         \
61 do {                                            \
62         memset(req, 0, req_size);               \
63         (req)->cmd = UCMA_CMD_##op;             \
64         (req)->in  = req_size - sizeof(struct ucma_abi_cmd_hdr); \
65 } while (0)
66
67 #define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \
68 do {                                            \
69         CMA_INIT_CMD(req, req_size, op);        \
70         (req)->out = resp_size;                 \
71         (req)->response = (uintptr_t) (resp);   \
72 } while (0)
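/*
 * Rough sketch of the command pattern used throughout this file: each call
 * builds a ucma_abi_* command with CMA_INIT_CMD()/CMA_INIT_CMD_RESP(),
 * write()s it to the event channel fd, and treats anything but a full write
 * as failure.  'in' is the payload size after the common header; 'out' and
 * 'response' describe the buffer the kernel fills in.  For illustration only
 * (hypothetical caller, not part of this file):
 *
 *	struct ucma_abi_create_id_resp resp;
 *	struct ucma_abi_create_id cmd;
 *
 *	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp);
 *	if (write(fd, &cmd, sizeof cmd) != sizeof cmd)
 *		;	/* short write: report an error to the caller */
 */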
73
74 struct cma_port {
75         uint8_t                 link_layer;
76 };
77
78 struct cma_device {
79         struct ibv_context *verbs;
80         struct ibv_pd      *pd;
81         struct ibv_xrcd    *xrcd;
82         struct cma_port    *port;
83         __be64              guid;
84         int                 port_cnt;
85         int                 refcnt;
86         int                 max_qpsize;
87         uint8_t             max_initiator_depth;
88         uint8_t             max_responder_resources;
89 };
90
91 struct cma_id_private {
92         struct rdma_cm_id       id;
93         struct cma_device       *cma_dev;
94         void                    *connect;
95         size_t                  connect_len;
96         int                     events_completed;
97         int                     connect_error;
98         int                     sync;
99         pthread_cond_t          cond;
100         pthread_mutex_t         mut;
101         uint32_t                handle;
102         struct cma_multicast    *mc_list;
103         struct ibv_qp_init_attr *qp_init_attr;
104         uint8_t                 initiator_depth;
105         uint8_t                 responder_resources;
106 };
107
108 struct cma_multicast {
109         struct cma_multicast  *next;
110         struct cma_id_private *id_priv;
111         void            *context;
112         int             events_completed;
113         pthread_cond_t  cond;
114         uint32_t        handle;
115         union ibv_gid   mgid;
116         uint16_t        mlid;
117         struct sockaddr_storage addr;
118 };
119
120 struct cma_event {
121         struct rdma_cm_event    event;
122         uint8_t                 private_data[RDMA_MAX_PRIVATE_DATA];
123         struct cma_id_private   *id_priv;
124         struct cma_multicast    *mc;
125 };
126
127 static struct cma_device *cma_dev_array;
128 static int cma_dev_cnt;
129 static int cma_init_cnt;
130 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
131 static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
132 int af_ib_support;
133 static struct index_map ucma_idm;
134 static fastlock_t idm_lock;
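/*
 * Global state above: cma_dev_array has one slot per verbs device found by
 * ucma_init() and is opened lazily by ucma_init_device(); ucma_idm maps
 * kernel-assigned id handles back to their cma_id_private (see
 * ucma_insert_id()/ucma_lookup_id()).  'mut' serializes device setup,
 * 'idm_lock' protects the index map.
 */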
135
136 static int check_abi_version(void)
137 {
138         char value[8];
139
140         if ((ibv_read_sysfs_file(ibv_get_sysfs_path(),
141                                  "class/misc/rdma_cm/abi_version",
142                                  value, sizeof value) < 0) &&
143             (ibv_read_sysfs_file(ibv_get_sysfs_path(),
144                                  "class/infiniband_ucma/abi_version",
145                                  value, sizeof value) < 0)) {
146                 /*
147                  * Older versions of Linux do not have class/misc.  To support
148                  * backports, assume the most recent version of the ABI.  If
149                  * we're wrong, we'll simply fail later when calling the ABI.
150                  */
151                 return 0;
152         }
153
154         abi_ver = strtol(value, NULL, 10);
155         if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION ||
156             abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) {
157                 return -1;
158         }
159         return 0;
160 }
161
162 /*
163  * This function is called while holding the mutex lock.
164  * cma_dev_cnt must be set before calling this function to
165  * ensure that the lock is not acquired recursively.
166  */
167 static void ucma_set_af_ib_support(void)
168 {
169         struct rdma_cm_id *id;
170         struct sockaddr_ib sib;
171         int ret;
172
173         ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB);
174         if (ret)
175                 return;
176
177         memset(&sib, 0, sizeof sib);
178         sib.sib_family = AF_IB;
179         sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP);
180         sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK);
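	/*
	 * Probe: temporarily claim AF_IB support so that ucma_addrlen() and
	 * rdma_bind_addr() below take the AF_IB path; the flag is kept only
	 * if the kernel accepts the bind.
	 */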
181         af_ib_support = 1;
182         ret = rdma_bind_addr(id, (struct sockaddr *) &sib);
183         af_ib_support = !ret;
184
185         rdma_destroy_id(id);
186 }
187
188 int ucma_init(void)
189 {
190         struct ibv_device **dev_list = NULL;
191         int i, ret, dev_cnt;
192
193         /* Quick check without lock to see if we're already initialized */
194         if (cma_dev_cnt)
195                 return 0;
196
197         pthread_mutex_lock(&mut);
198         if (cma_dev_cnt) {
199                 pthread_mutex_unlock(&mut);
200                 return 0;
201         }
202
203         fastlock_init(&idm_lock);
204         ret = check_abi_version();
205         if (ret)
206                 goto err1;
207
208         dev_list = ibv_get_device_list(&dev_cnt);
209         if (!dev_list) {
210                 ret = ERR(ENODEV);
211                 goto err1;
212         }
213
214         if (!dev_cnt) {
215                 ret = ERR(ENODEV);
216                 goto err2;
217         }
218
219         cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array));
220         if (!cma_dev_array) {
221                 ret = ERR(ENOMEM);
222                 goto err2;
223         }
224
225         for (i = 0; dev_list[i]; i++)
226                 cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]);
227
228         cma_dev_cnt = dev_cnt;
229         ucma_set_af_ib_support();
230         pthread_mutex_unlock(&mut);
231         ibv_free_device_list(dev_list);
232         return 0;
233
234 err2:
235         ibv_free_device_list(dev_list);
236 err1:
237         fastlock_destroy(&idm_lock);
238         pthread_mutex_unlock(&mut);
239         return ret;
240 }
241
242 static struct ibv_context *ucma_open_device(__be64 guid)
243 {
244         struct ibv_device **dev_list;
245         struct ibv_context *verbs = NULL;
246         int i;
247
248         dev_list = ibv_get_device_list(NULL);
249         if (!dev_list) {
250                 return NULL;
251         }
252
253         for (i = 0; dev_list[i]; i++) {
254                 if (ibv_get_device_guid(dev_list[i]) == guid) {
255                         verbs = ibv_open_device(dev_list[i]);
256                         break;
257                 }
258         }
259
260         ibv_free_device_list(dev_list);
261         return verbs;
262 }
263
264 static int ucma_init_device(struct cma_device *cma_dev)
265 {
266         struct ibv_port_attr port_attr;
267         struct ibv_device_attr attr;
268         int i, ret;
269
270         if (cma_dev->verbs)
271                 return 0;
272
273         cma_dev->verbs = ucma_open_device(cma_dev->guid);
274         if (!cma_dev->verbs)
275                 return ERR(ENODEV);
276
277         ret = ibv_query_device(cma_dev->verbs, &attr);
278         if (ret) {
279                 ret = ERR(ret);
280                 goto err;
281         }
282
283         cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt);
284         if (!cma_dev->port) {
285                 ret = ERR(ENOMEM);
286                 goto err;
287         }
288
289         for (i = 1; i <= attr.phys_port_cnt; i++) {
290                 if (ibv_query_port(cma_dev->verbs, i, &port_attr))
291                         cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED;
292                 else
293                         cma_dev->port[i - 1].link_layer = port_attr.link_layer;
294         }
295
296         cma_dev->port_cnt = attr.phys_port_cnt;
297         cma_dev->max_qpsize = attr.max_qp_wr;
298         cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
299         cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
300         cma_init_cnt++;
301         return 0;
302
303 err:
304         ibv_close_device(cma_dev->verbs);
305         cma_dev->verbs = NULL;
306         return ret;
307 }
308
309 static int ucma_init_all(void)
310 {
311         int i, ret = 0;
312
313         if (!cma_dev_cnt) {
314                 ret = ucma_init();
315                 if (ret)
316                         return ret;
317         }
318
319         if (cma_init_cnt == cma_dev_cnt)
320                 return 0;
321
322         pthread_mutex_lock(&mut);
323         for (i = 0; i < cma_dev_cnt; i++) {
324                 ret = ucma_init_device(&cma_dev_array[i]);
325                 if (ret)
326                         break;
327         }
328         pthread_mutex_unlock(&mut);
329         return ret;
330 }
331
332 struct ibv_context **rdma_get_devices(int *num_devices)
333 {
334         struct ibv_context **devs = NULL;
335         int i;
336
337         if (ucma_init_all())
338                 goto out;
339
340         devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1));
341         if (!devs)
342                 goto out;
343
344         for (i = 0; i < cma_dev_cnt; i++)
345                 devs[i] = cma_dev_array[i].verbs;
346         devs[i] = NULL;
347 out:
348         if (num_devices)
349                 *num_devices = devs ? cma_dev_cnt : 0;
350         return devs;
351 }
352
353 void rdma_free_devices(struct ibv_context **list)
354 {
355         free(list);
356 }
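/*
 * Illustrative use of the two calls above (not part of this file): the
 * returned array is NULL-terminated and must be released with
 * rdma_free_devices().
 *
 *	int i, n;
 *	struct ibv_context **ctx = rdma_get_devices(&n);
 *
 *	if (ctx) {
 *		for (i = 0; i < n; i++)
 *			printf("%s\n", ibv_get_device_name(ctx[i]->device));
 *		rdma_free_devices(ctx);
 *	}
 */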
357
358 struct rdma_event_channel *rdma_create_event_channel(void)
359 {
360         struct rdma_event_channel *channel;
361
362         if (ucma_init())
363                 return NULL;
364
365         channel = malloc(sizeof(*channel));
366         if (!channel)
367                 return NULL;
368
369         channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC);
370         if (channel->fd < 0) {
371                 goto err;
372         }
373         return channel;
374 err:
375         free(channel);
376         return NULL;
377 }
378
379 void rdma_destroy_event_channel(struct rdma_event_channel *channel)
380 {
381         close(channel->fd);
382         free(channel);
383 }
384
385 static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid)
386 {
387         struct cma_device *cma_dev;
388         int i, ret;
389
390         for (i = 0; i < cma_dev_cnt; i++) {
391                 cma_dev = &cma_dev_array[i];
392                 if (cma_dev->guid == guid)
393                         goto match;
394         }
395
396         return ERR(ENODEV);
397 match:
398         pthread_mutex_lock(&mut);
399         if ((ret = ucma_init_device(cma_dev)))
400                 goto out;
401
402         if (!cma_dev->refcnt++) {
403                 cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
404                 if (!cma_dev->pd) {
405                         cma_dev->refcnt--;
406                         ret = ERR(ENOMEM);
407                         goto out;
408                 }
409         }
410         id_priv->cma_dev = cma_dev;
411         id_priv->id.verbs = cma_dev->verbs;
412         id_priv->id.pd = cma_dev->pd;
413 out:
414         pthread_mutex_unlock(&mut);
415         return ret;
416 }
417
418 static void ucma_put_device(struct cma_device *cma_dev)
419 {
420         pthread_mutex_lock(&mut);
421         if (!--cma_dev->refcnt) {
422                 ibv_dealloc_pd(cma_dev->pd);
423                 if (cma_dev->xrcd)
424                         ibv_close_xrcd(cma_dev->xrcd);
425         }
426         pthread_mutex_unlock(&mut);
427 }
428
429 static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev)
430 {
431         struct ibv_xrcd_init_attr attr;
432
433         pthread_mutex_lock(&mut);
434         if (!cma_dev->xrcd) {
435                 memset(&attr, 0, sizeof attr);
436                 attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
437                 attr.fd = -1;
438                 attr.oflags = O_CREAT;
439                 cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr);
440         }
441         pthread_mutex_unlock(&mut);
442         return cma_dev->xrcd;
443 }
444
445 static void ucma_insert_id(struct cma_id_private *id_priv)
446 {
447         fastlock_acquire(&idm_lock);
448         idm_set(&ucma_idm, id_priv->handle, id_priv);
449         fastlock_release(&idm_lock);
450 }
451
452 static void ucma_remove_id(struct cma_id_private *id_priv)
453 {
454         if (id_priv->handle <= IDX_MAX_INDEX)
455                 idm_clear(&ucma_idm, id_priv->handle);
456 }
457
458 static struct cma_id_private *ucma_lookup_id(int handle)
459 {
460         return idm_lookup(&ucma_idm, handle);
461 }
462
463 static void ucma_free_id(struct cma_id_private *id_priv)
464 {
465         ucma_remove_id(id_priv);
466         if (id_priv->cma_dev)
467                 ucma_put_device(id_priv->cma_dev);
468         pthread_cond_destroy(&id_priv->cond);
469         pthread_mutex_destroy(&id_priv->mut);
470         if (id_priv->id.route.path_rec)
471                 free(id_priv->id.route.path_rec);
472
473         if (id_priv->sync)
474                 rdma_destroy_event_channel(id_priv->id.channel);
475         if (id_priv->connect_len)
476                 free(id_priv->connect);
477         free(id_priv);
478 }
479
480 static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
481                                             void *context,
482                                             enum rdma_port_space ps,
483                                             enum ibv_qp_type qp_type)
484 {
485         struct cma_id_private *id_priv;
486
487         id_priv = calloc(1, sizeof(*id_priv));
488         if (!id_priv)
489                 return NULL;
490
491         id_priv->id.context = context;
492         id_priv->id.ps = ps;
493         id_priv->id.qp_type = qp_type;
494         id_priv->handle = 0xFFFFFFFF;
495
496         if (!channel) {
497                 id_priv->id.channel = rdma_create_event_channel();
498                 if (!id_priv->id.channel)
499                         goto err;
500                 id_priv->sync = 1;
501         } else {
502                 id_priv->id.channel = channel;
503         }
504
505         pthread_mutex_init(&id_priv->mut, NULL);
506         if (pthread_cond_init(&id_priv->cond, NULL))
507                 goto err;
508
509         return id_priv;
510
511 err:    ucma_free_id(id_priv);
512         return NULL;
513 }
514
515 static int rdma_create_id2(struct rdma_event_channel *channel,
516                            struct rdma_cm_id **id, void *context,
517                            enum rdma_port_space ps, enum ibv_qp_type qp_type)
518 {
519         struct ucma_abi_create_id_resp resp;
520         struct ucma_abi_create_id cmd;
521         struct cma_id_private *id_priv;
522         int ret;
523
524         ret = ucma_init();
525         if (ret)
526                 return ret;
527
528         id_priv = ucma_alloc_id(channel, context, ps, qp_type);
529         if (!id_priv)
530                 return ERR(ENOMEM);
531
532         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp);
533         cmd.uid = (uintptr_t) id_priv;
534         cmd.ps = ps;
535         cmd.qp_type = qp_type;
536
537         ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
538         if (ret != sizeof cmd)
539                 goto err;
540
541         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
542
543         id_priv->handle = resp.id;
544         ucma_insert_id(id_priv);
545         *id = &id_priv->id;
546         return 0;
547
548 err:    ucma_free_id(id_priv);
549         return ret;
550 }
551
552 int rdma_create_id(struct rdma_event_channel *channel,
553                    struct rdma_cm_id **id, void *context,
554                    enum rdma_port_space ps)
555 {
556         enum ibv_qp_type qp_type;
557
558         qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ?
559                   IBV_QPT_UD : IBV_QPT_RC;
560         return rdma_create_id2(channel, id, context, ps, qp_type);
561 }
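/*
 * Typical use (sketch): RDMA_PS_UDP and RDMA_PS_IPOIB map to a UD QP, all
 * other port spaces to an RC QP.  Passing a NULL channel puts the id into
 * synchronous mode (ucma_alloc_id() creates a private event channel and
 * sets id_priv->sync).
 *
 *	struct rdma_cm_id *id;
 *
 *	if (rdma_create_id(NULL, &id, NULL, RDMA_PS_TCP))
 *		;	/* failed; errno describes the error */
 */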
562
563 static int ucma_destroy_kern_id(int fd, uint32_t handle)
564 {
565         struct ucma_abi_destroy_id_resp resp;
566         struct ucma_abi_destroy_id cmd;
567         int ret;
568         
569         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp);
570         cmd.id = handle;
571
572         ret = write(fd, &cmd, sizeof cmd);
573         if (ret != sizeof cmd)
574                 return (ret >= 0) ? ERR(ENODATA) : -1;
575
576         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
577
578         return resp.events_reported;
579 }
580
581 int rdma_destroy_id(struct rdma_cm_id *id)
582 {
583         struct cma_id_private *id_priv;
584         int ret;
585
586         id_priv = container_of(id, struct cma_id_private, id);
587         ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle);
588         if (ret < 0)
589                 return ret;
590
591         if (id_priv->id.event)
592                 rdma_ack_cm_event(id_priv->id.event);
593
594         pthread_mutex_lock(&id_priv->mut);
595         while (id_priv->events_completed < ret)
596                 pthread_cond_wait(&id_priv->cond, &id_priv->mut);
597         pthread_mutex_unlock(&id_priv->mut);
598
599         ucma_free_id(id_priv);
600         return 0;
601 }
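/*
 * Note on the wait above: ucma_destroy_kern_id() returns the number of
 * events the kernel reported for this id, and rdma_destroy_id() blocks until
 * that many events have been acknowledged (events_completed) before the
 * cma_id_private can be freed safely.
 */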
602
603 int ucma_addrlen(struct sockaddr *addr)
604 {
605         if (!addr)
606                 return 0;
607
608         switch (addr->sa_family) {
609         case PF_INET:
610                 return sizeof(struct sockaddr_in);
611         case PF_INET6:
612                 return sizeof(struct sockaddr_in6);
613         case PF_IB:
614                 return af_ib_support ? sizeof(struct sockaddr_ib) : 0;
615         default:
616                 return 0;
617         }
618 }
619
620 static int ucma_query_addr(struct rdma_cm_id *id)
621 {
622         struct ucma_abi_query_addr_resp resp;
623         struct ucma_abi_query cmd;
624         struct cma_id_private *id_priv;
625         int ret;
626         
627         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
628         id_priv = container_of(id, struct cma_id_private, id);
629         cmd.id = id_priv->handle;
630         cmd.option = UCMA_QUERY_ADDR;
631
632         ret = write(id->channel->fd, &cmd, sizeof cmd);
633         if (ret != sizeof cmd)
634                 return (ret >= 0) ? ERR(ENODATA) : -1;
635
636         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
637
638         memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size);
639         memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size);
640
641         if (!id_priv->cma_dev && resp.node_guid) {
642                 ret = ucma_get_device(id_priv, resp.node_guid);
643                 if (ret)
644                         return ret;
645                 id->port_num = resp.port_num;
646                 id->route.addr.addr.ibaddr.pkey = resp.pkey;
647         }
648
649         return 0;
650 }
651
652 static int ucma_query_gid(struct rdma_cm_id *id)
653 {
654         struct ucma_abi_query_addr_resp resp;
655         struct ucma_abi_query cmd;
656         struct cma_id_private *id_priv;
657         struct sockaddr_ib *sib;
658         int ret;
659         
660         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
661         id_priv = container_of(id, struct cma_id_private, id);
662         cmd.id = id_priv->handle;
663         cmd.option = UCMA_QUERY_GID;
664
665         ret = write(id->channel->fd, &cmd, sizeof cmd);
666         if (ret != sizeof cmd)
667                 return (ret >= 0) ? ERR(ENODATA) : -1;
668
669         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
670
671         sib = (struct sockaddr_ib *) &resp.src_addr;
672         memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw,
673                sizeof id->route.addr.addr.ibaddr.sgid);
674
675         sib = (struct sockaddr_ib *) &resp.dst_addr;
676         memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw,
677                sizeof id->route.addr.addr.ibaddr.dgid);
678
679         return 0;
680 }
681
682 static void ucma_convert_path(struct ibv_path_data *path_data,
683                               struct ibv_sa_path_rec *sa_path)
684 {
685         uint32_t fl_hop;
686
687         sa_path->dgid = path_data->path.dgid;
688         sa_path->sgid = path_data->path.sgid;
689         sa_path->dlid = path_data->path.dlid;
690         sa_path->slid = path_data->path.slid;
691         sa_path->raw_traffic = 0;
692
693         fl_hop = be32toh(path_data->path.flowlabel_hoplimit);
694         sa_path->flow_label = htobe32(fl_hop >> 8);
695         sa_path->hop_limit = (uint8_t) fl_hop;
696
697         sa_path->traffic_class = path_data->path.tclass;
698         sa_path->reversible = path_data->path.reversible_numpath >> 7;
699         sa_path->numb_path = 1;
700         sa_path->pkey = path_data->path.pkey;
701         sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF;
702         sa_path->mtu_selector = 2;      /* exactly */
703         sa_path->mtu = path_data->path.mtu & 0x1F;
704         sa_path->rate_selector = 2;
705         sa_path->rate = path_data->path.rate & 0x1F;
706         sa_path->packet_life_time_selector = 2;
707         sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F;
708
709         sa_path->preference = (uint8_t) path_data->flags;
710 }
711
712 static int ucma_query_path(struct rdma_cm_id *id)
713 {
714         struct ucma_abi_query_path_resp *resp;
715         struct ucma_abi_query cmd;
716         struct cma_id_private *id_priv;
717         int ret, i, size;
718
719         size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6;
720         resp = alloca(size);
721         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size);
722         id_priv = container_of(id, struct cma_id_private, id);
723         cmd.id = id_priv->handle;
724         cmd.option = UCMA_QUERY_PATH;
725
726         ret = write(id->channel->fd, &cmd, sizeof cmd);
727         if (ret != sizeof cmd)
728                 return (ret >= 0) ? ERR(ENODATA) : -1;
729
730         VALGRIND_MAKE_MEM_DEFINED(resp, size);
731
732         if (resp->num_paths) {
733                 id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
734                                             resp->num_paths);
735                 if (!id->route.path_rec)
736                         return ERR(ENOMEM);
737
738                 id->route.num_paths = resp->num_paths;
739                 for (i = 0; i < resp->num_paths; i++)
740                         ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]);
741         }
742
743         return 0;
744 }
745
746 static int ucma_query_route(struct rdma_cm_id *id)
747 {
748         struct ucma_abi_query_route_resp resp;
749         struct ucma_abi_query cmd;
750         struct cma_id_private *id_priv;
751         int ret, i;
752
753         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp);
754         id_priv = container_of(id, struct cma_id_private, id);
755         cmd.id = id_priv->handle;
756
757         ret = write(id->channel->fd, &cmd, sizeof cmd);
758         if (ret != sizeof cmd)
759                 return (ret >= 0) ? ERR(ENODATA) : -1;
760
761         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
762
763         if (resp.num_paths) {
764                 id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
765                                             resp.num_paths);
766                 if (!id->route.path_rec)
767                         return ERR(ENOMEM);
768
769                 id->route.num_paths = resp.num_paths;
770                 for (i = 0; i < resp.num_paths; i++)
771                         ibv_copy_path_rec_from_kern(&id->route.path_rec[i],
772                                                     &resp.ib_route[i]);
773         }
774
775         memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid,
776                sizeof id->route.addr.addr.ibaddr.sgid);
777         memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid,
778                sizeof id->route.addr.addr.ibaddr.dgid);
779         id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey;
780         memcpy(&id->route.addr.src_addr, &resp.src_addr,
781                sizeof resp.src_addr);
782         memcpy(&id->route.addr.dst_addr, &resp.dst_addr,
783                sizeof resp.dst_addr);
784
785         if (!id_priv->cma_dev && resp.node_guid) {
786                 ret = ucma_get_device(id_priv, resp.node_guid);
787                 if (ret)
788                         return ret;
789                 id_priv->id.port_num = resp.port_num;
790         }
791
792         return 0;
793 }
794
795 static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr,
796                            socklen_t addrlen)
797 {
798         struct ucma_abi_bind cmd;
799         struct cma_id_private *id_priv;
800         int ret;
801         
802         CMA_INIT_CMD(&cmd, sizeof cmd, BIND);
803         id_priv = container_of(id, struct cma_id_private, id);
804         cmd.id = id_priv->handle;
805         cmd.addr_size = addrlen;
806         memcpy(&cmd.addr, addr, addrlen);
807
808         ret = write(id->channel->fd, &cmd, sizeof cmd);
809         if (ret != sizeof cmd)
810                 return (ret >= 0) ? ERR(ENODATA) : -1;
811
812         ret = ucma_query_addr(id);
813         if (!ret)
814                 ret = ucma_query_gid(id);
815         return ret;
816 }
817
818 int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
819 {
820         struct ucma_abi_bind_ip cmd;
821         struct cma_id_private *id_priv;
822         int ret, addrlen;
823         
824         addrlen = ucma_addrlen(addr);
825         if (!addrlen)
826                 return ERR(EINVAL);
827
828         if (af_ib_support)
829                 return rdma_bind_addr2(id, addr, addrlen);
830
831         CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP);
832         id_priv = container_of(id, struct cma_id_private, id);
833         cmd.id = id_priv->handle;
834         memcpy(&cmd.addr, addr, addrlen);
835
836         ret = write(id->channel->fd, &cmd, sizeof cmd);
837         if (ret != sizeof cmd)
838                 return (ret >= 0) ? ERR(ENODATA) : -1;
839
840         return ucma_query_route(id);
841 }
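/*
 * Example of binding a local IPv4 address before listening or resolving
 * (illustrative only; the port is arbitrary):
 *
 *	struct sockaddr_in sin;
 *
 *	memset(&sin, 0, sizeof sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(7471);
 *	if (rdma_bind_addr(id, (struct sockaddr *) &sin))
 *		;	/* failed; errno describes the error */
 */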
842
843 int ucma_complete(struct rdma_cm_id *id)
844 {
845         struct cma_id_private *id_priv;
846         int ret;
847
848         id_priv = container_of(id, struct cma_id_private, id);
849         if (!id_priv->sync)
850                 return 0;
851
852         if (id_priv->id.event) {
853                 rdma_ack_cm_event(id_priv->id.event);
854                 id_priv->id.event = NULL;
855         }
856
857         ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event);
858         if (ret)
859                 return ret;
860
861         if (id_priv->id.event->status) {
862                 if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED)
863                         ret = ERR(ECONNREFUSED);
864                 else if (id_priv->id.event->status < 0)
865                         ret = ERR(-id_priv->id.event->status);
866                 else
867                         ret = ERR(id_priv->id.event->status);
868         }
869         return ret;
870 }
871
872 static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr,
873                               socklen_t src_len, struct sockaddr *dst_addr,
874                               socklen_t dst_len, int timeout_ms)
875 {
876         struct ucma_abi_resolve_addr cmd;
877         struct cma_id_private *id_priv;
878         int ret;
879         
880         CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR);
881         id_priv = container_of(id, struct cma_id_private, id);
882         cmd.id = id_priv->handle;
883         if ((cmd.src_size = src_len))
884                 memcpy(&cmd.src_addr, src_addr, src_len);
885         memcpy(&cmd.dst_addr, dst_addr, dst_len);
886         cmd.dst_size = dst_len;
887         cmd.timeout_ms = timeout_ms;
888
889         ret = write(id->channel->fd, &cmd, sizeof cmd);
890         if (ret != sizeof cmd)
891                 return (ret >= 0) ? ERR(ENODATA) : -1;
892
893         memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
894         return ucma_complete(id);
895 }
896
897 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
898                       struct sockaddr *dst_addr, int timeout_ms)
899 {
900         struct ucma_abi_resolve_ip cmd;
901         struct cma_id_private *id_priv;
902         int ret, dst_len, src_len;
903         
904         dst_len = ucma_addrlen(dst_addr);
905         if (!dst_len)
906                 return ERR(EINVAL);
907
908         src_len = ucma_addrlen(src_addr);
909         if (src_addr && !src_len)
910                 return ERR(EINVAL);
911
912         if (af_ib_support)
913                 return rdma_resolve_addr2(id, src_addr, src_len, dst_addr,
914                                           dst_len, timeout_ms);
915
916         CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP);
917         id_priv = container_of(id, struct cma_id_private, id);
918         cmd.id = id_priv->handle;
919         if (src_addr)
920                 memcpy(&cmd.src_addr, src_addr, src_len);
921         memcpy(&cmd.dst_addr, dst_addr, dst_len);
922         cmd.timeout_ms = timeout_ms;
923
924         ret = write(id->channel->fd, &cmd, sizeof cmd);
925         if (ret != sizeof cmd)
926                 return (ret >= 0) ? ERR(ENODATA) : -1;
927
928         memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
929         return ucma_complete(id);
930 }
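/*
 * On the active side these calls are normally used in sequence (rough
 * sketch, error handling omitted): resolve the destination address, resolve
 * a route to it, create a QP, then connect.
 *
 *	rdma_resolve_addr(id, NULL, dst_addr, 2000);
 *	rdma_resolve_route(id, 2000);
 *	rdma_create_qp(id, NULL, &init_attr);
 *	rdma_connect(id, &conn_param);
 *
 * With a caller-supplied event channel each step completes through
 * rdma_get_cm_event(); in synchronous mode ucma_complete() waits for the
 * event internally.
 */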
931
932 static int ucma_set_ib_route(struct rdma_cm_id *id)
933 {
934         struct rdma_addrinfo hint, *rai;
935         int ret;
936
937         memset(&hint, 0, sizeof hint);
938         hint.ai_flags = RAI_ROUTEONLY;
939         hint.ai_family = id->route.addr.src_addr.sa_family;
940         hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr);
941         hint.ai_src_addr = &id->route.addr.src_addr;
942         hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr);
943         hint.ai_dst_addr = &id->route.addr.dst_addr;
944
945         ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai);
946         if (ret)
947                 return ret;
948
949         if (rai->ai_route_len)
950                 ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
951                                       rai->ai_route, rai->ai_route_len);
952         else
953                 ret = -1;
954
955         rdma_freeaddrinfo(rai);
956         return ret;
957 }
958
959 int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
960 {
961         struct ucma_abi_resolve_route cmd;
962         struct cma_id_private *id_priv;
963         int ret;
964
965         id_priv = container_of(id, struct cma_id_private, id);
966         if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) {
967                 ret = ucma_set_ib_route(id);
968                 if (!ret)
969                         goto out;
970         }
971
972         CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE);
973         cmd.id = id_priv->handle;
974         cmd.timeout_ms = timeout_ms;
975
976         ret = write(id->channel->fd, &cmd, sizeof cmd);
977         if (ret != sizeof cmd)
978                 return (ret >= 0) ? ERR(ENODATA) : -1;
979
980 out:
981         return ucma_complete(id);
982 }
983
984 static int ucma_is_ud_qp(enum ibv_qp_type qp_type)
985 {
986         return (qp_type == IBV_QPT_UD);
987 }
988
989 static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr,
990                              int *qp_attr_mask)
991 {
992         struct ucma_abi_init_qp_attr cmd;
993         struct ibv_kern_qp_attr resp;
994         struct cma_id_private *id_priv;
995         int ret;
996         
997         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp);
998         id_priv = container_of(id, struct cma_id_private, id);
999         cmd.id = id_priv->handle;
1000         cmd.qp_state = qp_attr->qp_state;
1001
1002         ret = write(id->channel->fd, &cmd, sizeof cmd);
1003         if (ret != sizeof cmd)
1004                 return (ret >= 0) ? ERR(ENODATA) : -1;
1005
1006         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1007
1008         ibv_copy_qp_attr_from_kern(qp_attr, &resp);
1009         *qp_attr_mask = resp.qp_attr_mask;
1010         return 0;
1011 }
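/*
 * rdma_init_qp_attr() asks the kernel for the QP attributes matching the
 * id's current CM state (the qp_state requested by the caller) plus the
 * attribute mask to use; the ucma_modify_qp_*() helpers below pass the
 * result straight to ibv_modify_qp().
 */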
1012
1013 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res)
1014 {
1015         struct cma_id_private *id_priv;
1016         struct ibv_qp_attr qp_attr;
1017         int qp_attr_mask, ret;
1018         uint8_t link_layer;
1019
1020         if (!id->qp)
1021                 return ERR(EINVAL);
1022
1023         /* Need to update QP attributes from default values. */
1024         qp_attr.qp_state = IBV_QPS_INIT;
1025         ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1026         if (ret)
1027                 return ret;
1028
1029         ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
1030         if (ret)
1031                 return ERR(ret);
1032
1033         qp_attr.qp_state = IBV_QPS_RTR;
1034         ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1035         if (ret)
1036                 return ret;
1037
1038         /*
1039          * Workaround for rdma_ucm kernel bug:
1040          * mask off qp_attr_mask bits 21-24 which are used for RoCE
1041          */
1042         id_priv = container_of(id, struct cma_id_private, id);
1043         link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer;
1044
1045         if (link_layer == IBV_LINK_LAYER_INFINIBAND)
1046                 qp_attr_mask &= UINT_MAX ^ 0xe00000;
1047
1048         if (resp_res != RDMA_MAX_RESP_RES)
1049                 qp_attr.max_dest_rd_atomic = resp_res;
1050         return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1051 }
1052
1053 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth)
1054 {
1055         struct ibv_qp_attr qp_attr;
1056         int qp_attr_mask, ret;
1057
1058         qp_attr.qp_state = IBV_QPS_RTS;
1059         ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1060         if (ret)
1061                 return ret;
1062
1063         if (init_depth != RDMA_MAX_INIT_DEPTH)
1064                 qp_attr.max_rd_atomic = init_depth;
1065         return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1066 }
1067
1068 static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
1069 {
1070         struct ibv_qp_attr qp_attr;
1071
1072         if (!id->qp)
1073                 return 0;
1074
1075         qp_attr.qp_state = IBV_QPS_SQD;
1076         return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1077 }
1078
1079 static int ucma_modify_qp_err(struct rdma_cm_id *id)
1080 {
1081         struct ibv_qp_attr qp_attr;
1082
1083         if (!id->qp)
1084                 return 0;
1085
1086         qp_attr.qp_state = IBV_QPS_ERR;
1087         return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1088 }
1089
1090 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
1091                           __be16 pkey, uint16_t *pkey_index)
1092 {
1093         int ret, i;
1094         __be16 chk_pkey;
1095
1096         for (i = 0, ret = 0; !ret; i++) {
1097                 ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey);
1098                 if (!ret && pkey == chk_pkey) {
1099                         *pkey_index = (uint16_t) i;
1100                         return 0;
1101                 }
1102         }
1103         return ERR(EINVAL);
1104 }
1105
1106 static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1107 {
1108         struct ibv_qp_attr qp_attr;
1109         int ret;
1110
1111         ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1112                              id_priv->id.route.addr.addr.ibaddr.pkey,
1113                              &qp_attr.pkey_index);
1114         if (ret)
1115                 return ret;
1116
1117         qp_attr.port_num = id_priv->id.port_num;
1118         qp_attr.qp_state = IBV_QPS_INIT;
1119         qp_attr.qp_access_flags = 0;
1120
1121         ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
1122                                           IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1123         return rdma_seterrno(ret);
1124 }
1125
1126 static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1127 {
1128         struct ibv_qp_attr qp_attr;
1129         int qp_attr_mask, ret;
1130
1131         if (abi_ver == 3)
1132                 return ucma_init_conn_qp3(id_priv, qp);
1133
1134         qp_attr.qp_state = IBV_QPS_INIT;
1135         ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1136         if (ret)
1137                 return ret;
1138
1139         return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask));
1140 }
1141
1142 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1143 {
1144         struct ibv_qp_attr qp_attr;
1145         int ret;
1146
1147         ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1148                              id_priv->id.route.addr.addr.ibaddr.pkey,
1149                              &qp_attr.pkey_index);
1150         if (ret)
1151                 return ret;
1152
1153         qp_attr.port_num = id_priv->id.port_num;
1154         qp_attr.qp_state = IBV_QPS_INIT;
1155         qp_attr.qkey = RDMA_UDP_QKEY;
1156
1157         ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY |
1158                                           IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1159         if (ret)
1160                 return ERR(ret);
1161
1162         qp_attr.qp_state = IBV_QPS_RTR;
1163         ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1164         if (ret)
1165                 return ERR(ret);
1166
1167         qp_attr.qp_state = IBV_QPS_RTS;
1168         qp_attr.sq_psn = 0;
1169         ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1170         return rdma_seterrno(ret);
1171 }
1172
1173 static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1174 {
1175         struct ibv_qp_attr qp_attr;
1176         int qp_attr_mask, ret;
1177
1178         if (abi_ver == 3)
1179                 return ucma_init_ud_qp3(id_priv, qp);
1180
1181         qp_attr.qp_state = IBV_QPS_INIT;
1182         ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1183         if (ret)
1184                 return ret;
1185
1186         ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask);
1187         if (ret)
1188                 return ERR(ret);
1189
1190         qp_attr.qp_state = IBV_QPS_RTR;
1191         ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1192         if (ret)
1193                 return ERR(ret);
1194
1195         qp_attr.qp_state = IBV_QPS_RTS;
1196         qp_attr.sq_psn = 0;
1197         ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1198         return rdma_seterrno(ret);
1199 }
1200
1201 static void ucma_destroy_cqs(struct rdma_cm_id *id)
1202 {
1203         if (id->qp_type == IBV_QPT_XRC_RECV && id->srq)
1204                 return;
1205
1206         if (id->recv_cq) {
1207                 ibv_destroy_cq(id->recv_cq);
1208                 if (id->send_cq && (id->send_cq != id->recv_cq)) {
1209                         ibv_destroy_cq(id->send_cq);
1210                         id->send_cq = NULL;
1211                 }
1212                 id->recv_cq = NULL;
1213         }
1214
1215         if (id->recv_cq_channel) {
1216                 ibv_destroy_comp_channel(id->recv_cq_channel);
1217                 if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) {
1218                         ibv_destroy_comp_channel(id->send_cq_channel);
1219                         id->send_cq_channel = NULL;
1220                 }
1221                 id->recv_cq_channel = NULL;
1222         }
1223 }
1224
1225 static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size)
1226 {
1227         if (recv_size) {
1228                 id->recv_cq_channel = ibv_create_comp_channel(id->verbs);
1229                 if (!id->recv_cq_channel)
1230                         goto err;
1231
1232                 id->recv_cq = ibv_create_cq(id->verbs, recv_size,
1233                                             id, id->recv_cq_channel, 0);
1234                 if (!id->recv_cq)
1235                         goto err;
1236         }
1237
1238         if (send_size) {
1239                 id->send_cq_channel = ibv_create_comp_channel(id->verbs);
1240                 if (!id->send_cq_channel)
1241                         goto err;
1242
1243                 id->send_cq = ibv_create_cq(id->verbs, send_size,
1244                                             id, id->send_cq_channel, 0);
1245                 if (!id->send_cq)
1246                         goto err;
1247         }
1248
1249         return 0;
1250 err:
1251         ucma_destroy_cqs(id);
1252         return ERR(ENOMEM);
1253 }
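/*
 * CQs and completion channels created here are owned by the id and released
 * through ucma_destroy_cqs(); callers that supply their own CQs request a
 * size of 0 and keep ownership of them.
 */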
1254
1255 int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr)
1256 {
1257         struct cma_id_private *id_priv;
1258         struct ibv_srq *srq;
1259         int ret;
1260
1261         id_priv = container_of(id, struct cma_id_private, id);
1262         if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE))
1263                 return ERR(EINVAL);
1264
1265         if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) {
1266                 attr->pd = id->pd;
1267                 attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD;
1268         }
1269
1270         if (attr->srq_type == IBV_SRQT_XRC) {
1271                 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) {
1272                         attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1273                         if (!attr->xrcd)
1274                                 return -1;
1275                 }
1276                 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) {
1277                         ret = ucma_create_cqs(id, 0, attr->attr.max_wr);
1278                         if (ret)
1279                                 return ret;
1280                         attr->cq = id->recv_cq;
1281                 }
1282                 attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ;
1283         }
1284
1285         srq = ibv_create_srq_ex(id->verbs, attr);
1286         if (!srq) {
1287                 ret = -1;
1288                 goto err;
1289         }
1290
1291         if (!id->pd)
1292                 id->pd = attr->pd;
1293         id->srq = srq;
1294         return 0;
1295 err:
1296         ucma_destroy_cqs(id);
1297         return ret;
1298 }
1299
1300 int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd,
1301                     struct ibv_srq_init_attr *attr)
1302 {
1303         struct ibv_srq_init_attr_ex attr_ex;
1304         int ret;
1305
1306         memcpy(&attr_ex, attr, sizeof(*attr));
1307         attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD;
1308         if (id->qp_type == IBV_QPT_XRC_RECV) {
1309                 attr_ex.srq_type = IBV_SRQT_XRC;
1310         } else {
1311                 attr_ex.srq_type = IBV_SRQT_BASIC;
1312         }
1313         attr_ex.pd = pd;
1314         ret = rdma_create_srq_ex(id, &attr_ex);
1315         memcpy(attr, &attr_ex, sizeof(*attr));
1316         return ret;
1317 }
1318
1319 void rdma_destroy_srq(struct rdma_cm_id *id)
1320 {
1321         ibv_destroy_srq(id->srq);
1322         id->srq = NULL;
1323         ucma_destroy_cqs(id);
1324 }
1325
1326 int rdma_create_qp_ex(struct rdma_cm_id *id,
1327                       struct ibv_qp_init_attr_ex *attr)
1328 {
1329         struct cma_id_private *id_priv;
1330         struct ibv_qp *qp;
1331         int ret;
1332
1333         if (id->qp)
1334                 return ERR(EINVAL);
1335
1336         id_priv = container_of(id, struct cma_id_private, id);
1337         if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) {
1338                 attr->comp_mask |= IBV_QP_INIT_ATTR_PD;
1339                 attr->pd = id->pd;
1340         } else if (id->verbs != attr->pd->context)
1341                 return ERR(EINVAL);
1342
1343         if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) ||
1344             (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq))
1345                 return ERR(EINVAL);
1346
1347         if (id->qp_type == IBV_QPT_XRC_RECV) {
1348                 if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) {
1349                         attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1350                         if (!attr->xrcd)
1351                                 return -1;
1352                         attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD;
1353                 }
1354         }
1355
1356         ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr,
1357                                   attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr);
1358         if (ret)
1359                 return ret;
1360
1361         if (!attr->send_cq)
1362                 attr->send_cq = id->send_cq;
1363         if (!attr->recv_cq)
1364                 attr->recv_cq = id->recv_cq;
1365         if (id->srq && !attr->srq)
1366                 attr->srq = id->srq;
1367         qp = ibv_create_qp_ex(id->verbs, attr);
1368         if (!qp) {
1369                 ret = ERR(ENOMEM);
1370                 goto err1;
1371         }
1372
1373         if (ucma_is_ud_qp(id->qp_type))
1374                 ret = ucma_init_ud_qp(id_priv, qp);
1375         else
1376                 ret = ucma_init_conn_qp(id_priv, qp);
1377         if (ret)
1378                 goto err2;
1379
1380         id->pd = qp->pd;
1381         id->qp = qp;
1382         return 0;
1383 err2:
1384         ibv_destroy_qp(qp);
1385 err1:
1386         ucma_destroy_cqs(id);
1387         return ret;
1388 }
1389
1390 int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd,
1391                    struct ibv_qp_init_attr *qp_init_attr)
1392 {
1393         struct ibv_qp_init_attr_ex attr_ex;
1394         int ret;
1395
1396         memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr));
1397         attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
1398         attr_ex.pd = pd ? pd : id->pd;
1399         ret = rdma_create_qp_ex(id, &attr_ex);
1400         memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr));
1401         return ret;
1402 }
1403
1404 void rdma_destroy_qp(struct rdma_cm_id *id)
1405 {
1406         ibv_destroy_qp(id->qp);
1407         id->qp = NULL;
1408         ucma_destroy_cqs(id);
1409 }
1410
1411 static int ucma_valid_param(struct cma_id_private *id_priv,
1412                             struct rdma_conn_param *param)
1413 {
1414         if (id_priv->id.ps != RDMA_PS_TCP)
1415                 return 0;
1416
1417         if (!id_priv->id.qp && !param)
1418                 goto err;
1419
1420         if (!param)
1421                 return 0;
1422
1423         if ((param->responder_resources != RDMA_MAX_RESP_RES) &&
1424             (param->responder_resources > id_priv->cma_dev->max_responder_resources))
1425                 goto err;
1426
1427         if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) &&
1428             (param->initiator_depth > id_priv->cma_dev->max_initiator_depth))
1429                 goto err;
1430
1431         return 0;
1432 err:
1433         return ERR(EINVAL);
1434 }
1435
1436 static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv,
1437                                          struct ucma_abi_conn_param *dst,
1438                                          struct rdma_conn_param *src,
1439                                          uint32_t qp_num, uint8_t srq)
1440 {
1441         dst->qp_num = qp_num;
1442         dst->srq = srq;
1443         dst->responder_resources = id_priv->responder_resources;
1444         dst->initiator_depth = id_priv->initiator_depth;
1445         dst->valid = 1;
1446
1447         if (id_priv->connect_len) {
1448                 memcpy(dst->private_data, id_priv->connect, id_priv->connect_len);
1449                 dst->private_data_len = id_priv->connect_len;
1450         }
1451
1452         if (src) {
1453                 dst->flow_control = src->flow_control;
1454                 dst->retry_count = src->retry_count;
1455                 dst->rnr_retry_count = src->rnr_retry_count;
1456
1457                 if (src->private_data && src->private_data_len) {
1458                         memcpy(dst->private_data + dst->private_data_len,
1459                                src->private_data, src->private_data_len);
1460                         dst->private_data_len += src->private_data_len;
1461                 }
1462         } else {
1463                 dst->retry_count = 7;
1464                 dst->rnr_retry_count = 7;
1465         }
1466 }
1467
1468 int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1469 {
1470         struct ucma_abi_connect cmd;
1471         struct cma_id_private *id_priv;
1472         int ret;
1473         
1474         id_priv = container_of(id, struct cma_id_private, id);
1475         ret = ucma_valid_param(id_priv, conn_param);
1476         if (ret)
1477                 return ret;
1478
1479         if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH)
1480                 id_priv->initiator_depth = conn_param->initiator_depth;
1481         else
1482                 id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth;
1483         if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES)
1484                 id_priv->responder_resources = conn_param->responder_resources;
1485         else
1486                 id_priv->responder_resources = id_priv->cma_dev->max_responder_resources;
1487
1488         CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT);
1489         cmd.id = id_priv->handle;
1490         if (id->qp) {
1491                 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1492                                              conn_param, id->qp->qp_num,
1493                                              (id->qp->srq != NULL));
1494         } else if (conn_param) {
1495                 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1496                                              conn_param, conn_param->qp_num,
1497                                              conn_param->srq);
1498         } else {
1499                 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1500                                              conn_param, 0, 0);
1501         }
1502
1503         ret = write(id->channel->fd, &cmd, sizeof cmd);
1504         if (ret != sizeof cmd)
1505                 return (ret >= 0) ? ERR(ENODATA) : -1;
1506
1507         if (id_priv->connect_len) {
1508                 free(id_priv->connect);
1509                 id_priv->connect_len = 0;
1510         }
1511
1512         return ucma_complete(id);
1513 }
1514
1515 int rdma_listen(struct rdma_cm_id *id, int backlog)
1516 {
1517         struct ucma_abi_listen cmd;
1518         struct cma_id_private *id_priv;
1519         int ret;
1520         
1521         CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN);
1522         id_priv = container_of(id, struct cma_id_private, id);
1523         cmd.id = id_priv->handle;
1524         cmd.backlog = backlog;
1525
1526         ret = write(id->channel->fd, &cmd, sizeof cmd);
1527         if (ret != sizeof cmd)
1528                 return (ret >= 0) ? ERR(ENODATA) : -1;
1529
1530         if (af_ib_support)
1531                 return ucma_query_addr(id);
1532         else
1533                 return ucma_query_route(id);
1534 }
1535
1536 int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id)
1537 {
1538         struct cma_id_private *id_priv;
1539         struct rdma_cm_event *event;
1540         int ret;
1541
1542         id_priv = container_of(listen, struct cma_id_private, id);
1543         if (!id_priv->sync)
1544                 return ERR(EINVAL);
1545
1546         if (listen->event) {
1547                 rdma_ack_cm_event(listen->event);
1548                 listen->event = NULL;
1549         }
1550
1551         ret = rdma_get_cm_event(listen->channel, &event);
1552         if (ret)
1553                 return ret;
1554
1555         if (event->status) {
1556                 ret = ERR(event->status);
1557                 goto err;
1558         }
1559         
1560         if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
1561                 ret = ERR(EINVAL);
1562                 goto err;
1563         }
1564
1565         if (id_priv->qp_init_attr) {
1566                 struct ibv_qp_init_attr attr;
1567
1568                 attr = *id_priv->qp_init_attr;
1569                 ret = rdma_create_qp(event->id, listen->pd, &attr);
1570                 if (ret)
1571                         goto err;
1572         }
1573
1574         *id = event->id;
1575         (*id)->event = event;
1576         return 0;
1577
1578 err:
1579         listen->event = event;
1580         return ret;
1581 }
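/*
 * Passive side sketch (illustrative, error handling omitted), using a
 * synchronous id created with a NULL channel: bind and listen, pull the
 * next connection request with rdma_get_request(), give the new id a QP,
 * then accept it with rdma_accept() below.
 *
 *	rdma_bind_addr(listen_id, (struct sockaddr *) &sin);
 *	rdma_listen(listen_id, 10);
 *	rdma_get_request(listen_id, &conn_id);
 *	rdma_create_qp(conn_id, NULL, &init_attr);
 *	rdma_accept(conn_id, NULL);
 */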
1582
1583 int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1584 {
1585         struct ucma_abi_accept cmd;
1586         struct cma_id_private *id_priv;
1587         int ret;
1588
1589         id_priv = container_of(id, struct cma_id_private, id);
1590         ret = ucma_valid_param(id_priv, conn_param);
1591         if (ret)
1592                 return ret;
1593
1594         if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) {
1595                 id_priv->initiator_depth = min(id_priv->initiator_depth,
1596                                                id_priv->cma_dev->max_initiator_depth);
1597         } else {
1598                 id_priv->initiator_depth = conn_param->initiator_depth;
1599         }
1600         if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) {
1601                 id_priv->responder_resources = min(id_priv->responder_resources,
1602                                                    id_priv->cma_dev->max_responder_resources);
1603         } else {
1604                 id_priv->responder_resources = conn_param->responder_resources;
1605         }
1606
1607         if (!ucma_is_ud_qp(id->qp_type)) {
1608                 ret = ucma_modify_qp_rtr(id, id_priv->responder_resources);
1609                 if (ret)
1610                         return ret;
1611
1612                 ret = ucma_modify_qp_rts(id, id_priv->initiator_depth);
1613                 if (ret)
1614                         return ret;
1615         }
1616
1617         CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1618         cmd.id = id_priv->handle;
1619         cmd.uid = (uintptr_t) id_priv;
1620         if (id->qp)
1621                 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1622                                              conn_param, id->qp->qp_num,
1623                                              (id->qp->srq != NULL));
1624         else
1625                 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1626                                              conn_param, conn_param->qp_num,
1627                                              conn_param->srq);
1628
1629         ret = write(id->channel->fd, &cmd, sizeof cmd);
1630         if (ret != sizeof cmd) {
1631                 ucma_modify_qp_err(id);
1632                 return (ret >= 0) ? ERR(ENODATA) : -1;
1633         }
1634
1635         if (ucma_is_ud_qp(id->qp_type))
1636                 return 0;
1637
1638         return ucma_complete(id);
1639 }
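
/*
 * Usage sketch (illustrative, placeholder names 'ec' and 'pd', abbreviated
 * error handling): accepting or refusing a connection request delivered on
 * an asynchronous event channel.
 *
 *	struct rdma_cm_event *ev;
 *	struct rdma_conn_param cp = { 0 };
 *	struct ibv_qp_init_attr qpa = { 0 };
 *
 *	rdma_get_cm_event(ec, &ev);
 *	if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
 *		struct rdma_cm_id *conn_id = ev->id;
 *
 *		qpa.qp_type = IBV_QPT_RC;
 *		qpa.cap.max_send_wr = qpa.cap.max_recv_wr = 16;
 *		qpa.cap.max_send_sge = qpa.cap.max_recv_sge = 1;
 *		cp.responder_resources = 1;
 *		cp.initiator_depth = 1;
 *		if (rdma_create_qp(conn_id, pd, &qpa) ||
 *		    rdma_accept(conn_id, &cp))
 *			rdma_reject(conn_id, NULL, 0);	// refuse on failure
 *	}
 *	rdma_ack_cm_event(ev);
 */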
1640
1641 int rdma_reject(struct rdma_cm_id *id, const void *private_data,
1642                 uint8_t private_data_len)
1643 {
1644         struct ucma_abi_reject cmd;
1645         struct cma_id_private *id_priv;
1646         int ret;
1647
1648         CMA_INIT_CMD(&cmd, sizeof cmd, REJECT);
1649
1650         id_priv = container_of(id, struct cma_id_private, id);
1651         cmd.id = id_priv->handle;
1652         if (private_data && private_data_len) {
1653                 memcpy(cmd.private_data, private_data, private_data_len);
1654                 cmd.private_data_len = private_data_len;
1655         }
1656
1657         ret = write(id->channel->fd, &cmd, sizeof cmd);
1658         if (ret != sizeof cmd)
1659                 return (ret >= 0) ? ERR(ENODATA) : -1;
1660
1661         return 0;
1662 }
1663
1664 int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event)
1665 {
1666         struct ucma_abi_notify cmd;
1667         struct cma_id_private *id_priv;
1668         int ret;
1669
1670         CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY);
1671
1672         id_priv = container_of(id, struct cma_id_private, id);
1673         cmd.id = id_priv->handle;
1674         cmd.event = event;
1675         ret = write(id->channel->fd, &cmd, sizeof cmd);
1676         if (ret != sizeof cmd)
1677                 return (ret >= 0) ? ERR(ENODATA) : -1;
1678
1679         return 0;
1680 }
1681
1682 int ucma_shutdown(struct rdma_cm_id *id)
1683 {
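        /*
         * Transition the QP as part of the disconnect: IB transports move it
         * to the error state, while iWARP drains the send queue instead.
         */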
1684         switch (id->verbs->device->transport_type) {
1685         case IBV_TRANSPORT_IB:
1686                 return ucma_modify_qp_err(id);
1687         case IBV_TRANSPORT_IWARP:
1688                 return ucma_modify_qp_sqd(id);
1689         default:
1690                 return ERR(EINVAL);
1691         }
1692 }
1693
1694 int rdma_disconnect(struct rdma_cm_id *id)
1695 {
1696         struct ucma_abi_disconnect cmd;
1697         struct cma_id_private *id_priv;
1698         int ret;
1699
1700         ret = ucma_shutdown(id);
1701         if (ret)
1702                 return ret;
1703
1704         CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT);
1705         id_priv = container_of(id, struct cma_id_private, id);
1706         cmd.id = id_priv->handle;
1707
1708         ret = write(id->channel->fd, &cmd, sizeof cmd);
1709         if (ret != sizeof cmd)
1710                 return (ret >= 0) ? ERR(ENODATA) : -1;
1711
1712         return ucma_complete(id);
1713 }
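
/*
 * Usage sketch (illustrative, placeholder name 'conn_id', no error handling):
 * orderly teardown of a connected id.  The remote peer sees
 * RDMA_CM_EVENT_DISCONNECTED; on an asynchronous channel the local side
 * should also retrieve and acknowledge its own disconnect event before
 * destroying the id.
 *
 *	rdma_disconnect(conn_id);
 *	// ...drain and ack any remaining events for conn_id...
 *	rdma_destroy_qp(conn_id);
 *	rdma_destroy_id(conn_id);
 */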
1714
1715 static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr,
1716                                 socklen_t addrlen, void *context)
1717 {
1718         struct ucma_abi_create_id_resp resp;
1719         struct cma_id_private *id_priv;
1720         struct cma_multicast *mc, **pos;
1721         int ret;
1722
1723         id_priv = container_of(id, struct cma_id_private, id);
1724         mc = calloc(1, sizeof(*mc));
1725         if (!mc)
1726                 return ERR(ENOMEM);
1727
1728         mc->context = context;
1729         mc->id_priv = id_priv;
1730         memcpy(&mc->addr, addr, addrlen);
1731         if (pthread_cond_init(&mc->cond, NULL)) {
1732                 ret = -1;
1733                 goto err1;
1734         }
1735
1736         pthread_mutex_lock(&id_priv->mut);
1737         mc->next = id_priv->mc_list;
1738         id_priv->mc_list = mc;
1739         pthread_mutex_unlock(&id_priv->mut);
1740
1741         if (af_ib_support) {
1742                 struct ucma_abi_join_mcast cmd;
1743
1744                 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp);
1745                 cmd.id = id_priv->handle;
1746                 memcpy(&cmd.addr, addr, addrlen);
1747                 cmd.addr_size = addrlen;
1748                 cmd.uid = (uintptr_t) mc;
1749                 cmd.reserved = 0;
1750
1751                 ret = write(id->channel->fd, &cmd, sizeof cmd);
1752                 if (ret != sizeof cmd) {
1753                         ret = (ret >= 0) ? ERR(ENODATA) : -1;
1754                         goto err2;
1755                 }
1756         } else {
1757                 struct ucma_abi_join_ip_mcast cmd;
1758
1759                 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp);
1760                 cmd.id = id_priv->handle;
1761                 memcpy(&cmd.addr, addr, addrlen);
1762                 cmd.uid = (uintptr_t) mc;
1763
1764                 ret = write(id->channel->fd, &cmd, sizeof cmd);
1765                 if (ret != sizeof cmd) {
1766                         ret = (ret >= 0) ? ERR(ENODATA) : -1;
1767                         goto err2;
1768                 }
1769         }
1770
1771         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1772
1773         mc->handle = resp.id;
1774         return ucma_complete(id);
1775
1776 err2:
1777         pthread_mutex_lock(&id_priv->mut);
1778         for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next)
1779                 ;
1780         *pos = mc->next;
1781         pthread_mutex_unlock(&id_priv->mut);
1782 err1:
1783         free(mc);
1784         return ret;
1785 }
1786
1787 int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
1788                         void *context)
1789 {
1790         int addrlen;
1791
1792         addrlen = ucma_addrlen(addr);
1793         if (!addrlen)
1794                 return ERR(EINVAL);
1795
1796         return rdma_join_multicast2(id, addr, addrlen, context);
1797 }
1798
1799 int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
1800 {
1801         struct ucma_abi_destroy_id cmd;
1802         struct ucma_abi_destroy_id_resp resp;
1803         struct cma_id_private *id_priv;
1804         struct cma_multicast *mc, **pos;
1805         int ret, addrlen;
1806
1807         addrlen = ucma_addrlen(addr);
1808         if (!addrlen)
1809                 return ERR(EINVAL);
1810
1811         id_priv = container_of(id, struct cma_id_private, id);
1812         pthread_mutex_lock(&id_priv->mut);
1813         for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next)
1814                 if (!memcmp(&(*pos)->addr, addr, addrlen))
1815                         break;
1816
1817         mc = *pos;
1818         if (*pos)
1819                 *pos = mc->next;
1820         pthread_mutex_unlock(&id_priv->mut);
1821         if (!mc)
1822                 return ERR(EADDRNOTAVAIL);
1823
1824         if (id->qp)
1825                 ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid);
1826
1827         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp);
1828         cmd.id = mc->handle;
1829
1830         ret = write(id->channel->fd, &cmd, sizeof cmd);
1831         if (ret != sizeof cmd) {
1832                 ret = (ret >= 0) ? ERR(ENODATA) : -1;
1833                 goto free;
1834         }
1835
1836         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1837
1838         pthread_mutex_lock(&id_priv->mut);
1839         while (mc->events_completed < resp.events_reported)
1840                 pthread_cond_wait(&mc->cond, &id_priv->mut);
1841         pthread_mutex_unlock(&id_priv->mut);
1842
1843         ret = 0;
1844 free:
1845         free(mc);
1846         return ret;
1847 }
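
/*
 * Usage sketch (illustrative, placeholder names 'id' and 'mcast_addr', no
 * error handling): joining and later leaving a multicast group on a UD id.
 * The address would normally come from rdma_getaddrinfo() or a resolved IP
 * multicast group, and the id should already be bound or address-resolved.
 *
 *	struct rdma_cm_event *ev;
 *
 *	rdma_join_multicast(id, mcast_addr, NULL);
 *	rdma_get_cm_event(id->channel, &ev);
 *	if (ev->event == RDMA_CM_EVENT_MULTICAST_JOIN) {
 *		// ev->param.ud carries the AH attributes, QPN and QKey to
 *		// use when posting sends to the group
 *	}
 *	rdma_ack_cm_event(ev);
 *	// ... later ...
 *	rdma_leave_multicast(id, mcast_addr);
 */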
1848
1849 static void ucma_complete_event(struct cma_id_private *id_priv)
1850 {
1851         pthread_mutex_lock(&id_priv->mut);
1852         id_priv->events_completed++;
1853         pthread_cond_signal(&id_priv->cond);
1854         pthread_mutex_unlock(&id_priv->mut);
1855 }
1856
1857 static void ucma_complete_mc_event(struct cma_multicast *mc)
1858 {
1859         pthread_mutex_lock(&mc->id_priv->mut);
1860         mc->events_completed++;
1861         pthread_cond_signal(&mc->cond);
1862         mc->id_priv->events_completed++;
1863         pthread_cond_signal(&mc->id_priv->cond);
1864         pthread_mutex_unlock(&mc->id_priv->mut);
1865 }
1866
1867 int rdma_ack_cm_event(struct rdma_cm_event *event)
1868 {
1869         struct cma_event *evt;
1870
1871         if (!event)
1872                 return ERR(EINVAL);
1873
1874         evt = container_of(event, struct cma_event, event);
1875
1876         if (evt->mc)
1877                 ucma_complete_mc_event(evt->mc);
1878         else
1879                 ucma_complete_event(evt->id_priv);
1880         free(evt);
1881         return 0;
1882 }
1883
1884 static void ucma_process_addr_resolved(struct cma_event *evt)
1885 {
1886         if (af_ib_support) {
1887                 evt->event.status = ucma_query_addr(&evt->id_priv->id);
1888                 if (!evt->event.status &&
1889                     evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB)
1890                         evt->event.status = ucma_query_gid(&evt->id_priv->id);
1891         } else {
1892                 evt->event.status = ucma_query_route(&evt->id_priv->id);
1893         }
1894
1895         if (evt->event.status)
1896                 evt->event.event = RDMA_CM_EVENT_ADDR_ERROR;
1897 }
1898
1899 static void ucma_process_route_resolved(struct cma_event *evt)
1900 {
1901         if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB)
1902                 return;
1903
1904         if (af_ib_support)
1905                 evt->event.status = ucma_query_path(&evt->id_priv->id);
1906         else
1907                 evt->event.status = ucma_query_route(&evt->id_priv->id);
1908
1909         if (evt->event.status)
1910                 evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
1911 }
1912
1913 static int ucma_query_req_info(struct rdma_cm_id *id)
1914 {
1915         int ret;
1916
1917         if (!af_ib_support)
1918                 return ucma_query_route(id);
1919
1920         ret = ucma_query_addr(id);
1921         if (ret)
1922                 return ret;
1923
1924         ret = ucma_query_gid(id);
1925         if (ret)
1926                 return ret;
1927
1928         ret = ucma_query_path(id);
1929         if (ret)
1930                 return ret;
1931
1932         return 0;
1933 }
1934
1935 static int ucma_process_conn_req(struct cma_event *evt,
1936                                  uint32_t handle)
1937 {
1938         struct cma_id_private *id_priv;
1939         int ret;
1940
1941         id_priv = ucma_alloc_id(evt->id_priv->id.channel,
1942                                 evt->id_priv->id.context, evt->id_priv->id.ps,
1943                                 evt->id_priv->id.qp_type);
1944         if (!id_priv) {
1945                 ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle);
1946                 ret = ERR(ENOMEM);
1947                 goto err1;
1948         }
1949
1950         evt->event.listen_id = &evt->id_priv->id;
1951         evt->event.id = &id_priv->id;
1952         id_priv->handle = handle;
1953         ucma_insert_id(id_priv);
1954         id_priv->initiator_depth = evt->event.param.conn.initiator_depth;
1955         id_priv->responder_resources = evt->event.param.conn.responder_resources;
1956
1957         if (evt->id_priv->sync) {
1958                 ret = rdma_migrate_id(&id_priv->id, NULL);
1959                 if (ret)
1960                         goto err2;
1961         }
1962
1963         ret = ucma_query_req_info(&id_priv->id);
1964         if (ret)
1965                 goto err2;
1966
1967         return 0;
1968
1969 err2:
1970         rdma_destroy_id(&id_priv->id);
1971 err1:
1972         ucma_complete_event(evt->id_priv);
1973         return ret;
1974 }
1975
1976 static int ucma_process_conn_resp(struct cma_id_private *id_priv)
1977 {
1978         struct ucma_abi_accept cmd;
1979         int ret;
1980
1981         ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES);
1982         if (ret)
1983                 goto err;
1984
1985         ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH);
1986         if (ret)
1987                 goto err;
1988
1989         CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1990         cmd.id = id_priv->handle;
1991
1992         ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
1993         if (ret != sizeof cmd) {
1994                 ret = (ret >= 0) ? ERR(ENODATA) : -1;
1995                 goto err;
1996         }
1997
1998         return 0;
1999 err:
2000         ucma_modify_qp_err(&id_priv->id);
2001         return ret;
2002 }
2003
2004 static int ucma_process_join(struct cma_event *evt)
2005 {
2006         evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid;
2007         evt->mc->mlid = evt->event.param.ud.ah_attr.dlid;
2008
2009         if (!evt->id_priv->id.qp)
2010                 return 0;
2011
2012         return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp,
2013                                               &evt->mc->mgid, evt->mc->mlid));
2014 }
2015
2016 static void ucma_copy_conn_event(struct cma_event *event,
2017                                  struct ucma_abi_conn_param *src)
2018 {
2019         struct rdma_conn_param *dst = &event->event.param.conn;
2020
2021         dst->private_data_len = src->private_data_len;
2022         if (src->private_data_len) {
2023                 dst->private_data = &event->private_data;
2024                 memcpy(&event->private_data, src->private_data,
2025                        src->private_data_len);
2026         }
2027
2028         dst->responder_resources = src->responder_resources;
2029         dst->initiator_depth = src->initiator_depth;
2030         dst->flow_control = src->flow_control;
2031         dst->retry_count = src->retry_count;
2032         dst->rnr_retry_count = src->rnr_retry_count;
2033         dst->srq = src->srq;
2034         dst->qp_num = src->qp_num;
2035 }
2036
2037 static void ucma_copy_ud_event(struct cma_event *event,
2038                                struct ucma_abi_ud_param *src)
2039 {
2040         struct rdma_ud_param *dst = &event->event.param.ud;
2041
2042         dst->private_data_len = src->private_data_len;
2043         if (src->private_data_len) {
2044                 dst->private_data = &event->private_data;
2045                 memcpy(&event->private_data, src->private_data,
2046                        src->private_data_len);
2047         }
2048
2049         ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr);
2050         dst->qp_num = src->qp_num;
2051         dst->qkey = src->qkey;
2052 }
2053
2054 int rdma_get_cm_event(struct rdma_event_channel *channel,
2055                       struct rdma_cm_event **event)
2056 {
2057         struct ucma_abi_event_resp resp;
2058         struct ucma_abi_get_event cmd;
2059         struct cma_event *evt;
2060         int ret;
2061
2062         ret = ucma_init();
2063         if (ret)
2064                 return ret;
2065
2066         if (!event)
2067                 return ERR(EINVAL);
2068
2069         evt = malloc(sizeof(*evt));
2070         if (!evt)
2071                 return ERR(ENOMEM);
2072
2073 retry:
2074         memset(evt, 0, sizeof(*evt));
2075         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp);
2076         ret = write(channel->fd, &cmd, sizeof cmd);
2077         if (ret != sizeof cmd) {
2078                 free(evt);
2079                 return (ret >= 0) ? ERR(ENODATA) : -1;
2080         }
2081
2082         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2083
2084         evt->event.event = resp.event;
2085         /*
2086          * We should have a non-zero uid, except for connection requests.
2087          * But a bug in older kernels can report a uid of 0.  Work around this
2088          * issue by looking up the cma_id based on the kernel's id when the
2089          * uid is 0 and we're processing a connection established event.
2090          * In all other cases, if the uid is 0, we discard the event, like
2091          * the kernel should have done.
2092          */
2093         if (resp.uid) {
2094                 evt->id_priv = (void *) (uintptr_t) resp.uid;
2095         } else {
2096                 evt->id_priv = ucma_lookup_id(resp.id);
2097                 if (!evt->id_priv) {
2098                         syslog(LOG_WARNING, PFX "Warning: discarding unmatched "
2099                                 "event - rdma_destroy_id may hang.\n");
2100                         goto retry;
2101                 }
2102                 if (resp.event != RDMA_CM_EVENT_ESTABLISHED) {
2103                         ucma_complete_event(evt->id_priv);
2104                         goto retry;
2105                 }
2106         }
2107         evt->event.id = &evt->id_priv->id;
2108         evt->event.status = resp.status;
2109
2110         switch (resp.event) {
2111         case RDMA_CM_EVENT_ADDR_RESOLVED:
2112                 ucma_process_addr_resolved(evt);
2113                 break;
2114         case RDMA_CM_EVENT_ROUTE_RESOLVED:
2115                 ucma_process_route_resolved(evt);
2116                 break;
2117         case RDMA_CM_EVENT_CONNECT_REQUEST:
2118                 evt->id_priv = (void *) (uintptr_t) resp.uid;
2119                 if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
2120                         ucma_copy_ud_event(evt, &resp.param.ud);
2121                 else
2122                         ucma_copy_conn_event(evt, &resp.param.conn);
2123
2124                 ret = ucma_process_conn_req(evt, resp.id);
2125                 if (ret)
2126                         goto retry;
2127                 break;
2128         case RDMA_CM_EVENT_CONNECT_RESPONSE:
2129                 ucma_copy_conn_event(evt, &resp.param.conn);
2130                 evt->event.status = ucma_process_conn_resp(evt->id_priv);
2131                 if (!evt->event.status)
2132                         evt->event.event = RDMA_CM_EVENT_ESTABLISHED;
2133                 else {
2134                         evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
2135                         evt->id_priv->connect_error = 1;
2136                 }
2137                 break;
2138         case RDMA_CM_EVENT_ESTABLISHED:
2139                 if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) {
2140                         ucma_copy_ud_event(evt, &resp.param.ud);
2141                         break;
2142                 }
2143
2144                 ucma_copy_conn_event(evt, &resp.param.conn);
2145                 break;
2146         case RDMA_CM_EVENT_REJECTED:
2147                 if (evt->id_priv->connect_error) {
2148                         ucma_complete_event(evt->id_priv);
2149                         goto retry;
2150                 }
2151                 ucma_copy_conn_event(evt, &resp.param.conn);
2152                 ucma_modify_qp_err(evt->event.id);
2153                 break;
2154         case RDMA_CM_EVENT_DISCONNECTED:
2155                 if (evt->id_priv->connect_error) {
2156                         ucma_complete_event(evt->id_priv);
2157                         goto retry;
2158                 }
2159                 ucma_copy_conn_event(evt, &resp.param.conn);
2160                 break;
2161         case RDMA_CM_EVENT_MULTICAST_JOIN:
2162                 evt->mc = (void *) (uintptr_t) resp.uid;
2163                 evt->id_priv = evt->mc->id_priv;
2164                 evt->event.id = &evt->id_priv->id;
2165                 ucma_copy_ud_event(evt, &resp.param.ud);
2166                 evt->event.param.ud.private_data = evt->mc->context;
2167                 evt->event.status = ucma_process_join(evt);
2168                 if (evt->event.status)
2169                         evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
2170                 break;
2171         case RDMA_CM_EVENT_MULTICAST_ERROR:
2172                 evt->mc = (void *) (uintptr_t) resp.uid;
2173                 evt->id_priv = evt->mc->id_priv;
2174                 evt->event.id = &evt->id_priv->id;
2175                 evt->event.param.ud.private_data = evt->mc->context;
2176                 break;
2177         default:
2178                 evt->id_priv = (void *) (uintptr_t) resp.uid;
2179                 evt->event.id = &evt->id_priv->id;
2180                 evt->event.status = resp.status;
2181                 if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
2182                         ucma_copy_ud_event(evt, &resp.param.ud);
2183                 else
2184                         ucma_copy_conn_event(evt, &resp.param.conn);
2185                 break;
2186         }
2187
2188         *event = &evt->event;
2189         return 0;
2190 }
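
/*
 * Usage sketch (illustrative, placeholder name 'ec', assumes <stdio.h>): a
 * simple event loop.  rdma_get_cm_event() blocks unless the channel fd has
 * been made non-blocking, and every returned event must be acknowledged with
 * rdma_ack_cm_event() or a later rdma_destroy_id() can hang waiting for it.
 *
 *	struct rdma_cm_event *ev;
 *
 *	while (!rdma_get_cm_event(ec, &ev)) {
 *		printf("%s on id %p, status %d\n",
 *		       rdma_event_str(ev->event), (void *) ev->id, ev->status);
 *		// dispatch on ev->event here
 *		rdma_ack_cm_event(ev);
 *	}
 */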
2191
2192 const char *rdma_event_str(enum rdma_cm_event_type event)
2193 {
2194         switch (event) {
2195         case RDMA_CM_EVENT_ADDR_RESOLVED:
2196                 return "RDMA_CM_EVENT_ADDR_RESOLVED";
2197         case RDMA_CM_EVENT_ADDR_ERROR:
2198                 return "RDMA_CM_EVENT_ADDR_ERROR";
2199         case RDMA_CM_EVENT_ROUTE_RESOLVED:
2200                 return "RDMA_CM_EVENT_ROUTE_RESOLVED";
2201         case RDMA_CM_EVENT_ROUTE_ERROR:
2202                 return "RDMA_CM_EVENT_ROUTE_ERROR";
2203         case RDMA_CM_EVENT_CONNECT_REQUEST:
2204                 return "RDMA_CM_EVENT_CONNECT_REQUEST";
2205         case RDMA_CM_EVENT_CONNECT_RESPONSE:
2206                 return "RDMA_CM_EVENT_CONNECT_RESPONSE";
2207         case RDMA_CM_EVENT_CONNECT_ERROR:
2208                 return "RDMA_CM_EVENT_CONNECT_ERROR";
2209         case RDMA_CM_EVENT_UNREACHABLE:
2210                 return "RDMA_CM_EVENT_UNREACHABLE";
2211         case RDMA_CM_EVENT_REJECTED:
2212                 return "RDMA_CM_EVENT_REJECTED";
2213         case RDMA_CM_EVENT_ESTABLISHED:
2214                 return "RDMA_CM_EVENT_ESTABLISHED";
2215         case RDMA_CM_EVENT_DISCONNECTED:
2216                 return "RDMA_CM_EVENT_DISCONNECTED";
2217         case RDMA_CM_EVENT_DEVICE_REMOVAL:
2218                 return "RDMA_CM_EVENT_DEVICE_REMOVAL";
2219         case RDMA_CM_EVENT_MULTICAST_JOIN:
2220                 return "RDMA_CM_EVENT_MULTICAST_JOIN";
2221         case RDMA_CM_EVENT_MULTICAST_ERROR:
2222                 return "RDMA_CM_EVENT_MULTICAST_ERROR";
2223         case RDMA_CM_EVENT_ADDR_CHANGE:
2224                 return "RDMA_CM_EVENT_ADDR_CHANGE";
2225         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
2226                 return "RDMA_CM_EVENT_TIMEWAIT_EXIT";
2227         default:
2228                 return "UNKNOWN EVENT";
2229         }
2230 }
2231
2232 int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
2233                     void *optval, size_t optlen)
2234 {
2235         struct ucma_abi_set_option cmd;
2236         struct cma_id_private *id_priv;
2237         int ret;
2238
2239         CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION);
2240         id_priv = container_of(id, struct cma_id_private, id);
2241         cmd.id = id_priv->handle;
2242         cmd.optval = (uintptr_t) optval;
2243         cmd.level = level;
2244         cmd.optname = optname;
2245         cmd.optlen = optlen;
2246
2247         ret = write(id->channel->fd, &cmd, sizeof cmd);
2248         if (ret != sizeof cmd)
2249                 return (ret >= 0) ? ERR(ENODATA) : -1;
2250
2251         return 0;
2252 }
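
/*
 * Usage sketch (illustrative): setting the type-of-service byte on an id
 * before connecting, using the RDMA_OPTION_ID / RDMA_OPTION_ID_TOS level and
 * option name from <rdma/rdma_cma.h>.  The value shown is arbitrary.
 *
 *	uint8_t tos = 0x10;
 *
 *	rdma_set_option(id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS,
 *			&tos, sizeof tos);
 */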
2253
2254 int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
2255 {
2256         struct ucma_abi_migrate_resp resp;
2257         struct ucma_abi_migrate_id cmd;
2258         struct cma_id_private *id_priv;
2259         int ret, sync;
2260
2261         id_priv = container_of(id, struct cma_id_private, id);
2262         if (id_priv->sync && !channel)
2263                 return ERR(EINVAL);
2264
2265         if ((sync = (channel == NULL))) {
2266                 channel = rdma_create_event_channel();
2267                 if (!channel)
2268                         return -1;
2269         }
2270
2271         CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp);
2272         cmd.id = id_priv->handle;
2273         cmd.fd = id->channel->fd;
2274
2275         ret = write(channel->fd, &cmd, sizeof cmd);
2276         if (ret != sizeof cmd) {
2277                 if (sync)
2278                         rdma_destroy_event_channel(channel);
2279                 return (ret >= 0) ? ERR(ENODATA) : -1;
2280         }
2281
2282         VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2283
2284         if (id_priv->sync) {
2285                 if (id->event) {
2286                         rdma_ack_cm_event(id->event);
2287                         id->event = NULL;
2288                 }
2289                 rdma_destroy_event_channel(id->channel);
2290         }
2291
2292         /*
2293          * Eventually, if we want to support migrating channels while events are
2294          * being processed on the current channel, we will need to block here
2295          * until any outstanding events on the current channel for this id have
2296          * been handled, to prevent the user from processing events for this id
2297          * on the old channel after this call returns.
2298          */
2299         pthread_mutex_lock(&id_priv->mut);
2300         id_priv->sync = sync;
2301         id->channel = channel;
2302         while (id_priv->events_completed < resp.events_reported)
2303                 pthread_cond_wait(&id_priv->cond, &id_priv->mut);
2304         pthread_mutex_unlock(&id_priv->mut);
2305
2306         return 0;
2307 }
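
/*
 * Usage sketch (illustrative, placeholder names, no error handling): moving
 * an accepted id from the listener's event channel to its own channel so a
 * worker thread can poll it independently.  Any event already retrieved for
 * the id should be acknowledged first.
 *
 *	struct rdma_event_channel *worker_ec = rdma_create_event_channel();
 *
 *	rdma_migrate_id(conn_id, worker_ec);
 *	// conn_id's events are now reported on worker_ec->fd
 */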
2308
2309 static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res,
2310                            struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2311 {
2312         struct cma_id_private *id_priv;
2313         int ret;
2314
2315         if (af_ib_support)
2316                 ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len);
2317         else
2318                 ret = rdma_bind_addr(id, res->ai_src_addr);
2319         if (ret)
2320                 return ret;
2321
2322         id_priv = container_of(id, struct cma_id_private, id);
2323         if (pd)
2324                 id->pd = pd;
2325
2326         if (qp_init_attr) {
2327                 id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr));
2328                 if (!id_priv->qp_init_attr)
2329                         return ERR(ENOMEM);
2330
2331                 *id_priv->qp_init_attr = *qp_init_attr;
2332                 id_priv->qp_init_attr->qp_type = res->ai_qp_type;
2333         }
2334
2335         return 0;
2336 }
2337
2338 int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res,
2339                    struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2340 {
2341         struct rdma_cm_id *cm_id;
2342         struct cma_id_private *id_priv;
2343         int ret;
2344
2345         ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type);
2346         if (ret)
2347                 return ret;
2348
2349         if (res->ai_flags & RAI_PASSIVE) {
2350                 ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr);
2351                 if (ret)
2352                         goto err;
2353                 goto out;
2354         }
2355
2356         if (af_ib_support)
2357                 ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len,
2358                                          res->ai_dst_addr, res->ai_dst_len, 2000);
2359         else
2360                 ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000);
2361         if (ret)
2362                 goto err;
2363
2364         if (res->ai_route_len) {
2365                 ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
2366                                       res->ai_route, res->ai_route_len);
2367                 if (!ret)
2368                         ret = ucma_complete(cm_id);
2369         } else {
2370                 ret = rdma_resolve_route(cm_id, 2000);
2371         }
2372         if (ret)
2373                 goto err;
2374
2375         if (qp_init_attr) {
2376                 qp_init_attr->qp_type = res->ai_qp_type;
2377                 ret = rdma_create_qp(cm_id, pd, qp_init_attr);
2378                 if (ret)
2379                         goto err;
2380         }
2381
2382         if (res->ai_connect_len) {
2383                 id_priv = container_of(cm_id, struct cma_id_private, id);
2384                 id_priv->connect = malloc(res->ai_connect_len);
2385                 if (!id_priv->connect) {
2386                         ret = ERR(ENOMEM);
2387                         goto err;
2388                 }
2389                 memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len);
2390                 id_priv->connect_len = res->ai_connect_len;
2391         }
2392
2393 out:
2394         *id = cm_id;
2395         return 0;
2396
2397 err:
2398         rdma_destroy_ep(cm_id);
2399         return ret;
2400 }
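
/*
 * Usage sketch (illustrative; the host name, port and queue sizes are
 * placeholders and errors are reduced to early returns): the synchronous
 * client path built on rdma_getaddrinfo() and rdma_create_ep().
 *
 *	struct rdma_addrinfo hints = { 0 }, *res;
 *	struct ibv_qp_init_attr qpa = { 0 };
 *	struct rdma_cm_id *id;
 *
 *	hints.ai_port_space = RDMA_PS_TCP;
 *	hints.ai_qp_type = IBV_QPT_RC;
 *	if (rdma_getaddrinfo("server.example.net", "7471", &hints, &res))
 *		return -1;
 *
 *	qpa.cap.max_send_wr = qpa.cap.max_recv_wr = 16;
 *	qpa.cap.max_send_sge = qpa.cap.max_recv_sge = 1;
 *	if (rdma_create_ep(&id, res, NULL, &qpa)) {
 *		rdma_freeaddrinfo(res);
 *		return -1;
 *	}
 *	rdma_freeaddrinfo(res);
 *
 *	if (rdma_connect(id, NULL)) {
 *		rdma_destroy_ep(id);
 *		return -1;
 *	}
 *	// ... post and poll work requests ...
 *	rdma_disconnect(id);
 *	rdma_destroy_ep(id);
 */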
2401
2402 void rdma_destroy_ep(struct rdma_cm_id *id)
2403 {
2404         struct cma_id_private *id_priv;
2405
2406         if (id->qp)
2407                 rdma_destroy_qp(id);
2408
2409         if (id->srq)
2410                 rdma_destroy_srq(id);
2411
2412         id_priv = container_of(id, struct cma_id_private, id);
2413         if (id_priv->qp_init_attr)
2414                 free(id_priv->qp_init_attr);
2415
2416         rdma_destroy_id(id);
2417 }
2418
2419 int ucma_max_qpsize(struct rdma_cm_id *id)
2420 {
2421         struct cma_id_private *id_priv;
2422         int i, max_size = 0;
2423
2424         id_priv = container_of(id, struct cma_id_private, id);
2425         if (id && id_priv->cma_dev) {
2426                 max_size = id_priv->cma_dev->max_qpsize;
2427         } else {
2428                 ucma_init_all();
2429                 for (i = 0; i < cma_dev_cnt; i++) {
2430                         if (!max_size || max_size > cma_dev_array[i].max_qpsize)
2431                                 max_size = cma_dev_array[i].max_qpsize;
2432                 }
2433         }
2434         return max_size;
2435 }
2436
2437 __be16 ucma_get_port(struct sockaddr *addr)
2438 {
2439         switch (addr->sa_family) {
2440         case AF_INET:
2441                 return ((struct sockaddr_in *) addr)->sin_port;
2442         case AF_INET6:
2443                 return ((struct sockaddr_in6 *) addr)->sin6_port;
2444         case AF_IB:
2445                 return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid));
2446         default:
2447                 return 0;
2448         }
2449 }
2450
2451 __be16 rdma_get_src_port(struct rdma_cm_id *id)
2452 {
2453         return ucma_get_port(&id->route.addr.src_addr);
2454 }
2455
2456 __be16 rdma_get_dst_port(struct rdma_cm_id *id)
2457 {
2458         return ucma_get_port(&id->route.addr.dst_addr);
2459 }
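
/*
 * Usage sketch (illustrative, assumes <stdio.h>): both helpers return the
 * port in network byte order, so convert before printing.
 *
 *	printf("local port %u, remote port %u\n",
 *	       (unsigned) be16toh(rdma_get_src_port(id)),
 *	       (unsigned) be16toh(rdma_get_dst_port(id)));
 */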
2460