2 * Copyright (c) 2010-2012 Intel Corporation. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 #include <sys/types.h>
38 #include <sys/socket.h>
43 #include <rdma/rdma_cma.h>
44 #include <infiniband/ib.h>
45 #include <infiniband/sa.h>
/*
 * ACM message opcodes. ACM_OP_RESOLVE is the opcode this file writes into
 * the header of every request it sends.
 * NOTE(review): ACM_OP_ACK is not referenced by the code visible here.
 */
49 #define ACM_OP_RESOLVE 0x01
50 #define ACM_OP_ACK 0x80
/*
 * Status codes. The visible code only tests msg.hdr.status for non-zero
 * (any non-zero value rejects the reply); presumably these are the values
 * that field carries - confirm. Value 4 is not defined in this listing.
 */
52 #define ACM_STATUS_SUCCESS 0
53 #define ACM_STATUS_ENOMEM 1
54 #define ACM_STATUS_EINVAL 2
55 #define ACM_STATUS_ENODATA 3
56 #define ACM_STATUS_ENOTCONN 5
57 #define ACM_STATUS_ETIMEDOUT 6
58 #define ACM_STATUS_ESRCADDR 7
59 #define ACM_STATUS_ESRCTYPE 8
60 #define ACM_STATUS_EDESTADDR 9
61 #define ACM_STATUS_EDESTTYPE 10
/*
 * OR-ed into the flags of the destination endpoint entry when the caller's
 * hints contain RAI_NUMERICHOST or RAI_NOROUTE.
 */
63 #define ACM_FLAGS_NODELAY (1<<30)
/*
 * Message geometry, in bytes:
 *  - ACM_MSG_HDR_LENGTH: starting value of hdr.length for a request.
 *  - ACM_MAX_ADDRESS:    size of the addr/name byte arrays of an endpoint.
 *  - ACM_MSG_EP_LENGTH:  added to hdr.length for each endpoint entry.
 *  - ACM_MSG_DATA_LENGTH: payload buffer size, room for 8 endpoint entries.
 */
65 #define ACM_MSG_HDR_LENGTH 16
66 #define ACM_MAX_ADDRESS 64
67 #define ACM_MSG_EP_LENGTH 72
68 #define ACM_MSG_DATA_LENGTH (ACM_MSG_EP_LENGTH * 8)
/*
 * Values stored in the "type" field of an endpoint entry; they select how
 * the entry's info payload is interpreted (IPv4 address, IPv6 address or
 * path record).
 * NOTE(review): ACM_EP_INFO_NAME is not used by the code visible here.
 */
79 #define ACM_EP_INFO_NAME 0x0001
80 #define ACM_EP_INFO_ADDRESS_IP 0x0002
81 #define ACM_EP_INFO_ADDRESS_IP6 0x0003
82 #define ACM_EP_INFO_PATH 0x0010
/*
 * Alternative payloads of an endpoint entry, accessed elsewhere in this
 * file as info.addr and info.path.
 * NOTE(review): the line opening this declaration is missing from this
 * listing; the "union acm_ep_info info" member below suggests these are the
 * members of union acm_ep_info - confirm.
 */
85 uint8_t addr[ACM_MAX_ADDRESS];
86 uint8_t name[ACM_MAX_ADDRESS];
87 struct ibv_path_record path;
/* Bits of an endpoint entry's flags: marks it as the source or destination. */
90 #define ACM_EP_FLAG_SOURCE (1<<0)
91 #define ACM_EP_FLAG_DEST (1<<1)
/*
 * One endpoint entry of a resolve message.
 * NOTE(review): the code in this file also reads and writes "flags" and
 * "type" members of this struct; their declarations are not part of this
 * listing.
 */
93 struct acm_ep_addr_data {
97 union acm_ep_info info;
/*
 * Resolve message whose endpoint entries follow in a zero-length trailing
 * array. NOTE(review): remaining members are not visible in this listing.
 */
100 struct acm_resolve_msg {
102 struct acm_ep_addr_data data[0];
/*
 * Payload members of the message buffer used for send/recv: a raw byte
 * view of ACM_MSG_DATA_LENGTH bytes and an endpoint-entry view, indexed
 * elsewhere as msg.resolve_data[i].
 * NOTE(review): the enclosing declaration lines are missing; presumably the
 * two views overlay each other inside struct acm_msg - confirm.
 */
108 uint8_t data[ACM_MSG_DATA_LENGTH];
109 struct acm_ep_addr_data resolve_data[0];
/*
 * File-scope state for the connection to the ACM service.
 * acm_lock is taken in ucma_ib_init() and around the send/recv exchange in
 * ucma_ib_resolve().
 */
113 static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER;
/* Socket used for the exchange; starts out as -1. */
114 static int sock = -1;
/* TCP port of the service, filled in by ucma_set_server_port(). */
115 static uint16_t server_port;
/*
 * Read the service port number from IBACM_PORT_FILE into server_port.
 *
 * NOTE(review): the declaration of f, the handling of a failed read and the
 * return statement are missing from this listing. ucma_ib_init() tests the
 * result for zero, so presumably the port (0 when unavailable) is returned
 * - confirm.
 */
117 static int ucma_set_server_port(void)
/* STREAM_CLOEXEC is pasted onto the fopen() mode string. */
121 if ((f = fopen(IBACM_PORT_FILE, "r" STREAM_CLOEXEC))) {
/* Anything other than exactly one parsed 16-bit value is a failed read. */
122 if (fscanf(f, "%" SCNu16, &server_port) != 1)
/*
 * Open the TCP connection to the ACM service on the loopback interface.
 *
 * Under acm_lock: obtains the port via ucma_set_server_port(), creates a
 * close-on-exec TCP socket in "sock" and connects it to
 * INADDR_LOOPBACK:server_port.
 *
 * NOTE(review): the declaration of ret, all failure handling (missing port,
 * socket() or connect() errors) and any "already initialised" guard are not
 * visible in this listing.
 */
129 void ucma_ib_init(void)
131 struct sockaddr_in addr;
138 pthread_mutex_lock(&acm_lock);
/* A zero result means no usable port was read. */
142 if (!ucma_set_server_port())
145 sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
/* Address and port are converted to network (big-endian) byte order. */
149 memset(&addr, 0, sizeof addr);
150 addr.sin_family = AF_INET;
151 addr.sin_addr.s_addr = htobe32(INADDR_LOOPBACK);
152 addr.sin_port = htobe16(server_port);
153 ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr));
161 pthread_mutex_unlock(&acm_lock);
/*
 * Shut down both directions of the service socket.
 *
 * NOTE(review): other lines of this function (e.g. a validity check on
 * sock, or closing the descriptor) are not visible in this listing.
 */
164 void ucma_ib_cleanup(void)
167 shutdown(sock, SHUT_RDWR);
/*
 * Build AF_IB source and destination addresses for ib_rai from its path
 * record.
 *
 * ib_rai: entry being filled; ai_route is read as a struct ibv_path_data and
 *         ai_src_addr/ai_dst_addr (+ lengths) are set to newly allocated
 *         struct sockaddr_ib objects.
 * rai:    original entry; its src/dst addresses are passed to ucma_set_sid().
 *
 * NOTE(review): allocation-failure handling and the return statements are
 * missing from this listing.
 */
172 static int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai,
173 struct rdma_addrinfo *rai)
175 struct sockaddr_ib *src, *dst;
176 struct ibv_path_record *path;
/* Both addresses start zero-filled. */
178 src = calloc(1, sizeof(*src));
182 dst = calloc(1, sizeof(*dst));
188 path = &((struct ibv_path_data *) ib_rai->ai_route)->path;
/* Source: pkey and flow info from the path record, address from its SGID. */
190 src->sib_family = AF_IB;
191 src->sib_pkey = path->pkey;
/*
 * flowlabel_hoplimit is converted to host order, shifted right by 8 bits and
 * stored back big-endian (presumably dropping the hop-limit byte - confirm).
 */
192 src->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8);
193 memcpy(&src->sib_addr, &path->sgid, 16);
194 ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src);
/* Destination: same fields, address taken from the DGID. */
196 dst->sib_family = AF_IB;
197 dst->sib_pkey = path->pkey;
198 dst->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8);
199 memcpy(&dst->sib_addr, &path->dgid, 16);
200 ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst);
/* Hand the new addresses over to ib_rai. */
202 ib_rai->ai_src_addr = (struct sockaddr *) src;
203 ib_rai->ai_src_len = sizeof(*src);
205 ib_rai->ai_dst_addr = (struct sockaddr *) dst;
206 ib_rai->ai_dst_len = sizeof(*dst);
/*
 * Attach a struct ib_connect_hdr carrying rai's IP source and destination
 * addresses to ib_rai as its connect data.
 *
 * ib_rai: entry whose ai_connect / ai_connect_len are set.
 * rai:    original entry providing ai_family and the IP addresses.
 *
 * NOTE(review): the statement taken when rai is already AF_IB, the
 * allocation-failure handling, the "else" line between the two branches
 * and the return statements are missing from this listing.
 */
211 static int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai,
212 struct rdma_addrinfo *rai)
214 struct ib_connect_hdr *hdr;
216 if (rai->ai_family == AF_IB)
219 hdr = calloc(1, sizeof(*hdr));
/* IPv4: version 4 goes in the upper four bits of ip_version. */
223 if (rai->ai_family == AF_INET) {
224 hdr->ip_version = 4 << 4;
225 memcpy(&hdr->cma_src_ip4,
226 &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4);
227 memcpy(&hdr->cma_dst_ip4,
228 &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4);
/* Otherwise the addresses are read as IPv6 (16 bytes each). */
230 hdr->ip_version = 6 << 4;
231 memcpy(&hdr->cma_src_ip6,
232 &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16);
233 memcpy(&hdr->cma_dst_ip6,
234 &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16);
237 ib_rai->ai_connect = hdr;
238 ib_rai->ai_connect_len = sizeof(*hdr);
/*
 * Create an AF_IB rdma_addrinfo derived from *rai and chain the original
 * entry behind it.
 *
 * The new entry copies flags, QP type, port space and the route data of
 * *rai, duplicates its canonical names, and gets its connect data and
 * addresses from ucma_ib_set_connect() and ucma_ib_set_addr().
 *
 * NOTE(review): the failure branches and the final update of *rai are
 * missing from this listing; presumably *rai is pointed at the new entry on
 * success and rdma_freeaddrinfo(ib_rai) is the error path - confirm.
 */
242 static void ucma_resolve_af_ib(struct rdma_addrinfo **rai)
244 struct rdma_addrinfo *ib_rai;
246 ib_rai = calloc(1, sizeof(*ib_rai));
250 ib_rai->ai_flags = (*rai)->ai_flags;
251 ib_rai->ai_family = AF_IB;
252 ib_rai->ai_qp_type = (*rai)->ai_qp_type;
253 ib_rai->ai_port_space = (*rai)->ai_port_space;
/* Private copy of the route data, so the new entry owns its own buffer. */
255 ib_rai->ai_route = calloc(1, (*rai)->ai_route_len);
256 if (!ib_rai->ai_route)
259 memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len);
260 ib_rai->ai_route_len = (*rai)->ai_route_len;
/* Canonical names are duplicated only when present. */
262 if ((*rai)->ai_src_canonname) {
263 ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname);
264 if (!ib_rai->ai_src_canonname)
268 if ((*rai)->ai_dst_canonname) {
269 ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname);
270 if (!ib_rai->ai_dst_canonname)
/* A non-zero result from either helper is treated as failure. */
274 if (ucma_ib_set_connect(ib_rai, *rai))
277 if (ucma_ib_set_addr(ib_rai, *rai))
/* The original entry becomes the successor of the AF_IB entry. */
280 ib_rai->ai_next = *rai;
285 rdma_freeaddrinfo(ib_rai);
/*
 * Store the contents of a resolve reply in rai.
 *
 * Walks the endpoint entries of msg: path entries are collected for
 * rai->ai_route, and an IPv4/IPv6 entry flagged as source supplies
 * rai->ai_src_addr when rai has no source address yet.
 *
 * NOTE(review): several lines are missing from this listing - the updates
 * of path_cnt, the statements guarded by the "if" lines below, the break
 * statements and the allocation checks.
 */
288 static void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg)
290 struct acm_ep_addr_data *ep_data;
291 struct ibv_path_data *path_data = NULL;
292 struct sockaddr_in *sin;
293 struct sockaddr_in6 *sin6;
294 int i, cnt, path_cnt = 0;
/* Number of endpoint entries implied by the length field of the header. */
296 cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH;
297 for (i = 0; i < cnt; i++) {
298 ep_data = &msg->resolve_data[i];
299 switch (ep_data->type) {
300 case ACM_EP_INFO_PATH:
/* The entry is reinterpreted in place as a struct ibv_path_data. */
303 path_data = (struct ibv_path_data *) ep_data;
306 case ACM_EP_INFO_ADDRESS_IP:
/* True when the entry is not a source or rai already has a source address. */
307 if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len)
310 sin = calloc(1, sizeof(*sin));
314 sin->sin_family = AF_INET;
315 memcpy(&sin->sin_addr, &ep_data->info.addr, 4);
316 rai->ai_src_len = sizeof(*sin);
317 rai->ai_src_addr = (struct sockaddr *) sin;
319 case ACM_EP_INFO_ADDRESS_IP6:
320 if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len)
323 sin6 = calloc(1, sizeof(*sin6));
327 sin6->sin6_family = AF_INET6;
328 memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16);
329 rai->ai_src_len = sizeof(*sin6);
330 rai->ai_src_addr = (struct sockaddr *) sin6;
/*
 * Copy path_cnt path entries, starting at path_data, into a new route buffer.
 * NOTE(review): this assumes the path entries are contiguous in the reply.
 */
337 rai->ai_route = calloc(path_cnt, sizeof(*path_data));
339 memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data));
340 rai->ai_route_len = path_cnt * sizeof(*path_data);
/*
 * Fill an endpoint entry's type and address bytes from a socket address.
 *
 * data: entry to fill (type and info.addr are written).
 * addr: AF_INET address, copied as 4 bytes; the second pair of statements
 *       handles the IPv6 form (16 bytes).
 *
 * NOTE(review): the "else" line separating the two cases is missing from
 * this listing.
 */
344 static void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr)
346 if (addr->sa_family == AF_INET) {
347 data->type = ACM_EP_INFO_ADDRESS_IP;
348 memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4);
350 data->type = ACM_EP_INFO_ADDRESS_IP6;
351 memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16);
/*
 * Return non-zero when addr is a non-NULL address with non-zero length whose
 * family is AF_INET or AF_INET6; zero otherwise.
 */
355 static int ucma_inet_addr(struct sockaddr *addr, socklen_t len)
/* len and addr are tested first, so addr is only dereferenced when set. */
357 return len && addr && (addr->sa_family == AF_INET ||
358 addr->sa_family == AF_INET6);
/*
 * Return non-zero when addr is a non-NULL address with non-zero length whose
 * family is AF_IB; zero otherwise.
 */
361 static int ucma_ib_addr(struct sockaddr *addr, socklen_t len)
363 return len && addr && (addr->sa_family == AF_IB);
/*
 * Ask the ACM service to resolve the addresses/route of *rai and store the
 * reply in it.
 *
 * rai:   in/out; its source and destination addresses form the request, and
 *        the reply is saved into it by ucma_ib_save_resp(). When AF_IB is
 *        supported, RAI_ROUTEONLY is not set and a route was obtained,
 *        ucma_resolve_af_ib() is run on it as well.
 * hints: caller hints; ai_flags and ai_route/ai_route_len are read.
 *
 * NOTE(review): this listing is missing the declarations of msg and ret,
 * the early exits, the steps that advance "data" between entries, and the
 * handling of a route length matching neither known size.
 */
366 void ucma_ib_resolve(struct rdma_addrinfo **rai,
367 const struct rdma_addrinfo *hints)
370 struct acm_ep_addr_data *data;
/* Start from an all-zero request that contains only the header. */
377 memset(&msg, 0, sizeof msg);
378 msg.hdr.version = ACM_VERSION;
379 msg.hdr.opcode = ACM_OP_RESOLVE;
380 msg.hdr.length = ACM_MSG_HDR_LENGTH;
/* Optional source entry, for an IPv4/IPv6 source address. */
382 data = &msg.resolve_data[0];
383 if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
384 data->flags = ACM_EP_FLAG_SOURCE;
385 ucma_set_ep_addr(data, (*rai)->ai_src_addr);
387 msg.hdr.length += ACM_MSG_EP_LENGTH;
/* Optional destination entry; NODELAY is added for numeric-host/no-route hints. */
390 if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
391 data->flags = ACM_EP_FLAG_DEST;
392 if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE))
393 data->flags |= ACM_FLAGS_NODELAY;
394 ucma_set_ep_addr(data, (*rai)->ai_dst_addr);
396 msg.hdr.length += ACM_MSG_EP_LENGTH;
/*
 * Optional path entry: built when the hints carry route data or either
 * address is AF_IB.
 */
399 if (hints->ai_route_len ||
400 ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) ||
401 ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
402 struct ibv_path_record *path;
/* The hint route may be a bare path record or one wrapped in ibv_path_data. */
404 if (hints->ai_route_len == sizeof(struct ibv_path_record))
405 path = (struct ibv_path_record *) hints->ai_route;
406 else if (hints->ai_route_len == sizeof(struct ibv_path_data))
407 path = &((struct ibv_path_data *) hints->ai_route)->path;
412 memcpy(&data->info.path, path, sizeof(*path));
/* AF_IB addresses overwrite the SGID/DGID of the path entry. */
414 if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
415 memcpy(&data->info.path.sgid,
416 &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16);
418 if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
419 memcpy(&data->info.path.dgid,
420 &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16);
422 data->type = ACM_EP_INFO_PATH;
424 msg.hdr.length += ACM_MSG_EP_LENGTH;
/* The request/reply exchange on the shared socket is serialised by acm_lock. */
427 pthread_mutex_lock(&acm_lock);
428 ret = send(sock, (char *) &msg, msg.hdr.length, 0);
/* A send that did not transmit the whole request releases the lock here. */
429 if (ret != msg.hdr.length) {
430 pthread_mutex_unlock(&acm_lock);
/* The reply is read into the same buffer, overwriting the request. */
434 ret = recv(sock, (char *) &msg, sizeof msg, 0);
435 pthread_mutex_unlock(&acm_lock);
/*
 * Reject a reply shorter than a header, one whose byte count differs from
 * the header's length field, or one with a non-zero status.
 */
436 if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status)
439 ucma_ib_save_resp(*rai, &msg);
441 if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len)
442 ucma_resolve_af_ib(rai);