]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - sys/contrib/rdma/krping/krping.c
Copy head (r256279) to stable/10 as part of the 10.0-RELEASE cycle.
[FreeBSD/stable/10.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/ctype.h>
38
39 #include <sys/param.h>
40 #include <sys/condvar.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/socket.h>
44 #include <sys/endian.h>
45 #include <sys/limits.h>
46 #include <sys/proc.h>
47 #include <sys/signalvar.h>
48
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/rwlock.h>
52 #include <sys/queue.h>
53 #include <sys/taskqueue.h>
54 #include <sys/syslog.h>
55 #include <netinet/in.h>
56
57 #include <vm/vm.h>
58 #include <vm/pmap.h>
59
60 #include <linux/types.h>
61 #include <rdma/rdma_cm.h>
62
63 #include "getopt.h"
64 #include "krping.h"
65
66 #define PFX "krping: "
67
/* Runtime switch for DEBUG_LOG output; any nonzero value enables it. */
static int debug = 0;

/*
 * printf-style debug trace, emitted only while `debug` is nonzero.
 *
 * Wrapped in do { } while (0) so that an invocation is exactly one
 * statement: a bare `if (debug) printf` expansion would capture any
 * `else` that follows a DEBUG_LOG() call in the caller.
 */
#define DEBUG_LOG(...) do { if (debug) printf(__VA_ARGS__); } while (0)
70
71 static const struct krping_option krping_opts[] = {
72         {"count", OPT_INT, 'C'},
73         {"size", OPT_INT, 'S'},
74         {"addr", OPT_STRING, 'a'},
75         {"port", OPT_INT, 'p'},
76         {"verbose", OPT_NOPARAM, 'v'},
77         {"validate", OPT_NOPARAM, 'V'},
78         {"server", OPT_NOPARAM, 's'},
79         {"client", OPT_NOPARAM, 'c'},
80         {"dmamr", OPT_NOPARAM, 'D'},
81         {"debug", OPT_NOPARAM, 'd'},
82         {"wlat", OPT_NOPARAM, 'l'},
83         {"rlat", OPT_NOPARAM, 'L'},
84         {"bw", OPT_NOPARAM, 'B'},
85         {"tx-depth", OPT_INT, 't'},
86         {"poll", OPT_NOPARAM, 'P'},
87         {"memlimit", OPT_INT, 'm'},
88         {NULL, 0, 0}
89 };
90
91 struct mtx krping_mutex;
92
93 /*
94  * List of running krping threads.
95  */
96 struct krping_cb_list krping_cbs;
97
98 /*
99  * krping "ping/pong" loop:
100  *      client sends source rkey/addr/len
101  *      server receives source rkey/addr/len
102  *      server rdma reads "ping" data from source
103  *      server sends "go ahead" on rdma read completion
104  *      client sends sink rkey/addr/len
105  *      server receives sink rkey/addr/len
106  *      server rdma writes "pong" data to sink
107  *      server sends "go ahead" on rdma write completion
108  *      <repeat loop>
109  */
110
111 /*
112  * Default max buffer size for IO...
113  */
/* Parenthesized so the product survives expansion inside larger expressions. */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 32
116
117 static void krping_wait(struct krping_cb *cb, int state)
118 {
119         int rc;
120         mtx_lock(&cb->lock);
121         while (cb->state < state) {
122                 rc = msleep(cb, &cb->lock, PCATCH, "krping", 0);
123                 if (rc && rc != ERESTART) {
124                         cb->state = ERROR;
125                         break;
126                 }
127         }
128         mtx_unlock(&cb->lock);
129 }
130
131 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
132                                    struct rdma_cm_event *event)
133 {
134         int ret;
135         struct krping_cb *cb = cma_id->context;
136
137         DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
138                   (cma_id == cb->cm_id) ? "parent" : "child");
139
140         mtx_lock(&cb->lock);
141         switch (event->event) {
142         case RDMA_CM_EVENT_ADDR_RESOLVED:
143                 cb->state = ADDR_RESOLVED;
144                 ret = rdma_resolve_route(cma_id, 2000);
145                 if (ret) {
146                         log(LOG_ERR, "rdma_resolve_route error %d\n", 
147                                ret);
148                         wakeup(cb);
149                 }
150                 break;
151
152         case RDMA_CM_EVENT_ROUTE_RESOLVED:
153                 cb->state = ROUTE_RESOLVED;
154                 wakeup(cb);
155                 break;
156
157         case RDMA_CM_EVENT_CONNECT_REQUEST:
158                 cb->state = CONNECT_REQUEST;
159                 cb->child_cm_id = cma_id;
160                 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
161                 wakeup(cb);
162                 break;
163
164         case RDMA_CM_EVENT_ESTABLISHED:
165                 DEBUG_LOG(PFX "ESTABLISHED\n");
166                 if (!cb->server) {
167                         cb->state = CONNECTED;
168                         wakeup(cb);
169                 }
170                 break;
171
172         case RDMA_CM_EVENT_ADDR_ERROR:
173         case RDMA_CM_EVENT_ROUTE_ERROR:
174         case RDMA_CM_EVENT_CONNECT_ERROR:
175         case RDMA_CM_EVENT_UNREACHABLE:
176         case RDMA_CM_EVENT_REJECTED:
177                 log(LOG_ERR, "cma event %d, error %d\n", event->event,
178                        event->status);
179                 cb->state = ERROR;
180                 wakeup(cb);
181                 break;
182
183         case RDMA_CM_EVENT_DISCONNECTED:
184                 DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
185                 cb->state = ERROR;
186                 wakeup(cb);
187                 break;
188
189         case RDMA_CM_EVENT_DEVICE_REMOVAL:
190                 DEBUG_LOG(PFX "cma detected device removal!!!!\n");
191                 cb->state = ERROR;
192                 wakeup(cb);
193                 mtx_unlock(&cb->lock);
194                 krping_wait(cb, CLEANUP);
195                 tsleep(cb, 0, "krping", 5000);
196                 return 0;
197
198         default:
199                 log(LOG_ERR, "oof bad type!\n");
200                 wakeup(cb);
201                 break;
202         }
203         mtx_unlock(&cb->lock);
204         return 0;
205 }
206
207 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
208 {
209         if (wc->byte_len != sizeof(cb->recv_buf)) {
210                 log(LOG_ERR, "Received bogus data, size %d\n", 
211                        wc->byte_len);
212                 return -1;
213         }
214
215         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
216         cb->remote_addr = ntohll(cb->recv_buf.buf);
217         cb->remote_len  = ntohl(cb->recv_buf.size);
218         DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
219                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
220                   cb->remote_len);
221
222         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
223                 cb->state = RDMA_READ_ADV;
224         else
225                 cb->state = RDMA_WRITE_ADV;
226
227         return 0;
228 }
229
230 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
231 {
232         if (wc->byte_len != sizeof(cb->recv_buf)) {
233                 log(LOG_ERR, "Received bogus data, size %d\n", 
234                        wc->byte_len);
235                 return -1;
236         }
237
238         if (cb->state == RDMA_READ_ADV)
239                 cb->state = RDMA_WRITE_ADV;
240         else
241                 cb->state = RDMA_WRITE_COMPLETE;
242
243         return 0;
244 }
245
246 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
247 {
248         struct krping_cb *cb = ctx;
249         struct ib_wc wc;
250         struct ib_recv_wr *bad_wr;
251         int ret;
252
253         mtx_lock(&cb->lock);
254         KASSERT(cb->cq == cq, ("bad condition"));
255         if (cb->state == ERROR) {
256                 log(LOG_ERR,  "cq completion in ERROR state\n");
257                 mtx_unlock(&cb->lock);
258                 return;
259         }
260         if (!cb->wlat && !cb->rlat && !cb->bw)
261                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
262         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
263                 if (wc.status) {
264                         if (wc.status == IB_WC_WR_FLUSH_ERR) {
265                                 DEBUG_LOG("cq flushed\n");
266                                 continue;
267                         } else {
268                                 log(LOG_CRIT, "cq completion failed status %d\n",
269                                         wc.status);
270                                 goto error;
271                         }
272                 }
273
274                 switch (wc.opcode) {
275                 case IB_WC_SEND:
276                         DEBUG_LOG(PFX "send completion\n");
277                         cb->stats.send_bytes += cb->send_sgl.length;
278                         cb->stats.send_msgs++;
279                         break;
280
281                 case IB_WC_RDMA_WRITE:
282                         DEBUG_LOG(PFX "rdma write completion\n");
283                         cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
284                         cb->stats.write_msgs++;
285                         cb->state = RDMA_WRITE_COMPLETE;
286                         wakeup(cb);
287                         break;
288
289                 case IB_WC_RDMA_READ:
290                         DEBUG_LOG(PFX "rdma read completion\n");
291                         cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
292                         cb->stats.read_msgs++;
293                         cb->state = RDMA_READ_COMPLETE;
294                         wakeup(cb);
295                         break;
296
297                 case IB_WC_RECV:
298                         DEBUG_LOG(PFX "recv completion\n");
299                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
300                         cb->stats.recv_msgs++;
301                         if (cb->wlat || cb->rlat || cb->bw)
302                                 ret = server_recv(cb, &wc);
303                         else
304                                 ret = cb->server ? server_recv(cb, &wc) :
305                                            client_recv(cb, &wc);
306                         if (ret) {
307                                 log(LOG_ERR, "recv wc error: %d\n", ret);
308                                 goto error;
309                         }
310
311                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
312                         if (ret) {
313                                 log(LOG_ERR, "post recv error: %d\n", 
314                                        ret);
315                                 goto error;
316                         }
317                         wakeup(cb);
318                         break;
319
320                 default:
321                         log(LOG_ERR, "unknown!!!!! completion\n");
322                         goto error;
323                 }
324         }
325         if (ret) {
326                 log(LOG_ERR, "poll error %d\n", ret);
327                 goto error;
328         }
329         mtx_unlock(&cb->lock);
330         return;
331 error:
332         cb->state = ERROR;
333         wakeup(cb);
334         mtx_unlock(&cb->lock);
335 }
336
337 static int krping_accept(struct krping_cb *cb)
338 {
339         struct rdma_conn_param conn_param;
340         int ret;
341
342         DEBUG_LOG(PFX "accepting client connection request\n");
343
344         memset(&conn_param, 0, sizeof conn_param);
345         conn_param.responder_resources = 1;
346         conn_param.initiator_depth = 1;
347
348         ret = rdma_accept(cb->child_cm_id, &conn_param);
349         if (ret) {
350                 log(LOG_ERR, "rdma_accept error: %d\n", ret);
351                 return ret;
352         }
353
354         if (!cb->wlat && !cb->rlat && !cb->bw) {
355                 krping_wait(cb, CONNECTED);
356                 if (cb->state == ERROR) {
357                         log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
358                         return -1;
359                 }
360         }
361         return 0;
362 }
363
364 static void krping_setup_wr(struct krping_cb *cb)
365 {
366         /* XXX X86 only here... not mapping for dma! */
367         cb->recv_sgl.addr = vtophys(&cb->recv_buf);
368         cb->recv_sgl.length = sizeof cb->recv_buf;
369         if (cb->use_dmamr)
370                 cb->recv_sgl.lkey = cb->dma_mr->lkey;
371         else
372                 cb->recv_sgl.lkey = cb->recv_mr->lkey;
373         cb->rq_wr.sg_list = &cb->recv_sgl;
374         cb->rq_wr.num_sge = 1;
375
376         cb->send_sgl.addr = vtophys(&cb->send_buf);
377         cb->send_sgl.length = sizeof cb->send_buf;
378         if (cb->use_dmamr)
379                 cb->send_sgl.lkey = cb->dma_mr->lkey;
380         else
381                 cb->send_sgl.lkey = cb->send_mr->lkey;
382
383         cb->sq_wr.opcode = IB_WR_SEND;
384         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
385         cb->sq_wr.sg_list = &cb->send_sgl;
386         cb->sq_wr.num_sge = 1;
387
388         cb->rdma_addr = vtophys(cb->rdma_buf);
389         cb->rdma_sgl.addr = cb->rdma_addr;
390         if (cb->use_dmamr)
391                 cb->rdma_sgl.lkey = cb->dma_mr->lkey;
392         else
393                 cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
394         cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
395         cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
396         cb->rdma_sq_wr.num_sge = 1;
397
398         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
399                 cb->start_addr = vtophys(cb->start_buf);
400         }
401 }
402
403 static int krping_setup_buffers(struct krping_cb *cb)
404 {
405         int ret;
406         struct ib_phys_buf buf;
407         u64 iovbase;
408
409         DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
410
411         if (cb->use_dmamr) {
412                 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
413                                            IB_ACCESS_REMOTE_READ|
414                                            IB_ACCESS_REMOTE_WRITE);
415                 if (IS_ERR(cb->dma_mr)) {
416                         log(LOG_ERR, "reg_dmamr failed\n");
417                         return PTR_ERR(cb->dma_mr);
418                 }
419         } else {
420
421                 buf.addr = vtophys(&cb->recv_buf);
422                 buf.size = sizeof cb->recv_buf;
423                 iovbase = vtophys(&cb->recv_buf);
424                 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
425                                              IB_ACCESS_LOCAL_WRITE, 
426                                              &iovbase);
427
428                 if (IS_ERR(cb->recv_mr)) {
429                         log(LOG_ERR, "recv_buf reg_mr failed\n");
430                         return PTR_ERR(cb->recv_mr);
431                 }
432
433                 buf.addr = vtophys(&cb->send_buf);
434                 buf.size = sizeof cb->send_buf;
435                 iovbase = vtophys(&cb->send_buf);
436                 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
437                                              0, &iovbase);
438
439                 if (IS_ERR(cb->send_mr)) {
440                         log(LOG_ERR, "send_buf reg_mr failed\n");
441                         ib_dereg_mr(cb->recv_mr);
442                         return PTR_ERR(cb->send_mr);
443                 }
444         }
445
446         /* RNIC adapters have a limit upto which it can register physical memory
447          * If DMA-MR memory mode is set then normally driver registers maximum
448          * supported memory. After that if contigmalloc allocates memory beyond the
449          * specified RNIC limit then Krping may not work.
450          */
451         if (cb->use_dmamr && cb->memlimit)
452                 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit,
453                                             PAGE_SIZE, 0);
454         else 
455                 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
456                                             PAGE_SIZE, 0);
457
458         if (!cb->rdma_buf) {
459                 log(LOG_ERR, "rdma_buf malloc failed\n");
460                 ret = ENOMEM;
461                 goto err1;
462         }
463         if (!cb->use_dmamr) {
464
465                 buf.addr = vtophys(cb->rdma_buf);
466                 buf.size = cb->size;
467                 iovbase = vtophys(cb->rdma_buf);
468                 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
469                                              IB_ACCESS_REMOTE_READ| 
470                                              IB_ACCESS_REMOTE_WRITE, 
471                                              &iovbase);
472
473                 if (IS_ERR(cb->rdma_mr)) {
474                         log(LOG_ERR, "rdma_buf reg_mr failed\n");
475                         ret = PTR_ERR(cb->rdma_mr);
476                         goto err2;
477                 }
478         }
479
480         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
481                 if (cb->use_dmamr && cb->memlimit)
482                         cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
483                                                      0, cb->memlimit, PAGE_SIZE, 0);
484                 else
485                         cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
486                                                      0, -1UL, PAGE_SIZE, 0);
487                 if (!cb->start_buf) {
488                         log(LOG_ERR, "start_buf malloc failed\n");
489                         ret = ENOMEM;
490                         goto err2;
491                 }
492                 if (!cb->use_dmamr) {
493                         unsigned flags = IB_ACCESS_REMOTE_READ;
494
495                         if (cb->wlat || cb->rlat || cb->bw) 
496                                 flags |= IB_ACCESS_REMOTE_WRITE;
497                         buf.addr = vtophys(cb->start_buf);
498                         buf.size = cb->size;
499                         iovbase = vtophys(cb->start_buf);
500                         cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
501                                              flags,
502                                              &iovbase);
503
504                         if (IS_ERR(cb->start_mr)) {
505                                 log(LOG_ERR, "start_buf reg_mr failed\n");
506                                 ret = PTR_ERR(cb->start_mr);
507                                 goto err3;
508                         }
509                 }
510         }
511
512         krping_setup_wr(cb);
513         DEBUG_LOG(PFX "allocated & registered buffers...\n");
514         return 0;
515 err3:
516         contigfree(cb->start_buf, cb->size, M_DEVBUF);
517
518         if (!cb->use_dmamr)
519                 ib_dereg_mr(cb->rdma_mr);
520 err2:
521         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
522 err1:
523         if (cb->use_dmamr)
524                 ib_dereg_mr(cb->dma_mr);
525         else {
526                 ib_dereg_mr(cb->recv_mr);
527                 ib_dereg_mr(cb->send_mr);
528         }
529         return ret;
530 }
531
532 static void krping_free_buffers(struct krping_cb *cb)
533 {
534         DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
535         
536 #if 0
537         dma_unmap_single(cb->pd->device->dma_device,
538                          pci_unmap_addr(cb, recv_mapping),
539                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
540         dma_unmap_single(cb->pd->device->dma_device,
541                          pci_unmap_addr(cb, send_mapping),
542                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
543         dma_unmap_single(cb->pd->device->dma_device,
544                          pci_unmap_addr(cb, rdma_mapping),
545                          cb->size, DMA_BIDIRECTIONAL);
546 #endif
547         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
548         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
549 #if 0
550                 dma_unmap_single(cb->pd->device->dma_device,
551                          pci_unmap_addr(cb, start_mapping),
552                          cb->size, DMA_BIDIRECTIONAL);
553 #endif
554                 contigfree(cb->start_buf, cb->size, M_DEVBUF);
555         }
556         if (cb->use_dmamr)
557                 ib_dereg_mr(cb->dma_mr);
558         else {
559                 ib_dereg_mr(cb->send_mr);
560                 ib_dereg_mr(cb->recv_mr);
561                 ib_dereg_mr(cb->rdma_mr);
562                 if (!cb->server)
563                         ib_dereg_mr(cb->start_mr);
564         }
565 }
566
567 static int krping_create_qp(struct krping_cb *cb)
568 {
569         struct ib_qp_init_attr init_attr;
570         int ret;
571
572         memset(&init_attr, 0, sizeof(init_attr));
573         init_attr.cap.max_send_wr = cb->txdepth;
574         init_attr.cap.max_recv_wr = 2;
575         init_attr.cap.max_recv_sge = 1;
576         init_attr.cap.max_send_sge = 1;
577         init_attr.qp_type = IB_QPT_RC;
578         init_attr.send_cq = cb->cq;
579         init_attr.recv_cq = cb->cq;
580
581         if (cb->server) {
582                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
583                 if (!ret)
584                         cb->qp = cb->child_cm_id->qp;
585         } else {
586                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
587                 if (!ret)
588                         cb->qp = cb->cm_id->qp;
589         }
590
591         return ret;
592 }
593
594 static void krping_free_qp(struct krping_cb *cb)
595 {
596         ib_destroy_qp(cb->qp);
597         ib_destroy_cq(cb->cq);
598         ib_dealloc_pd(cb->pd);
599 }
600
601 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
602 {
603         int ret;
604         cb->pd = ib_alloc_pd(cm_id->device);
605         if (IS_ERR(cb->pd)) {
606                 log(LOG_ERR, "ib_alloc_pd failed\n");
607                 return PTR_ERR(cb->pd);
608         }
609         DEBUG_LOG(PFX "created pd %p\n", cb->pd);
610
611         strlcpy(cb->name, cb->pd->device->name, sizeof(cb->name));
612
613         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
614                               cb, cb->txdepth * 2, 0);
615         if (IS_ERR(cb->cq)) {
616                 log(LOG_ERR, "ib_create_cq failed\n");
617                 ret = PTR_ERR(cb->cq);
618                 goto err1;
619         }
620         DEBUG_LOG(PFX "created cq %p\n", cb->cq);
621
622         if (!cb->wlat && !cb->rlat && !cb->bw) {
623                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
624                 if (ret) {
625                         log(LOG_ERR, "ib_create_cq failed\n");
626                         goto err2;
627                 }
628         }
629
630         ret = krping_create_qp(cb);
631         if (ret) {
632                 log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
633                 goto err2;
634         }
635         DEBUG_LOG(PFX "created qp %p\n", cb->qp);
636         return 0;
637 err2:
638         ib_destroy_cq(cb->cq);
639 err1:
640         ib_dealloc_pd(cb->pd);
641         return ret;
642 }
643
644 static void krping_format_send(struct krping_cb *cb, u64 buf, 
645                                struct ib_mr *mr)
646 {
647         struct krping_rdma_info *info = &cb->send_buf;
648
649         info->buf = htonll(buf);
650         info->rkey = htonl(mr->rkey);
651         info->size = htonl(cb->size);
652
653         DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
654                   (unsigned long long)buf, mr->rkey, cb->size);
655 }
656
657 static void krping_test_server(struct krping_cb *cb)
658 {
659         struct ib_send_wr *bad_wr;
660         int ret;
661
662         while (1) {
663                 /* Wait for client's Start STAG/TO/Len */
664                 krping_wait(cb, RDMA_READ_ADV);
665                 if (cb->state != RDMA_READ_ADV) {
666                         DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
667                                 cb->state);
668                         break;
669                 }
670
671                 DEBUG_LOG(PFX "server received sink adv\n");
672
673                 /* Issue RDMA Read. */
674                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
675                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
676                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
677                 cb->rdma_sq_wr.sg_list->length = cb->remote_len;
678
679                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
680                 if (ret) {
681                         log(LOG_ERR, "post send error %d\n", ret);
682                         break;
683                 }
684                 DEBUG_LOG(PFX "server posted rdma read req \n");
685
686                 /* Wait for read completion */
687                 krping_wait(cb, RDMA_READ_COMPLETE);
688                 if (cb->state != RDMA_READ_COMPLETE) {
689                         log(LOG_ERR,  
690                                "wait for RDMA_READ_COMPLETE state %d\n",
691                                cb->state);
692                         break;
693                 }
694                 DEBUG_LOG(PFX "server received read complete\n");
695
696                 /* Display data in recv buf */
697                 if (cb->verbose)
698                         DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
699
700                 /* Tell client to continue */
701                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
702                 if (ret) {
703                         log(LOG_ERR, "post send error %d\n", ret);
704                         break;
705                 }
706                 DEBUG_LOG(PFX "server posted go ahead\n");
707
708                 /* Wait for client's RDMA STAG/TO/Len */
709                 krping_wait(cb, RDMA_WRITE_ADV);
710                 if (cb->state != RDMA_WRITE_ADV) {
711                         log(LOG_ERR,  
712                                "wait for RDMA_WRITE_ADV state %d\n",
713                                cb->state);
714                         break;
715                 }
716                 DEBUG_LOG(PFX "server received sink adv\n");
717
718                 /* RDMA Write echo data */
719                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
720                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
721                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
722                 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
723                 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
724                           cb->rdma_sq_wr.sg_list->lkey,
725                           (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
726                           cb->rdma_sq_wr.sg_list->length);
727
728                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
729                 if (ret) {
730                         log(LOG_ERR, "post send error %d\n", ret);
731                         break;
732                 }
733
734                 /* Wait for completion */
735                 krping_wait(cb, RDMA_WRITE_COMPLETE);
736                 if (cb->state != RDMA_WRITE_COMPLETE) {
737                         log(LOG_ERR,  
738                                "wait for RDMA_WRITE_COMPLETE state %d\n",
739                                cb->state);
740                         break;
741                 }
742                 DEBUG_LOG(PFX "server rdma write complete \n");
743
744                 cb->state = CONNECTED;
745
746                 /* Tell client to begin again */
747                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
748                 if (ret) {
749                         log(LOG_ERR, "post send error %d\n", ret);
750                         break;
751                 }
752                 DEBUG_LOG(PFX "server posted go ahead\n");
753         }
754 }
755
756 static void rlat_test(struct krping_cb *cb)
757 {
758         int scnt;
759         int iters = cb->count;
760         struct timeval start_tv, stop_tv;
761         int ret;
762         struct ib_wc wc;
763         struct ib_send_wr *bad_wr;
764         int ne;
765
766         scnt = 0;
767         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
768         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
769         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
770         cb->rdma_sq_wr.sg_list->length = cb->size;
771
772         microtime(&start_tv);
773         if (!cb->poll) {
774                 cb->state = RDMA_READ_ADV;
775                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
776         }
777         while (scnt < iters) {
778
779                 cb->state = RDMA_READ_ADV;
780                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
781                 if (ret) {
782                         log(LOG_ERR,  
783                                 "Couldn't post send: ret=%d scnt %d\n",
784                                 ret, scnt);
785                         return;
786                 }
787
788                 do {
789                         if (!cb->poll) {
790                                 krping_wait(cb, RDMA_READ_COMPLETE);
791                                 if (cb->state == RDMA_READ_COMPLETE) {
792                                         ne = 1;
793                                         ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
794                                 } else {
795                                         ne = -1;
796                                 }
797                         } else
798                                 ne = ib_poll_cq(cb->cq, 1, &wc);
799                         if (cb->state == ERROR) {
800                                 log(LOG_ERR, 
801                                        "state == ERROR...bailing scnt %d\n", scnt);
802                                 return;
803                         }
804                 } while (ne == 0);
805
806                 if (ne < 0) {
807                         log(LOG_ERR, "poll CQ failed %d\n", ne);
808                         return;
809                 }
810                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
811                         log(LOG_ERR, "Completion wth error at %s:\n",
812                                 cb->server ? "server" : "client");
813                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
814                                 wc.status, (int) wc.wr_id);
815                         return;
816                 }
817                 ++scnt;
818         }
819         microtime(&stop_tv);
820
821         if (stop_tv.tv_usec < start_tv.tv_usec) {
822                 stop_tv.tv_usec += 1000000;
823                 stop_tv.tv_sec  -= 1;
824         }
825
826         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
827                 stop_tv.tv_sec - start_tv.tv_sec, 
828                 stop_tv.tv_usec - start_tv.tv_usec,
829                 scnt, cb->size);
830 }
831
832 static int alloc_cycle_mem(int cycle_iters,
833                                 cycles_t **post_cycles_start,
834                                 cycles_t **post_cycles_stop,
835                                 cycles_t **poll_cycles_start,
836                                 cycles_t **poll_cycles_stop,
837                                 cycles_t **last_poll_cycles_start)
838 {
839         *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
840         if (!*post_cycles_start) {
841                 goto fail1;
842         }
843         *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
844         if (!*post_cycles_stop) {
845                 goto fail2;
846         }
847         *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
848         if (!*poll_cycles_start) {
849                 goto fail3;
850         }
851         *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
852         if (!*poll_cycles_stop) {
853                 goto fail4;
854         }
855         *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
856         if (!*last_poll_cycles_start) {
857                 goto fail5;
858         }
859         return 0;
860 fail5:
861         free(*poll_cycles_stop, M_DEVBUF);
862 fail4:
863         free(*poll_cycles_start, M_DEVBUF);
864 fail3:
865         free(*post_cycles_stop, M_DEVBUF);
866 fail2:
867         free(*post_cycles_start, M_DEVBUF);
868 fail1:
869         log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
870         return ENOMEM;
871 }
872
873 static void free_cycle_mem(cycles_t *post_cycles_start,
874                                 cycles_t *post_cycles_stop,
875                                 cycles_t *poll_cycles_start,
876                                 cycles_t *poll_cycles_stop,
877                                 cycles_t *last_poll_cycles_start)
878 {
879         free(last_poll_cycles_start, M_DEVBUF);
880         free(poll_cycles_stop, M_DEVBUF);
881         free(poll_cycles_start, M_DEVBUF);
882         free(post_cycles_stop, M_DEVBUF);
883         free(post_cycles_start, M_DEVBUF);
884 }
885
886 static void wlat_test(struct krping_cb *cb)
887 {
888         int ccnt, scnt, rcnt;
889         int iters=cb->count;
890         volatile char *poll_buf = (char *) cb->start_buf;
891         char *buf = (char *)cb->rdma_buf;
892         ccnt = 0;
893         scnt = 0;
894         rcnt = 0;
895         struct timeval start_tv, stop_tv;
896         cycles_t *post_cycles_start, *post_cycles_stop;
897         cycles_t *poll_cycles_start, *poll_cycles_stop;
898         cycles_t *last_poll_cycles_start;
899         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
900         int i;
901         int cycle_iters = 1000;
902         int err;
903
904         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
905                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
906                           
907         if (err) {
908                 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
909                 return;
910         }
911
912         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
913         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
914         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
915         cb->rdma_sq_wr.sg_list->length = cb->size;
916
917         if (cycle_iters > iters)
918                 cycle_iters = iters;
919         microtime(&start_tv);
920         while (scnt < iters || ccnt < iters || rcnt < iters) {
921
922                 /* Wait till buffer changes. */
923                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
924                         ++rcnt;
925                         while (*poll_buf != (char)rcnt) {
926                                 if (cb->state == ERROR) {
927                                         log(LOG_ERR, "state = ERROR, bailing\n");
928                                         return;
929                                 }
930                         }
931                 }
932
933                 if (scnt < iters) {
934                         struct ib_send_wr *bad_wr;
935
936                         *buf = (char)scnt+1;
937                         if (scnt < cycle_iters)
938                                 post_cycles_start[scnt] = get_cycles();
939                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
940                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
941                                         scnt);
942                                 return;
943                         }
944                         if (scnt < cycle_iters)
945                                 post_cycles_stop[scnt] = get_cycles();
946                         scnt++;
947                 }
948
949                 if (ccnt < iters) {
950                         struct ib_wc wc;
951                         int ne;
952
953                         if (ccnt < cycle_iters)
954                                 poll_cycles_start[ccnt] = get_cycles();
955                         do {
956                                 if (ccnt < cycle_iters)
957                                         last_poll_cycles_start[ccnt] = get_cycles();
958                                 ne = ib_poll_cq(cb->cq, 1, &wc);
959                         } while (ne == 0);
960                         if (ccnt < cycle_iters)
961                                 poll_cycles_stop[ccnt] = get_cycles();
962                         ++ccnt;
963
964                         if (ne < 0) {
965                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
966                                 return;
967                         }
968                         if (wc.status != IB_WC_SUCCESS) {
969                                 log(LOG_ERR, "Completion wth error at %s:\n",
970                                         cb->server ? "server" : "client");
971                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
972                                         wc.status, (int) wc.wr_id);
973                                 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
974                                         scnt, rcnt, ccnt);
975                                 return;
976                         }
977                 }
978         }
979         microtime(&stop_tv);
980
981         if (stop_tv.tv_usec < start_tv.tv_usec) {
982                 stop_tv.tv_usec += 1000000;
983                 stop_tv.tv_sec  -= 1;
984         }
985
986         for (i=0; i < cycle_iters; i++) {
987                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
988                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
989                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
990         }
991
992         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
993                 stop_tv.tv_sec - start_tv.tv_sec, 
994                 stop_tv.tv_usec - start_tv.tv_usec,
995                 scnt, cb->size, cycle_iters, 
996                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
997                 (unsigned long long)sum_last_poll);
998
999         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
1000                         poll_cycles_stop, last_poll_cycles_start);
1001 }
1002
1003 static void bw_test(struct krping_cb *cb)
1004 {
1005         int ccnt, scnt, rcnt;
1006         int iters=cb->count;
1007         ccnt = 0;
1008         scnt = 0;
1009         rcnt = 0;
1010         struct timeval start_tv, stop_tv;
1011         cycles_t *post_cycles_start, *post_cycles_stop;
1012         cycles_t *poll_cycles_start, *poll_cycles_stop;
1013         cycles_t *last_poll_cycles_start;
1014         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1015         int i;
1016         int cycle_iters = 1000;
1017         int err;
1018
1019         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
1020                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
1021                           
1022         if (err) {
1023                 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
1024                 return;
1025         }
1026
1027         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1028         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1029         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1030         cb->rdma_sq_wr.sg_list->length = cb->size;
1031
1032         if (cycle_iters > iters)
1033                 cycle_iters = iters;
1034         microtime(&start_tv);
1035         while (scnt < iters || ccnt < iters) {
1036
1037                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1038                         struct ib_send_wr *bad_wr;
1039
1040                         if (scnt < cycle_iters)
1041                                 post_cycles_start[scnt] = get_cycles();
1042                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1043                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1044                                         scnt);
1045                                 return;
1046                         }
1047                         if (scnt < cycle_iters)
1048                                 post_cycles_stop[scnt] = get_cycles();
1049                         ++scnt;
1050                 }
1051
1052                 if (ccnt < iters) {
1053                         int ne;
1054                         struct ib_wc wc;
1055
1056                         if (ccnt < cycle_iters)
1057                                 poll_cycles_start[ccnt] = get_cycles();
1058                         do {
1059                                 if (ccnt < cycle_iters)
1060                                         last_poll_cycles_start[ccnt] = get_cycles();
1061                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1062                         } while (ne == 0);
1063                         if (ccnt < cycle_iters)
1064                                 poll_cycles_stop[ccnt] = get_cycles();
1065                         ccnt += 1;
1066
1067                         if (ne < 0) {
1068                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
1069                                 return;
1070                         }
1071                         if (wc.status != IB_WC_SUCCESS) {
1072                                 log(LOG_ERR, "Completion wth error at %s:\n",
1073                                         cb->server ? "server" : "client");
1074                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
1075                                         wc.status, (int) wc.wr_id);
1076                                 return;
1077                         }
1078                 }
1079         }
1080         microtime(&stop_tv);
1081
1082         if (stop_tv.tv_usec < start_tv.tv_usec) {
1083                 stop_tv.tv_usec += 1000000;
1084                 stop_tv.tv_sec  -= 1;
1085         }
1086
1087         for (i=0; i < cycle_iters; i++) {
1088                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1089                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1090                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1091         }
1092
1093         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1094                 stop_tv.tv_sec - start_tv.tv_sec, 
1095                 stop_tv.tv_usec - start_tv.tv_usec,
1096                 scnt, cb->size, cycle_iters, 
1097                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1098                 (unsigned long long)sum_last_poll);
1099
1100         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
1101                         poll_cycles_stop, last_poll_cycles_start);
1102 }
1103
1104 static void krping_rlat_test_server(struct krping_cb *cb)
1105 {
1106         struct ib_send_wr *bad_wr;
1107         struct ib_wc wc;
1108         int ret;
1109
1110         /* Spin waiting for client's Start STAG/TO/Len */
1111         while (cb->state < RDMA_READ_ADV) {
1112                 krping_cq_event_handler(cb->cq, cb);
1113         }
1114
1115         /* Send STAG/TO/Len to client */
1116         if (cb->dma_mr)
1117                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1118         else
1119                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1120         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1121         if (ret) {
1122                 log(LOG_ERR, "post send error %d\n", ret);
1123                 return;
1124         }
1125
1126         /* Spin waiting for send completion */
1127         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1128         if (ret < 0) {
1129                 log(LOG_ERR, "poll error %d\n", ret);
1130                 return;
1131         }
1132         if (wc.status) {
1133                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1134                 return;
1135         }
1136
1137         krping_wait(cb, ERROR);
1138 }
1139
1140 static void krping_wlat_test_server(struct krping_cb *cb)
1141 {
1142         struct ib_send_wr *bad_wr;
1143         struct ib_wc wc;
1144         int ret;
1145
1146         /* Spin waiting for client's Start STAG/TO/Len */
1147         while (cb->state < RDMA_READ_ADV) {
1148                 krping_cq_event_handler(cb->cq, cb);
1149         }
1150
1151         /* Send STAG/TO/Len to client */
1152         if (cb->dma_mr)
1153                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1154         else
1155                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1156         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1157         if (ret) {
1158                 log(LOG_ERR, "post send error %d\n", ret);
1159                 return;
1160         }
1161
1162         /* Spin waiting for send completion */
1163         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1164         if (ret < 0) {
1165                 log(LOG_ERR, "poll error %d\n", ret);
1166                 return;
1167         }
1168         if (wc.status) {
1169                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1170                 return;
1171         }
1172
1173         wlat_test(cb);
1174         krping_wait(cb, ERROR);
1175 }
1176
1177 static void krping_bw_test_server(struct krping_cb *cb)
1178 {
1179         struct ib_send_wr *bad_wr;
1180         struct ib_wc wc;
1181         int ret;
1182
1183         /* Spin waiting for client's Start STAG/TO/Len */
1184         while (cb->state < RDMA_READ_ADV) {
1185                 krping_cq_event_handler(cb->cq, cb);
1186         }
1187
1188         /* Send STAG/TO/Len to client */
1189         if (cb->dma_mr)
1190                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1191         else
1192                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1193         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1194         if (ret) {
1195                 log(LOG_ERR, "post send error %d\n", ret);
1196                 return;
1197         }
1198
1199         /* Spin waiting for send completion */
1200         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1201         if (ret < 0) {
1202                 log(LOG_ERR, "poll error %d\n", ret);
1203                 return;
1204         }
1205         if (wc.status) {
1206                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1207                 return;
1208         }
1209
1210         if (cb->duplex)
1211                 bw_test(cb);
1212         krping_wait(cb, ERROR);
1213 }
1214
1215 static int krping_bind_server(struct krping_cb *cb)
1216 {
1217         struct sockaddr_in sin;
1218         int ret;
1219
1220         memset(&sin, 0, sizeof(sin));
1221         sin.sin_len = sizeof sin;
1222         sin.sin_family = AF_INET;
1223         sin.sin_addr.s_addr = cb->addr.s_addr;
1224         sin.sin_port = cb->port;
1225
1226         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1227         if (ret) {
1228                 log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1229                 return ret;
1230         }
1231         DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1232
1233         DEBUG_LOG(PFX "rdma_listen\n");
1234         ret = rdma_listen(cb->cm_id, 3);
1235         if (ret) {
1236                 log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1237                 return ret;
1238         }
1239
1240         krping_wait(cb, CONNECT_REQUEST);
1241         if (cb->state != CONNECT_REQUEST) {
1242                 log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1243                         cb->state);
1244                 return -1;
1245         }
1246
1247         return 0;
1248 }
1249
1250 static void krping_run_server(struct krping_cb *cb)
1251 {
1252         struct ib_recv_wr *bad_wr;
1253         int ret;
1254
1255         ret = krping_bind_server(cb);
1256         if (ret)
1257                 return;
1258
1259         ret = krping_setup_qp(cb, cb->child_cm_id);
1260         if (ret) {
1261                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1262                 return;
1263         }
1264
1265         ret = krping_setup_buffers(cb);
1266         if (ret) {
1267                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1268                 goto err1;
1269         }
1270
1271         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1272         if (ret) {
1273                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1274                 goto err2;
1275         }
1276
1277         ret = krping_accept(cb);
1278         if (ret) {
1279                 log(LOG_ERR, "connect error %d\n", ret);
1280                 goto err2;
1281         }
1282
1283         if (cb->wlat)
1284                 krping_wlat_test_server(cb);
1285         else if (cb->rlat)
1286                 krping_rlat_test_server(cb);
1287         else if (cb->bw)
1288                 krping_bw_test_server(cb);
1289         else
1290                 krping_test_server(cb);
1291
1292         rdma_disconnect(cb->child_cm_id);
1293         rdma_destroy_id(cb->child_cm_id);
1294 err2:
1295         krping_free_buffers(cb);
1296 err1:
1297         krping_free_qp(cb);
1298 }
1299
1300 static void krping_test_client(struct krping_cb *cb)
1301 {
1302         int ping, start, cc, i, ret;
1303         struct ib_send_wr *bad_wr;
1304         unsigned char c;
1305
1306         start = 65;
1307         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1308                 cb->state = RDMA_READ_ADV;
1309
1310                 /* Put some ascii text in the buffer. */
1311                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1312                 for (i = cc, c = start; i < cb->size; i++) {
1313                         cb->start_buf[i] = c;
1314                         c++;
1315                         if (c > 122)
1316                                 c = 65;
1317                 }
1318                 start++;
1319                 if (start > 122)
1320                         start = 65;
1321                 cb->start_buf[cb->size - 1] = 0;
1322
1323                 if (cb->dma_mr)
1324                         krping_format_send(cb, cb->start_addr, cb->dma_mr);
1325                 else
1326                         krping_format_send(cb, cb->start_addr, cb->start_mr);
1327
1328                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1329                 if (ret) {
1330                         log(LOG_ERR, "post send error %d\n", ret);
1331                         break;
1332                 }
1333
1334                 /* Wait for server to ACK */
1335                 krping_wait(cb, RDMA_WRITE_ADV);
1336                 if (cb->state != RDMA_WRITE_ADV) {
1337                         log(LOG_ERR,  
1338                                "wait for RDMA_WRITE_ADV state %d\n",
1339                                cb->state);
1340                         break;
1341                 }
1342
1343                 if (cb->dma_mr)
1344                         krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1345                 else
1346                         krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1347
1348                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1349                 if (ret) {
1350                         log(LOG_ERR, "post send error %d\n", ret);
1351                         break;
1352                 }
1353
1354                 /* Wait for the server to say the RDMA Write is complete. */
1355                 krping_wait(cb, RDMA_WRITE_COMPLETE);
1356                 if (cb->state != RDMA_WRITE_COMPLETE) {
1357                         log(LOG_ERR,  
1358                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1359                                cb->state);
1360                         break;
1361                 }
1362
1363                 if (cb->validate)
1364                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1365                                 log(LOG_ERR, "data mismatch!\n");
1366                                 break;
1367                         }
1368
1369                 if (cb->verbose)
1370                         DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1371         }
1372 }
1373
1374 static void krping_rlat_test_client(struct krping_cb *cb)
1375 {
1376         struct ib_send_wr *bad_wr;
1377         struct ib_wc wc;
1378         int ret;
1379
1380         cb->state = RDMA_READ_ADV;
1381
1382         /* Send STAG/TO/Len to client */
1383         if (cb->dma_mr)
1384                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1385         else
1386                 krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1387         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1388         if (ret) {
1389                 log(LOG_ERR, "post send error %d\n", ret);
1390                 return;
1391         }
1392
1393         /* Spin waiting for send completion */
1394         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1395         if (ret < 0) {
1396                 log(LOG_ERR, "poll error %d\n", ret);
1397                 return;
1398         }
1399         if (wc.status) {
1400                 log(LOG_ERR, "send completion error %d\n", wc.status);
1401                 return;
1402         }
1403
1404         /* Spin waiting for server's Start STAG/TO/Len */
1405         while (cb->state < RDMA_WRITE_ADV) {
1406                 krping_cq_event_handler(cb->cq, cb);
1407         }
1408
1409 #if 0
1410 {
1411         int i;
1412         struct timeval start, stop;
1413         time_t sec;
1414         suseconds_t usec;
1415         unsigned long long elapsed;
1416         struct ib_wc wc;
1417         struct ib_send_wr *bad_wr;
1418         int ne;
1419         
1420         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1421         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1422         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1423         cb->rdma_sq_wr.sg_list->length = 0;
1424         cb->rdma_sq_wr.num_sge = 0;
1425
1426         microtime(&start);
1427         for (i=0; i < 100000; i++) {
1428                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1429                         log(LOG_ERR,  "Couldn't post send\n");
1430                         return;
1431                 }
1432                 do {
1433                         ne = ib_poll_cq(cb->cq, 1, &wc);
1434                 } while (ne == 0);
1435                 if (ne < 0) {
1436                         log(LOG_ERR, "poll CQ failed %d\n", ne);
1437                         return;
1438                 }
1439                 if (wc.status != IB_WC_SUCCESS) {
1440                         log(LOG_ERR, "Completion wth error at %s:\n",
1441                                 cb->server ? "server" : "client");
1442                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
1443                                 wc.status, (int) wc.wr_id);
1444                         return;
1445                 }
1446         }
1447         microtime(&stop);
1448         
1449         if (stop.tv_usec < start.tv_usec) {
1450                 stop.tv_usec += 1000000;
1451                 stop.tv_sec  -= 1;
1452         }
1453         sec     = stop.tv_sec - start.tv_sec;
1454         usec    = stop.tv_usec - start.tv_usec;
1455         elapsed = sec * 1000000 + usec;
1456         log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1457 }
1458 #endif
1459
1460         rlat_test(cb);
1461 }
1462
1463 static void krping_wlat_test_client(struct krping_cb *cb)
1464 {
1465         struct ib_send_wr *bad_wr;
1466         struct ib_wc wc;
1467         int ret;
1468
1469         cb->state = RDMA_READ_ADV;
1470
1471         /* Send STAG/TO/Len to client */
1472         if (cb->dma_mr)
1473                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1474         else
1475                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1476         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1477         if (ret) {
1478                 log(LOG_ERR, "post send error %d\n", ret);
1479                 return;
1480         }
1481
1482         /* Spin waiting for send completion */
1483         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1484         if (ret < 0) {
1485                 log(LOG_ERR, "poll error %d\n", ret);
1486                 return;
1487         }
1488         if (wc.status) {
1489                 log(LOG_ERR, "send completion error %d\n", wc.status);
1490                 return;
1491         }
1492
1493         /* Spin waiting for server's Start STAG/TO/Len */
1494         while (cb->state < RDMA_WRITE_ADV) {
1495                 krping_cq_event_handler(cb->cq, cb);
1496         }
1497
1498         wlat_test(cb);
1499 }
1500
1501 static void krping_bw_test_client(struct krping_cb *cb)
1502 {
1503         struct ib_send_wr *bad_wr;
1504         struct ib_wc wc;
1505         int ret;
1506
1507         cb->state = RDMA_READ_ADV;
1508
1509         /* Send STAG/TO/Len to client */
1510         if (cb->dma_mr)
1511                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1512         else
1513                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1514         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1515         if (ret) {
1516                 log(LOG_ERR, "post send error %d\n", ret);
1517                 return;
1518         }
1519
1520         /* Spin waiting for send completion */
1521         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1522         if (ret < 0) {
1523                 log(LOG_ERR, "poll error %d\n", ret);
1524                 return;
1525         }
1526         if (wc.status) {
1527                 log(LOG_ERR, "send completion error %d\n", wc.status);
1528                 return;
1529         }
1530
1531         /* Spin waiting for server's Start STAG/TO/Len */
1532         while (cb->state < RDMA_WRITE_ADV) {
1533                 krping_cq_event_handler(cb->cq, cb);
1534         }
1535
1536         bw_test(cb);
1537 }
1538
1539 static int krping_connect_client(struct krping_cb *cb)
1540 {
1541         struct rdma_conn_param conn_param;
1542         int ret;
1543
1544         memset(&conn_param, 0, sizeof conn_param);
1545         conn_param.responder_resources = 1;
1546         conn_param.initiator_depth = 1;
1547         conn_param.retry_count = 10;
1548
1549         ret = rdma_connect(cb->cm_id, &conn_param);
1550         if (ret) {
1551                 log(LOG_ERR, "rdma_connect error %d\n", ret);
1552                 return ret;
1553         }
1554
1555         krping_wait(cb, CONNECTED);
1556         if (cb->state == ERROR) {
1557                 log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1558                 return -1;
1559         }
1560
1561         DEBUG_LOG(PFX "rdma_connect successful\n");
1562         return 0;
1563 }
1564
1565 static int krping_bind_client(struct krping_cb *cb)
1566 {
1567         struct sockaddr_in sin;
1568         int ret;
1569
1570         memset(&sin, 0, sizeof(sin));
1571         sin.sin_len = sizeof sin;
1572         sin.sin_family = AF_INET;
1573         sin.sin_addr.s_addr = cb->addr.s_addr;
1574         sin.sin_port = cb->port;
1575
1576         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1577                                 2000);
1578         if (ret) {
1579                 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1580                 return ret;
1581         }
1582
1583         krping_wait(cb, ROUTE_RESOLVED);
1584         if (cb->state != ROUTE_RESOLVED) {
1585                 log(LOG_ERR,  
1586                        "addr/route resolution did not resolve: state %d\n",
1587                        cb->state);
1588                 return EINTR;
1589         }
1590
1591         DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1592         return 0;
1593 }
1594
1595 static void krping_run_client(struct krping_cb *cb)
1596 {
1597         struct ib_recv_wr *bad_wr;
1598         int ret;
1599
1600         ret = krping_bind_client(cb);
1601         if (ret)
1602                 return;
1603
1604         ret = krping_setup_qp(cb, cb->cm_id);
1605         if (ret) {
1606                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1607                 return;
1608         }
1609
1610         ret = krping_setup_buffers(cb);
1611         if (ret) {
1612                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1613                 goto err1;
1614         }
1615
1616         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1617         if (ret) {
1618                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1619                 goto err2;
1620         }
1621
1622         ret = krping_connect_client(cb);
1623         if (ret) {
1624                 log(LOG_ERR, "connect error %d\n", ret);
1625                 goto err2;
1626         }
1627
1628         if (cb->wlat)
1629                 krping_wlat_test_client(cb);
1630         else if (cb->rlat)
1631                 krping_rlat_test_client(cb);
1632         else if (cb->bw)
1633                 krping_bw_test_client(cb);
1634         else
1635                 krping_test_client(cb);
1636         rdma_disconnect(cb->cm_id);
1637 err2:
1638         krping_free_buffers(cb);
1639 err1:
1640         krping_free_qp(cb);
1641 }
1642
1643 int krping_doit(char *cmd)
1644 {
1645         struct krping_cb *cb;
1646         int op;
1647         int ret = 0;
1648         char *optarg;
1649         unsigned long optint;
1650         debug = 0;
1651
1652         cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1653         if (!cb)
1654                 return ENOMEM;
1655         bzero(cb, sizeof *cb);
1656
1657         mtx_lock(&krping_mutex);
1658         TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1659         mtx_unlock(&krping_mutex);
1660
1661         cb->server = -1;
1662         cb->state = IDLE;
1663         cb->size = 64;
1664         cb->txdepth = RPING_SQ_DEPTH;
1665         cb->use_dmamr = 1;
1666         cb->memlimit = 0;
1667         mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1668
1669         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1670                               &optint)) != 0) {
1671                 switch (op) {
1672                 case 'a':
1673                         cb->addr_str = optarg;
1674                         DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1675                         if (!inet_aton(optarg, &cb->addr)) {
1676                                 log(LOG_ERR, "bad addr string %s\n", optarg);
1677                                 ret = EINVAL;
1678                         }
1679                         break;
1680                 case 'D':
1681                         cb->use_dmamr = 1;
1682                         DEBUG_LOG(PFX "using dma mr\n");
1683                         break;
1684                 case 'p':
1685                         cb->port = htons(optint);
1686                         DEBUG_LOG(PFX "port %d\n", (int)optint);
1687                         break;
1688                 case 'P':
1689                         cb->poll = 1;
1690                         DEBUG_LOG("server\n");
1691                         break;
1692                 case 's':
1693                         cb->server = 1;
1694                         DEBUG_LOG(PFX "server\n");
1695                         break;
1696                 case 'c':
1697                         cb->server = 0;
1698                         DEBUG_LOG(PFX "client\n");
1699                         break;
1700                 case 'S':
1701                         cb->size = optint;
1702                         if ((cb->size < 1) ||
1703                             (cb->size > RPING_BUFSIZE)) {
1704                                 log(LOG_ERR, "Invalid size %d "
1705                                        "(valid range is 1 to %d)\n",
1706                                        cb->size, RPING_BUFSIZE);
1707                                 ret = EINVAL;
1708                         } else
1709                                 DEBUG_LOG(PFX "size %d\n", (int)optint);
1710                         break;
1711                 case 'C':
1712                         cb->count = optint;
1713                         if (cb->count < 0) {
1714                                 log(LOG_ERR, "Invalid count %d\n",
1715                                         cb->count);
1716                                 ret = EINVAL;
1717                         } else
1718                                 DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1719                         break;
1720                 case 'v':
1721                         cb->verbose++;
1722                         DEBUG_LOG(PFX "verbose\n");
1723                         break;
1724                 case 'V':
1725                         cb->validate++;
1726                         DEBUG_LOG(PFX "validate data\n");
1727                         break;
1728                 case 'L':
1729                         cb->rlat++;
1730                         break;
1731                 case 'l':
1732                         cb->wlat++;
1733                         break;
1734                 case 'B':
1735                         cb->bw++;
1736                         break;
1737                 case 't':
1738                         cb->txdepth = optint;
1739                         DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1740                         break;
1741                 case 'd':
1742                         debug++;
1743                         break;
1744                 case 'm':
1745                         cb->memlimit = optint;
1746                         if (cb->memlimit < 1) {
1747                                 log(LOG_ERR, "Invalid memory limit %ju\n",
1748                                     cb->memlimit);
1749                                 ret = EINVAL;
1750                         } else
1751                                 DEBUG_LOG(PFX "memory limit %d\n", (int)optint);
1752                         break;
1753                 default:
1754                         log(LOG_ERR, "unknown opt %s\n", optarg);
1755                         ret = EINVAL;
1756                         break;
1757                 }
1758         }
1759         if (ret)
1760                 goto out;
1761
1762         if (cb->server == -1) {
1763                 log(LOG_ERR, "must be either client or server\n");
1764                 ret = EINVAL;
1765                 goto out;
1766         }
1767         if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1768                 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1769                 ret = EINVAL;
1770                 goto out;
1771         }
1772
1773
1774         cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1775         if (IS_ERR(cb->cm_id)) {
1776                 ret = PTR_ERR(cb->cm_id);
1777                 log(LOG_ERR, "rdma_create_id error %d\n", ret);
1778                 goto out;
1779         }
1780         DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1781         if (cb->server)
1782                 krping_run_server(cb);
1783         else
1784                 krping_run_client(cb);
1785         DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1786
1787         mtx_lock(&cb->lock);
1788         cb->state = CLEANUP;
1789         wakeup(cb);
1790         mtx_unlock(&cb->lock);
1791
1792         rdma_destroy_id(cb->cm_id);
1793 out:
1794         mtx_lock(&krping_mutex);
1795         TAILQ_REMOVE(&krping_cbs, cb, list);
1796         mtx_unlock(&krping_mutex);
1797         free(cb, M_DEVBUF);
1798         return ret;
1799 }
1800
1801 void krping_init(void)
1802 {
1803         mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1804         TAILQ_INIT(&krping_cbs);
1805 }