]> CyberLeo.Net >> Repos - FreeBSD/releng/9.2.git/blob - sys/contrib/rdma/krping/krping.c
- Copy stable/9 to releng/9.2 as part of the 9.2-RELEASE cycle.
[FreeBSD/releng/9.2.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/ctype.h>
38
39 #include <sys/param.h>
40 #include <sys/condvar.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/socket.h>
44 #include <sys/endian.h>
45 #include <sys/limits.h>
46 #include <sys/proc.h>
47 #include <sys/signalvar.h>
48
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/rwlock.h>
52 #include <sys/queue.h>
53 #include <sys/taskqueue.h>
54 #include <sys/syslog.h>
55 #include <netinet/in.h>
56
57 #include <vm/vm.h>
58 #include <vm/pmap.h>
59
60 #include <linux/types.h>
61 #include <rdma/rdma_cm.h>
62
63 #include "getopt.h"
64 #include "krping.h"
65
/* Prefix placed in front of this module's debug messages. */
#define PFX "krping: "

/* Non-zero enables DEBUG_LOG output. */
static int debug = 0;

/*
 * printf-style debug output, emitted only when `debug` is non-zero.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement: used as the body of an unbraced "if", it can no longer
 * swallow the caller's "else".
 */
#define DEBUG_LOG(...) do { if (debug) printf(__VA_ARGS__); } while (0)
70
71 static const struct krping_option krping_opts[] = {
72         {"count", OPT_INT, 'C'},
73         {"size", OPT_INT, 'S'},
74         {"addr", OPT_STRING, 'a'},
75         {"port", OPT_INT, 'p'},
76         {"verbose", OPT_NOPARAM, 'v'},
77         {"validate", OPT_NOPARAM, 'V'},
78         {"server", OPT_NOPARAM, 's'},
79         {"client", OPT_NOPARAM, 'c'},
80         {"dmamr", OPT_NOPARAM, 'D'},
81         {"debug", OPT_NOPARAM, 'd'},
82         {"wlat", OPT_NOPARAM, 'l'},
83         {"rlat", OPT_NOPARAM, 'L'},
84         {"bw", OPT_NOPARAM, 'B'},
85         {"tx-depth", OPT_INT, 't'},
86         {"poll", OPT_NOPARAM, 'P'},
87         {"memlimit", OPT_INT, 'm'},
88         {NULL, 0, 0}
89 };
90
91 struct mtx krping_mutex;
92
93 /*
94  * List of running krping threads.
95  */
96 struct krping_cb_list krping_cbs;
97
/*
 * krping "ping/pong" loop:
 *      client sends source rkey/addr/len
 *      server receives source rkey/addr/len
 *      server rdma reads "ping" data from source
 *      server sends "go ahead" on rdma read completion
 *      client sends sink rkey/addr/len
 *      server receives sink rkey/addr/len
 *      server rdma writes "pong" data to sink
 *      server sends "go ahead" on rdma write completion
 *      <repeat loop>
 */
110
/*
 * Default max buffer size for IO...
 *
 * The value is parenthesized so the macro behaves as a single operand
 * wherever it is expanded (e.g. "x / RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128*1024)
#define RPING_SQ_DEPTH 32
116
117 static void krping_wait(struct krping_cb *cb, int state)
118 {
119         int rc;
120         mtx_lock(&cb->lock);
121         while (cb->state < state) {
122                 rc = msleep(cb, &cb->lock, 0, "krping", 0);
123                 if (rc && rc != ERESTART) {
124                         cb->state = ERROR;
125                         break;
126                 }
127         }
128         mtx_unlock(&cb->lock);
129 }
130
131 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
132                                    struct rdma_cm_event *event)
133 {
134         int ret;
135         struct krping_cb *cb = cma_id->context;
136
137         DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
138                   (cma_id == cb->cm_id) ? "parent" : "child");
139
140         mtx_lock(&cb->lock);
141         switch (event->event) {
142         case RDMA_CM_EVENT_ADDR_RESOLVED:
143                 cb->state = ADDR_RESOLVED;
144                 ret = rdma_resolve_route(cma_id, 2000);
145                 if (ret) {
146                         log(LOG_ERR, "rdma_resolve_route error %d\n", 
147                                ret);
148                         wakeup(cb);
149                 }
150                 break;
151
152         case RDMA_CM_EVENT_ROUTE_RESOLVED:
153                 cb->state = ROUTE_RESOLVED;
154                 wakeup(cb);
155                 break;
156
157         case RDMA_CM_EVENT_CONNECT_REQUEST:
158                 cb->state = CONNECT_REQUEST;
159                 cb->child_cm_id = cma_id;
160                 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
161                 wakeup(cb);
162                 break;
163
164         case RDMA_CM_EVENT_ESTABLISHED:
165                 DEBUG_LOG(PFX "ESTABLISHED\n");
166                 if (!cb->server) {
167                         cb->state = CONNECTED;
168                         wakeup(cb);
169                 }
170                 break;
171
172         case RDMA_CM_EVENT_ADDR_ERROR:
173         case RDMA_CM_EVENT_ROUTE_ERROR:
174         case RDMA_CM_EVENT_CONNECT_ERROR:
175         case RDMA_CM_EVENT_UNREACHABLE:
176         case RDMA_CM_EVENT_REJECTED:
177                 log(LOG_ERR, "cma event %d, error %d\n", event->event,
178                        event->status);
179                 cb->state = ERROR;
180                 wakeup(cb);
181                 break;
182
183         case RDMA_CM_EVENT_DISCONNECTED:
184                 DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
185                 cb->state = ERROR;
186                 wakeup(cb);
187                 break;
188
189         case RDMA_CM_EVENT_DEVICE_REMOVAL:
190                 DEBUG_LOG(PFX "cma detected device removal!!!!\n");
191                 break;
192
193         default:
194                 log(LOG_ERR, "oof bad type!\n");
195                 wakeup(cb);
196                 break;
197         }
198         mtx_unlock(&cb->lock);
199         return 0;
200 }
201
202 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
203 {
204         if (wc->byte_len != sizeof(cb->recv_buf)) {
205                 log(LOG_ERR, "Received bogus data, size %d\n", 
206                        wc->byte_len);
207                 return -1;
208         }
209
210         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
211         cb->remote_addr = ntohll(cb->recv_buf.buf);
212         cb->remote_len  = ntohl(cb->recv_buf.size);
213         DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
214                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
215                   cb->remote_len);
216
217         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
218                 cb->state = RDMA_READ_ADV;
219         else
220                 cb->state = RDMA_WRITE_ADV;
221
222         return 0;
223 }
224
225 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
226 {
227         if (wc->byte_len != sizeof(cb->recv_buf)) {
228                 log(LOG_ERR, "Received bogus data, size %d\n", 
229                        wc->byte_len);
230                 return -1;
231         }
232
233         if (cb->state == RDMA_READ_ADV)
234                 cb->state = RDMA_WRITE_ADV;
235         else
236                 cb->state = RDMA_WRITE_COMPLETE;
237
238         return 0;
239 }
240
241 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
242 {
243         struct krping_cb *cb = ctx;
244         struct ib_wc wc;
245         struct ib_recv_wr *bad_wr;
246         int ret;
247
248         mtx_lock(&cb->lock);
249         KASSERT(cb->cq == cq, ("bad condition"));
250         if (cb->state == ERROR) {
251                 log(LOG_ERR,  "cq completion in ERROR state\n");
252                 mtx_unlock(&cb->lock);
253                 return;
254         }
255         if (!cb->wlat && !cb->rlat && !cb->bw)
256                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
257         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
258                 if (wc.status) {
259                         if (wc.status == IB_WC_WR_FLUSH_ERR) {
260                                 DEBUG_LOG("cq flushed\n");
261                                 continue;
262                         } else {
263                                 log(LOG_CRIT, "cq completion failed status %d\n",
264                                         wc.status);
265                                 goto error;
266                         }
267                 }
268
269                 switch (wc.opcode) {
270                 case IB_WC_SEND:
271                         DEBUG_LOG(PFX "send completion\n");
272                         cb->stats.send_bytes += cb->send_sgl.length;
273                         cb->stats.send_msgs++;
274                         break;
275
276                 case IB_WC_RDMA_WRITE:
277                         DEBUG_LOG(PFX "rdma write completion\n");
278                         cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
279                         cb->stats.write_msgs++;
280                         cb->state = RDMA_WRITE_COMPLETE;
281                         wakeup(cb);
282                         break;
283
284                 case IB_WC_RDMA_READ:
285                         DEBUG_LOG(PFX "rdma read completion\n");
286                         cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
287                         cb->stats.read_msgs++;
288                         cb->state = RDMA_READ_COMPLETE;
289                         wakeup(cb);
290                         break;
291
292                 case IB_WC_RECV:
293                         DEBUG_LOG(PFX "recv completion\n");
294                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
295                         cb->stats.recv_msgs++;
296                         if (cb->wlat || cb->rlat || cb->bw)
297                                 ret = server_recv(cb, &wc);
298                         else
299                                 ret = cb->server ? server_recv(cb, &wc) :
300                                            client_recv(cb, &wc);
301                         if (ret) {
302                                 log(LOG_ERR, "recv wc error: %d\n", ret);
303                                 goto error;
304                         }
305
306                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
307                         if (ret) {
308                                 log(LOG_ERR, "post recv error: %d\n", 
309                                        ret);
310                                 goto error;
311                         }
312                         wakeup(cb);
313                         break;
314
315                 default:
316                         log(LOG_ERR, "unknown!!!!! completion\n");
317                         goto error;
318                 }
319         }
320         if (ret) {
321                 log(LOG_ERR, "poll error %d\n", ret);
322                 goto error;
323         }
324         mtx_unlock(&cb->lock);
325         return;
326 error:
327         cb->state = ERROR;
328         wakeup(cb);
329         mtx_unlock(&cb->lock);
330 }
331
332 static int krping_accept(struct krping_cb *cb)
333 {
334         struct rdma_conn_param conn_param;
335         int ret;
336
337         DEBUG_LOG(PFX "accepting client connection request\n");
338
339         memset(&conn_param, 0, sizeof conn_param);
340         conn_param.responder_resources = 1;
341         conn_param.initiator_depth = 1;
342
343         ret = rdma_accept(cb->child_cm_id, &conn_param);
344         if (ret) {
345                 log(LOG_ERR, "rdma_accept error: %d\n", ret);
346                 return ret;
347         }
348
349         if (!cb->wlat && !cb->rlat && !cb->bw) {
350                 krping_wait(cb, CONNECTED);
351                 if (cb->state == ERROR) {
352                         log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
353                         return -1;
354                 }
355         }
356         return 0;
357 }
358
359 static void krping_setup_wr(struct krping_cb *cb)
360 {
361         /* XXX X86 only here... not mapping for dma! */
362         cb->recv_sgl.addr = vtophys(&cb->recv_buf);
363         cb->recv_sgl.length = sizeof cb->recv_buf;
364         if (cb->use_dmamr)
365                 cb->recv_sgl.lkey = cb->dma_mr->lkey;
366         else
367                 cb->recv_sgl.lkey = cb->recv_mr->lkey;
368         cb->rq_wr.sg_list = &cb->recv_sgl;
369         cb->rq_wr.num_sge = 1;
370
371         cb->send_sgl.addr = vtophys(&cb->send_buf);
372         cb->send_sgl.length = sizeof cb->send_buf;
373         if (cb->use_dmamr)
374                 cb->send_sgl.lkey = cb->dma_mr->lkey;
375         else
376                 cb->send_sgl.lkey = cb->send_mr->lkey;
377
378         cb->sq_wr.opcode = IB_WR_SEND;
379         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
380         cb->sq_wr.sg_list = &cb->send_sgl;
381         cb->sq_wr.num_sge = 1;
382
383         cb->rdma_addr = vtophys(cb->rdma_buf);
384         cb->rdma_sgl.addr = cb->rdma_addr;
385         if (cb->use_dmamr)
386                 cb->rdma_sgl.lkey = cb->dma_mr->lkey;
387         else
388                 cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
389         cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
390         cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
391         cb->rdma_sq_wr.num_sge = 1;
392
393         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
394                 cb->start_addr = vtophys(cb->start_buf);
395         }
396 }
397
398 static int krping_setup_buffers(struct krping_cb *cb)
399 {
400         int ret;
401         struct ib_phys_buf buf;
402         u64 iovbase;
403
404         DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
405
406         if (cb->use_dmamr) {
407                 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
408                                            IB_ACCESS_REMOTE_READ|
409                                            IB_ACCESS_REMOTE_WRITE);
410                 if (IS_ERR(cb->dma_mr)) {
411                         log(LOG_ERR, "reg_dmamr failed\n");
412                         return PTR_ERR(cb->dma_mr);
413                 }
414         } else {
415
416                 buf.addr = vtophys(&cb->recv_buf);
417                 buf.size = sizeof cb->recv_buf;
418                 iovbase = vtophys(&cb->recv_buf);
419                 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
420                                              IB_ACCESS_LOCAL_WRITE, 
421                                              &iovbase);
422
423                 if (IS_ERR(cb->recv_mr)) {
424                         log(LOG_ERR, "recv_buf reg_mr failed\n");
425                         return PTR_ERR(cb->recv_mr);
426                 }
427
428                 buf.addr = vtophys(&cb->send_buf);
429                 buf.size = sizeof cb->send_buf;
430                 iovbase = vtophys(&cb->send_buf);
431                 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
432                                              0, &iovbase);
433
434                 if (IS_ERR(cb->send_mr)) {
435                         log(LOG_ERR, "send_buf reg_mr failed\n");
436                         ib_dereg_mr(cb->recv_mr);
437                         return PTR_ERR(cb->send_mr);
438                 }
439         }
440
441         /* RNIC adapters have a limit upto which it can register physical memory
442          * If DMA-MR memory mode is set then normally driver registers maximum
443          * supported memory. After that if contigmalloc allocates memory beyond the
444          * specified RNIC limit then Krping may not work.
445          */
446         if (cb->use_dmamr && cb->memlimit)
447                 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit,
448                                             PAGE_SIZE, 0);
449         else 
450                 cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
451                                             PAGE_SIZE, 0);
452
453         if (!cb->rdma_buf) {
454                 log(LOG_ERR, "rdma_buf malloc failed\n");
455                 ret = ENOMEM;
456                 goto err1;
457         }
458         if (!cb->use_dmamr) {
459
460                 buf.addr = vtophys(cb->rdma_buf);
461                 buf.size = cb->size;
462                 iovbase = vtophys(cb->rdma_buf);
463                 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
464                                              IB_ACCESS_REMOTE_READ| 
465                                              IB_ACCESS_REMOTE_WRITE, 
466                                              &iovbase);
467
468                 if (IS_ERR(cb->rdma_mr)) {
469                         log(LOG_ERR, "rdma_buf reg_mr failed\n");
470                         ret = PTR_ERR(cb->rdma_mr);
471                         goto err2;
472                 }
473         }
474
475         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
476                 if (cb->use_dmamr && cb->memlimit)
477                         cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
478                                                      0, cb->memlimit, PAGE_SIZE, 0);
479                 else
480                         cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
481                                                      0, -1UL, PAGE_SIZE, 0);
482                 if (!cb->start_buf) {
483                         log(LOG_ERR, "start_buf malloc failed\n");
484                         ret = ENOMEM;
485                         goto err2;
486                 }
487                 if (!cb->use_dmamr) {
488                         unsigned flags = IB_ACCESS_REMOTE_READ;
489
490                         if (cb->wlat || cb->rlat || cb->bw) 
491                                 flags |= IB_ACCESS_REMOTE_WRITE;
492                         buf.addr = vtophys(cb->start_buf);
493                         buf.size = cb->size;
494                         iovbase = vtophys(cb->start_buf);
495                         cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
496                                              flags,
497                                              &iovbase);
498
499                         if (IS_ERR(cb->start_mr)) {
500                                 log(LOG_ERR, "start_buf reg_mr failed\n");
501                                 ret = PTR_ERR(cb->start_mr);
502                                 goto err3;
503                         }
504                 }
505         }
506
507         krping_setup_wr(cb);
508         DEBUG_LOG(PFX "allocated & registered buffers...\n");
509         return 0;
510 err3:
511         contigfree(cb->start_buf, cb->size, M_DEVBUF);
512
513         if (!cb->use_dmamr)
514                 ib_dereg_mr(cb->rdma_mr);
515 err2:
516         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
517 err1:
518         if (cb->use_dmamr)
519                 ib_dereg_mr(cb->dma_mr);
520         else {
521                 ib_dereg_mr(cb->recv_mr);
522                 ib_dereg_mr(cb->send_mr);
523         }
524         return ret;
525 }
526
527 static void krping_free_buffers(struct krping_cb *cb)
528 {
529         DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
530         
531 #if 0
532         dma_unmap_single(cb->pd->device->dma_device,
533                          pci_unmap_addr(cb, recv_mapping),
534                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
535         dma_unmap_single(cb->pd->device->dma_device,
536                          pci_unmap_addr(cb, send_mapping),
537                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
538         dma_unmap_single(cb->pd->device->dma_device,
539                          pci_unmap_addr(cb, rdma_mapping),
540                          cb->size, DMA_BIDIRECTIONAL);
541 #endif
542         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
543         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
544 #if 0
545                 dma_unmap_single(cb->pd->device->dma_device,
546                          pci_unmap_addr(cb, start_mapping),
547                          cb->size, DMA_BIDIRECTIONAL);
548 #endif
549                 contigfree(cb->start_buf, cb->size, M_DEVBUF);
550         }
551         if (cb->use_dmamr)
552                 ib_dereg_mr(cb->dma_mr);
553         else {
554                 ib_dereg_mr(cb->send_mr);
555                 ib_dereg_mr(cb->recv_mr);
556                 ib_dereg_mr(cb->rdma_mr);
557                 if (!cb->server)
558                         ib_dereg_mr(cb->start_mr);
559         }
560 }
561
562 static int krping_create_qp(struct krping_cb *cb)
563 {
564         struct ib_qp_init_attr init_attr;
565         int ret;
566
567         memset(&init_attr, 0, sizeof(init_attr));
568         init_attr.cap.max_send_wr = cb->txdepth;
569         init_attr.cap.max_recv_wr = 2;
570         init_attr.cap.max_recv_sge = 1;
571         init_attr.cap.max_send_sge = 1;
572         init_attr.qp_type = IB_QPT_RC;
573         init_attr.send_cq = cb->cq;
574         init_attr.recv_cq = cb->cq;
575
576         if (cb->server) {
577                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
578                 if (!ret)
579                         cb->qp = cb->child_cm_id->qp;
580         } else {
581                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
582                 if (!ret)
583                         cb->qp = cb->cm_id->qp;
584         }
585
586         return ret;
587 }
588
589 static void krping_free_qp(struct krping_cb *cb)
590 {
591         ib_destroy_qp(cb->qp);
592         ib_destroy_cq(cb->cq);
593         ib_dealloc_pd(cb->pd);
594 }
595
596 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
597 {
598         int ret;
599         cb->pd = ib_alloc_pd(cm_id->device);
600         if (IS_ERR(cb->pd)) {
601                 log(LOG_ERR, "ib_alloc_pd failed\n");
602                 return PTR_ERR(cb->pd);
603         }
604         DEBUG_LOG(PFX "created pd %p\n", cb->pd);
605
606         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
607                               cb, cb->txdepth * 2, 0);
608         if (IS_ERR(cb->cq)) {
609                 log(LOG_ERR, "ib_create_cq failed\n");
610                 ret = PTR_ERR(cb->cq);
611                 goto err1;
612         }
613         DEBUG_LOG(PFX "created cq %p\n", cb->cq);
614
615         if (!cb->wlat && !cb->rlat && !cb->bw) {
616                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
617                 if (ret) {
618                         log(LOG_ERR, "ib_create_cq failed\n");
619                         goto err2;
620                 }
621         }
622
623         ret = krping_create_qp(cb);
624         if (ret) {
625                 log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
626                 goto err2;
627         }
628         DEBUG_LOG(PFX "created qp %p\n", cb->qp);
629         return 0;
630 err2:
631         ib_destroy_cq(cb->cq);
632 err1:
633         ib_dealloc_pd(cb->pd);
634         return ret;
635 }
636
637 static void krping_format_send(struct krping_cb *cb, u64 buf, 
638                                struct ib_mr *mr)
639 {
640         struct krping_rdma_info *info = &cb->send_buf;
641
642         info->buf = htonll(buf);
643         info->rkey = htonl(mr->rkey);
644         info->size = htonl(cb->size);
645
646         DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
647                   (unsigned long long)buf, mr->rkey, cb->size);
648 }
649
650 static void krping_test_server(struct krping_cb *cb)
651 {
652         struct ib_send_wr *bad_wr;
653         int ret;
654
655         while (1) {
656                 /* Wait for client's Start STAG/TO/Len */
657                 krping_wait(cb, RDMA_READ_ADV);
658                 if (cb->state != RDMA_READ_ADV) {
659                         DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
660                                 cb->state);
661                         break;
662                 }
663
664                 DEBUG_LOG(PFX "server received sink adv\n");
665
666                 /* Issue RDMA Read. */
667                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
668                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
669                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
670                 cb->rdma_sq_wr.sg_list->length = cb->remote_len;
671
672                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
673                 if (ret) {
674                         log(LOG_ERR, "post send error %d\n", ret);
675                         break;
676                 }
677                 DEBUG_LOG(PFX "server posted rdma read req \n");
678
679                 /* Wait for read completion */
680                 krping_wait(cb, RDMA_READ_COMPLETE);
681                 if (cb->state != RDMA_READ_COMPLETE) {
682                         log(LOG_ERR,  
683                                "wait for RDMA_READ_COMPLETE state %d\n",
684                                cb->state);
685                         break;
686                 }
687                 DEBUG_LOG(PFX "server received read complete\n");
688
689                 /* Display data in recv buf */
690                 if (cb->verbose)
691                         DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
692
693                 /* Tell client to continue */
694                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
695                 if (ret) {
696                         log(LOG_ERR, "post send error %d\n", ret);
697                         break;
698                 }
699                 DEBUG_LOG(PFX "server posted go ahead\n");
700
701                 /* Wait for client's RDMA STAG/TO/Len */
702                 krping_wait(cb, RDMA_WRITE_ADV);
703                 if (cb->state != RDMA_WRITE_ADV) {
704                         log(LOG_ERR,  
705                                "wait for RDMA_WRITE_ADV state %d\n",
706                                cb->state);
707                         break;
708                 }
709                 DEBUG_LOG(PFX "server received sink adv\n");
710
711                 /* RDMA Write echo data */
712                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
713                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
714                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
715                 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
716                 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
717                           cb->rdma_sq_wr.sg_list->lkey,
718                           (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
719                           cb->rdma_sq_wr.sg_list->length);
720
721                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
722                 if (ret) {
723                         log(LOG_ERR, "post send error %d\n", ret);
724                         break;
725                 }
726
727                 /* Wait for completion */
728                 krping_wait(cb, RDMA_WRITE_COMPLETE);
729                 if (cb->state != RDMA_WRITE_COMPLETE) {
730                         log(LOG_ERR,  
731                                "wait for RDMA_WRITE_COMPLETE state %d\n",
732                                cb->state);
733                         break;
734                 }
735                 DEBUG_LOG(PFX "server rdma write complete \n");
736
737                 cb->state = CONNECTED;
738
739                 /* Tell client to begin again */
740                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
741                 if (ret) {
742                         log(LOG_ERR, "post send error %d\n", ret);
743                         break;
744                 }
745                 DEBUG_LOG(PFX "server posted go ahead\n");
746         }
747 }
748
749 static void rlat_test(struct krping_cb *cb)
750 {
751         int scnt;
752         int iters = cb->count;
753         struct timeval start_tv, stop_tv;
754         int ret;
755         struct ib_wc wc;
756         struct ib_send_wr *bad_wr;
757         int ne;
758
759         scnt = 0;
760         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
761         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
762         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
763         cb->rdma_sq_wr.sg_list->length = cb->size;
764
765         microtime(&start_tv);
766         if (!cb->poll) {
767                 cb->state = RDMA_READ_ADV;
768                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
769         }
770         while (scnt < iters) {
771
772                 cb->state = RDMA_READ_ADV;
773                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
774                 if (ret) {
775                         log(LOG_ERR,  
776                                 "Couldn't post send: ret=%d scnt %d\n",
777                                 ret, scnt);
778                         return;
779                 }
780
781                 do {
782                         if (!cb->poll) {
783                                 krping_wait(cb, RDMA_READ_COMPLETE);
784                                 if (cb->state == RDMA_READ_COMPLETE) {
785                                         ne = 1;
786                                         ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
787                                 } else {
788                                         ne = -1;
789                                 }
790                         } else
791                                 ne = ib_poll_cq(cb->cq, 1, &wc);
792                         if (cb->state == ERROR) {
793                                 log(LOG_ERR, 
794                                        "state == ERROR...bailing scnt %d\n", scnt);
795                                 return;
796                         }
797                 } while (ne == 0);
798
799                 if (ne < 0) {
800                         log(LOG_ERR, "poll CQ failed %d\n", ne);
801                         return;
802                 }
803                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
804                         log(LOG_ERR, "Completion wth error at %s:\n",
805                                 cb->server ? "server" : "client");
806                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
807                                 wc.status, (int) wc.wr_id);
808                         return;
809                 }
810                 ++scnt;
811         }
812         microtime(&stop_tv);
813
814         if (stop_tv.tv_usec < start_tv.tv_usec) {
815                 stop_tv.tv_usec += 1000000;
816                 stop_tv.tv_sec  -= 1;
817         }
818
819         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
820                 stop_tv.tv_sec - start_tv.tv_sec, 
821                 stop_tv.tv_usec - start_tv.tv_usec,
822                 scnt, cb->size);
823 }
824
825 static int alloc_cycle_mem(int cycle_iters,
826                                 cycles_t **post_cycles_start,
827                                 cycles_t **post_cycles_stop,
828                                 cycles_t **poll_cycles_start,
829                                 cycles_t **poll_cycles_stop,
830                                 cycles_t **last_poll_cycles_start)
831 {
832         *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
833         if (!*post_cycles_start) {
834                 goto fail1;
835         }
836         *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
837         if (!*post_cycles_stop) {
838                 goto fail2;
839         }
840         *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
841         if (!*poll_cycles_start) {
842                 goto fail3;
843         }
844         *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
845         if (!*poll_cycles_stop) {
846                 goto fail4;
847         }
848         *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
849         if (!*last_poll_cycles_start) {
850                 goto fail5;
851         }
852         return 0;
853 fail5:
854         free(*poll_cycles_stop, M_DEVBUF);
855 fail4:
856         free(*poll_cycles_start, M_DEVBUF);
857 fail3:
858         free(*post_cycles_stop, M_DEVBUF);
859 fail2:
860         free(*post_cycles_start, M_DEVBUF);
861 fail1:
862         log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
863         return ENOMEM;
864 }
865
866 static void free_cycle_mem(cycles_t *post_cycles_start,
867                                 cycles_t *post_cycles_stop,
868                                 cycles_t *poll_cycles_start,
869                                 cycles_t *poll_cycles_stop,
870                                 cycles_t *last_poll_cycles_start)
871 {
872         free(last_poll_cycles_start, M_DEVBUF);
873         free(poll_cycles_stop, M_DEVBUF);
874         free(poll_cycles_start, M_DEVBUF);
875         free(post_cycles_stop, M_DEVBUF);
876         free(post_cycles_start, M_DEVBUF);
877 }
878
879 static void wlat_test(struct krping_cb *cb)
880 {
881         int ccnt, scnt, rcnt;
882         int iters=cb->count;
883         volatile char *poll_buf = (char *) cb->start_buf;
884         char *buf = (char *)cb->rdma_buf;
885         ccnt = 0;
886         scnt = 0;
887         rcnt = 0;
888         struct timeval start_tv, stop_tv;
889         cycles_t *post_cycles_start, *post_cycles_stop;
890         cycles_t *poll_cycles_start, *poll_cycles_stop;
891         cycles_t *last_poll_cycles_start;
892         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
893         int i;
894         int cycle_iters = 1000;
895         int err;
896
897         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
898                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
899                           
900         if (err) {
901                 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
902                 return;
903         }
904
905         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
906         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
907         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
908         cb->rdma_sq_wr.sg_list->length = cb->size;
909
910         if (cycle_iters > iters)
911                 cycle_iters = iters;
912         microtime(&start_tv);
913         while (scnt < iters || ccnt < iters || rcnt < iters) {
914
915                 /* Wait till buffer changes. */
916                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
917                         ++rcnt;
918                         while (*poll_buf != (char)rcnt) {
919                                 if (cb->state == ERROR) {
920                                         log(LOG_ERR, "state = ERROR, bailing\n");
921                                         return;
922                                 }
923                         }
924                 }
925
926                 if (scnt < iters) {
927                         struct ib_send_wr *bad_wr;
928
929                         *buf = (char)scnt+1;
930                         if (scnt < cycle_iters)
931                                 post_cycles_start[scnt] = get_cycles();
932                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
933                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
934                                         scnt);
935                                 return;
936                         }
937                         if (scnt < cycle_iters)
938                                 post_cycles_stop[scnt] = get_cycles();
939                         scnt++;
940                 }
941
942                 if (ccnt < iters) {
943                         struct ib_wc wc;
944                         int ne;
945
946                         if (ccnt < cycle_iters)
947                                 poll_cycles_start[ccnt] = get_cycles();
948                         do {
949                                 if (ccnt < cycle_iters)
950                                         last_poll_cycles_start[ccnt] = get_cycles();
951                                 ne = ib_poll_cq(cb->cq, 1, &wc);
952                         } while (ne == 0);
953                         if (ccnt < cycle_iters)
954                                 poll_cycles_stop[ccnt] = get_cycles();
955                         ++ccnt;
956
957                         if (ne < 0) {
958                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
959                                 return;
960                         }
961                         if (wc.status != IB_WC_SUCCESS) {
962                                 log(LOG_ERR, "Completion wth error at %s:\n",
963                                         cb->server ? "server" : "client");
964                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
965                                         wc.status, (int) wc.wr_id);
966                                 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
967                                         scnt, rcnt, ccnt);
968                                 return;
969                         }
970                 }
971         }
972         microtime(&stop_tv);
973
974         if (stop_tv.tv_usec < start_tv.tv_usec) {
975                 stop_tv.tv_usec += 1000000;
976                 stop_tv.tv_sec  -= 1;
977         }
978
979         for (i=0; i < cycle_iters; i++) {
980                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
981                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
982                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
983         }
984
985         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
986                 stop_tv.tv_sec - start_tv.tv_sec, 
987                 stop_tv.tv_usec - start_tv.tv_usec,
988                 scnt, cb->size, cycle_iters, 
989                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
990                 (unsigned long long)sum_last_poll);
991
992         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
993                         poll_cycles_stop, last_poll_cycles_start);
994 }
995
996 static void bw_test(struct krping_cb *cb)
997 {
998         int ccnt, scnt, rcnt;
999         int iters=cb->count;
1000         ccnt = 0;
1001         scnt = 0;
1002         rcnt = 0;
1003         struct timeval start_tv, stop_tv;
1004         cycles_t *post_cycles_start, *post_cycles_stop;
1005         cycles_t *poll_cycles_start, *poll_cycles_stop;
1006         cycles_t *last_poll_cycles_start;
1007         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1008         int i;
1009         int cycle_iters = 1000;
1010         int err;
1011
1012         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
1013                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
1014                           
1015         if (err) {
1016                 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
1017                 return;
1018         }
1019
1020         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1021         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1022         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1023         cb->rdma_sq_wr.sg_list->length = cb->size;
1024
1025         if (cycle_iters > iters)
1026                 cycle_iters = iters;
1027         microtime(&start_tv);
1028         while (scnt < iters || ccnt < iters) {
1029
1030                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1031                         struct ib_send_wr *bad_wr;
1032
1033                         if (scnt < cycle_iters)
1034                                 post_cycles_start[scnt] = get_cycles();
1035                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1036                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1037                                         scnt);
1038                                 return;
1039                         }
1040                         if (scnt < cycle_iters)
1041                                 post_cycles_stop[scnt] = get_cycles();
1042                         ++scnt;
1043                 }
1044
1045                 if (ccnt < iters) {
1046                         int ne;
1047                         struct ib_wc wc;
1048
1049                         if (ccnt < cycle_iters)
1050                                 poll_cycles_start[ccnt] = get_cycles();
1051                         do {
1052                                 if (ccnt < cycle_iters)
1053                                         last_poll_cycles_start[ccnt] = get_cycles();
1054                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1055                         } while (ne == 0);
1056                         if (ccnt < cycle_iters)
1057                                 poll_cycles_stop[ccnt] = get_cycles();
1058                         ccnt += 1;
1059
1060                         if (ne < 0) {
1061                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
1062                                 return;
1063                         }
1064                         if (wc.status != IB_WC_SUCCESS) {
1065                                 log(LOG_ERR, "Completion wth error at %s:\n",
1066                                         cb->server ? "server" : "client");
1067                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
1068                                         wc.status, (int) wc.wr_id);
1069                                 return;
1070                         }
1071                 }
1072         }
1073         microtime(&stop_tv);
1074
1075         if (stop_tv.tv_usec < start_tv.tv_usec) {
1076                 stop_tv.tv_usec += 1000000;
1077                 stop_tv.tv_sec  -= 1;
1078         }
1079
1080         for (i=0; i < cycle_iters; i++) {
1081                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1082                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1083                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1084         }
1085
1086         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1087                 stop_tv.tv_sec - start_tv.tv_sec, 
1088                 stop_tv.tv_usec - start_tv.tv_usec,
1089                 scnt, cb->size, cycle_iters, 
1090                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1091                 (unsigned long long)sum_last_poll);
1092
1093         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
1094                         poll_cycles_stop, last_poll_cycles_start);
1095 }
1096
1097 static void krping_rlat_test_server(struct krping_cb *cb)
1098 {
1099         struct ib_send_wr *bad_wr;
1100         struct ib_wc wc;
1101         int ret;
1102
1103         /* Spin waiting for client's Start STAG/TO/Len */
1104         while (cb->state < RDMA_READ_ADV) {
1105                 krping_cq_event_handler(cb->cq, cb);
1106         }
1107
1108         /* Send STAG/TO/Len to client */
1109         if (cb->dma_mr)
1110                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1111         else
1112                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1113         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1114         if (ret) {
1115                 log(LOG_ERR, "post send error %d\n", ret);
1116                 return;
1117         }
1118
1119         /* Spin waiting for send completion */
1120         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1121         if (ret < 0) {
1122                 log(LOG_ERR, "poll error %d\n", ret);
1123                 return;
1124         }
1125         if (wc.status) {
1126                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1127                 return;
1128         }
1129
1130         krping_wait(cb, ERROR);
1131 }
1132
1133 static void krping_wlat_test_server(struct krping_cb *cb)
1134 {
1135         struct ib_send_wr *bad_wr;
1136         struct ib_wc wc;
1137         int ret;
1138
1139         /* Spin waiting for client's Start STAG/TO/Len */
1140         while (cb->state < RDMA_READ_ADV) {
1141                 krping_cq_event_handler(cb->cq, cb);
1142         }
1143
1144         /* Send STAG/TO/Len to client */
1145         if (cb->dma_mr)
1146                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1147         else
1148                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1149         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1150         if (ret) {
1151                 log(LOG_ERR, "post send error %d\n", ret);
1152                 return;
1153         }
1154
1155         /* Spin waiting for send completion */
1156         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1157         if (ret < 0) {
1158                 log(LOG_ERR, "poll error %d\n", ret);
1159                 return;
1160         }
1161         if (wc.status) {
1162                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1163                 return;
1164         }
1165
1166         wlat_test(cb);
1167
1168 }
1169
1170 static void krping_bw_test_server(struct krping_cb *cb)
1171 {
1172         struct ib_send_wr *bad_wr;
1173         struct ib_wc wc;
1174         int ret;
1175
1176         /* Spin waiting for client's Start STAG/TO/Len */
1177         while (cb->state < RDMA_READ_ADV) {
1178                 krping_cq_event_handler(cb->cq, cb);
1179         }
1180
1181         /* Send STAG/TO/Len to client */
1182         if (cb->dma_mr)
1183                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1184         else
1185                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1186         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1187         if (ret) {
1188                 log(LOG_ERR, "post send error %d\n", ret);
1189                 return;
1190         }
1191
1192         /* Spin waiting for send completion */
1193         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1194         if (ret < 0) {
1195                 log(LOG_ERR, "poll error %d\n", ret);
1196                 return;
1197         }
1198         if (wc.status) {
1199                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1200                 return;
1201         }
1202
1203         if (cb->duplex)
1204                 bw_test(cb);
1205         krping_wait(cb, ERROR);
1206 }
1207
1208 static int krping_bind_server(struct krping_cb *cb)
1209 {
1210         struct sockaddr_in sin;
1211         int ret;
1212
1213         memset(&sin, 0, sizeof(sin));
1214         sin.sin_len = sizeof sin;
1215         sin.sin_family = AF_INET;
1216         sin.sin_addr.s_addr = cb->addr.s_addr;
1217         sin.sin_port = cb->port;
1218
1219         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1220         if (ret) {
1221                 log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1222                 return ret;
1223         }
1224         DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1225
1226         DEBUG_LOG(PFX "rdma_listen\n");
1227         ret = rdma_listen(cb->cm_id, 3);
1228         if (ret) {
1229                 log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1230                 return ret;
1231         }
1232
1233         krping_wait(cb, CONNECT_REQUEST);
1234         if (cb->state != CONNECT_REQUEST) {
1235                 log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1236                         cb->state);
1237                 return -1;
1238         }
1239
1240         return 0;
1241 }
1242
1243 static void krping_run_server(struct krping_cb *cb)
1244 {
1245         struct ib_recv_wr *bad_wr;
1246         int ret;
1247
1248         ret = krping_bind_server(cb);
1249         if (ret)
1250                 return;
1251
1252         ret = krping_setup_qp(cb, cb->child_cm_id);
1253         if (ret) {
1254                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1255                 return;
1256         }
1257
1258         ret = krping_setup_buffers(cb);
1259         if (ret) {
1260                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1261                 goto err1;
1262         }
1263
1264         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1265         if (ret) {
1266                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1267                 goto err2;
1268         }
1269
1270         ret = krping_accept(cb);
1271         if (ret) {
1272                 log(LOG_ERR, "connect error %d\n", ret);
1273                 goto err2;
1274         }
1275
1276         if (cb->wlat)
1277                 krping_wlat_test_server(cb);
1278         else if (cb->rlat)
1279                 krping_rlat_test_server(cb);
1280         else if (cb->bw)
1281                 krping_bw_test_server(cb);
1282         else
1283                 krping_test_server(cb);
1284
1285         rdma_disconnect(cb->child_cm_id);
1286         rdma_destroy_id(cb->child_cm_id);
1287 err2:
1288         krping_free_buffers(cb);
1289 err1:
1290         krping_free_qp(cb);
1291 }
1292
1293 static void krping_test_client(struct krping_cb *cb)
1294 {
1295         int ping, start, cc, i, ret;
1296         struct ib_send_wr *bad_wr;
1297         unsigned char c;
1298
1299         start = 65;
1300         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1301                 cb->state = RDMA_READ_ADV;
1302
1303                 /* Put some ascii text in the buffer. */
1304                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1305                 for (i = cc, c = start; i < cb->size; i++) {
1306                         cb->start_buf[i] = c;
1307                         c++;
1308                         if (c > 122)
1309                                 c = 65;
1310                 }
1311                 start++;
1312                 if (start > 122)
1313                         start = 65;
1314                 cb->start_buf[cb->size - 1] = 0;
1315
1316                 if (cb->dma_mr)
1317                         krping_format_send(cb, cb->start_addr, cb->dma_mr);
1318                 else
1319                         krping_format_send(cb, cb->start_addr, cb->start_mr);
1320
1321                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1322                 if (ret) {
1323                         log(LOG_ERR, "post send error %d\n", ret);
1324                         break;
1325                 }
1326
1327                 /* Wait for server to ACK */
1328                 krping_wait(cb, RDMA_WRITE_ADV);
1329                 if (cb->state != RDMA_WRITE_ADV) {
1330                         log(LOG_ERR,  
1331                                "wait for RDMA_WRITE_ADV state %d\n",
1332                                cb->state);
1333                         break;
1334                 }
1335
1336                 if (cb->dma_mr)
1337                         krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1338                 else
1339                         krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1340
1341                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1342                 if (ret) {
1343                         log(LOG_ERR, "post send error %d\n", ret);
1344                         break;
1345                 }
1346
1347                 /* Wait for the server to say the RDMA Write is complete. */
1348                 krping_wait(cb, RDMA_WRITE_COMPLETE);
1349                 if (cb->state != RDMA_WRITE_COMPLETE) {
1350                         log(LOG_ERR,  
1351                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1352                                cb->state);
1353                         break;
1354                 }
1355
1356                 if (cb->validate)
1357                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1358                                 log(LOG_ERR, "data mismatch!\n");
1359                                 break;
1360                         }
1361
1362                 if (cb->verbose)
1363                         DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1364         }
1365 }
1366
1367 static void krping_rlat_test_client(struct krping_cb *cb)
1368 {
1369         struct ib_send_wr *bad_wr;
1370         struct ib_wc wc;
1371         int ret;
1372
1373         cb->state = RDMA_READ_ADV;
1374
1375         /* Send STAG/TO/Len to client */
1376         if (cb->dma_mr)
1377                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1378         else
1379                 krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1380         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1381         if (ret) {
1382                 log(LOG_ERR, "post send error %d\n", ret);
1383                 return;
1384         }
1385
1386         /* Spin waiting for send completion */
1387         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1388         if (ret < 0) {
1389                 log(LOG_ERR, "poll error %d\n", ret);
1390                 return;
1391         }
1392         if (wc.status) {
1393                 log(LOG_ERR, "send completion error %d\n", wc.status);
1394                 return;
1395         }
1396
1397         /* Spin waiting for server's Start STAG/TO/Len */
1398         while (cb->state < RDMA_WRITE_ADV) {
1399                 krping_cq_event_handler(cb->cq, cb);
1400         }
1401
1402 #if 0
1403 {
1404         int i;
1405         struct timeval start, stop;
1406         time_t sec;
1407         suseconds_t usec;
1408         unsigned long long elapsed;
1409         struct ib_wc wc;
1410         struct ib_send_wr *bad_wr;
1411         int ne;
1412         
1413         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1414         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1415         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1416         cb->rdma_sq_wr.sg_list->length = 0;
1417         cb->rdma_sq_wr.num_sge = 0;
1418
1419         microtime(&start);
1420         for (i=0; i < 100000; i++) {
1421                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1422                         log(LOG_ERR,  "Couldn't post send\n");
1423                         return;
1424                 }
1425                 do {
1426                         ne = ib_poll_cq(cb->cq, 1, &wc);
1427                 } while (ne == 0);
1428                 if (ne < 0) {
1429                         log(LOG_ERR, "poll CQ failed %d\n", ne);
1430                         return;
1431                 }
1432                 if (wc.status != IB_WC_SUCCESS) {
1433                         log(LOG_ERR, "Completion wth error at %s:\n",
1434                                 cb->server ? "server" : "client");
1435                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
1436                                 wc.status, (int) wc.wr_id);
1437                         return;
1438                 }
1439         }
1440         microtime(&stop);
1441         
1442         if (stop.tv_usec < start.tv_usec) {
1443                 stop.tv_usec += 1000000;
1444                 stop.tv_sec  -= 1;
1445         }
1446         sec     = stop.tv_sec - start.tv_sec;
1447         usec    = stop.tv_usec - start.tv_usec;
1448         elapsed = sec * 1000000 + usec;
1449         log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1450 }
1451 #endif
1452
1453         rlat_test(cb);
1454 }
1455
1456 static void krping_wlat_test_client(struct krping_cb *cb)
1457 {
1458         struct ib_send_wr *bad_wr;
1459         struct ib_wc wc;
1460         int ret;
1461
1462         cb->state = RDMA_READ_ADV;
1463
1464         /* Send STAG/TO/Len to client */
1465         if (cb->dma_mr)
1466                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1467         else
1468                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1469         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1470         if (ret) {
1471                 log(LOG_ERR, "post send error %d\n", ret);
1472                 return;
1473         }
1474
1475         /* Spin waiting for send completion */
1476         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1477         if (ret < 0) {
1478                 log(LOG_ERR, "poll error %d\n", ret);
1479                 return;
1480         }
1481         if (wc.status) {
1482                 log(LOG_ERR, "send completion error %d\n", wc.status);
1483                 return;
1484         }
1485
1486         /* Spin waiting for server's Start STAG/TO/Len */
1487         while (cb->state < RDMA_WRITE_ADV) {
1488                 krping_cq_event_handler(cb->cq, cb);
1489         }
1490
1491         wlat_test(cb);
1492 }
1493
1494 static void krping_bw_test_client(struct krping_cb *cb)
1495 {
1496         struct ib_send_wr *bad_wr;
1497         struct ib_wc wc;
1498         int ret;
1499
1500         cb->state = RDMA_READ_ADV;
1501
1502         /* Send STAG/TO/Len to client */
1503         if (cb->dma_mr)
1504                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1505         else
1506                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1507         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1508         if (ret) {
1509                 log(LOG_ERR, "post send error %d\n", ret);
1510                 return;
1511         }
1512
1513         /* Spin waiting for send completion */
1514         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1515         if (ret < 0) {
1516                 log(LOG_ERR, "poll error %d\n", ret);
1517                 return;
1518         }
1519         if (wc.status) {
1520                 log(LOG_ERR, "send completion error %d\n", wc.status);
1521                 return;
1522         }
1523
1524         /* Spin waiting for server's Start STAG/TO/Len */
1525         while (cb->state < RDMA_WRITE_ADV) {
1526                 krping_cq_event_handler(cb->cq, cb);
1527         }
1528
1529         bw_test(cb);
1530 }
1531
1532 static int krping_connect_client(struct krping_cb *cb)
1533 {
1534         struct rdma_conn_param conn_param;
1535         int ret;
1536
1537         memset(&conn_param, 0, sizeof conn_param);
1538         conn_param.responder_resources = 1;
1539         conn_param.initiator_depth = 1;
1540         conn_param.retry_count = 10;
1541
1542         ret = rdma_connect(cb->cm_id, &conn_param);
1543         if (ret) {
1544                 log(LOG_ERR, "rdma_connect error %d\n", ret);
1545                 return ret;
1546         }
1547
1548         krping_wait(cb, CONNECTED);
1549         if (cb->state == ERROR) {
1550                 log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1551                 return -1;
1552         }
1553
1554         DEBUG_LOG(PFX "rdma_connect successful\n");
1555         return 0;
1556 }
1557
1558 static int krping_bind_client(struct krping_cb *cb)
1559 {
1560         struct sockaddr_in sin;
1561         int ret;
1562
1563         memset(&sin, 0, sizeof(sin));
1564         sin.sin_len = sizeof sin;
1565         sin.sin_family = AF_INET;
1566         sin.sin_addr.s_addr = cb->addr.s_addr;
1567         sin.sin_port = cb->port;
1568
1569         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1570                                 2000);
1571         if (ret) {
1572                 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1573                 return ret;
1574         }
1575
1576         krping_wait(cb, ROUTE_RESOLVED);
1577         if (cb->state != ROUTE_RESOLVED) {
1578                 log(LOG_ERR,  
1579                        "addr/route resolution did not resolve: state %d\n",
1580                        cb->state);
1581                 return EINTR;
1582         }
1583
1584         DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1585         return 0;
1586 }
1587
1588 static void krping_run_client(struct krping_cb *cb)
1589 {
1590         struct ib_recv_wr *bad_wr;
1591         int ret;
1592
1593         ret = krping_bind_client(cb);
1594         if (ret)
1595                 return;
1596
1597         ret = krping_setup_qp(cb, cb->cm_id);
1598         if (ret) {
1599                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1600                 return;
1601         }
1602
1603         ret = krping_setup_buffers(cb);
1604         if (ret) {
1605                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1606                 goto err1;
1607         }
1608
1609         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1610         if (ret) {
1611                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1612                 goto err2;
1613         }
1614
1615         ret = krping_connect_client(cb);
1616         if (ret) {
1617                 log(LOG_ERR, "connect error %d\n", ret);
1618                 goto err2;
1619         }
1620
1621         if (cb->wlat)
1622                 krping_wlat_test_client(cb);
1623         else if (cb->rlat)
1624                 krping_rlat_test_client(cb);
1625         else if (cb->bw)
1626                 krping_bw_test_client(cb);
1627         else
1628                 krping_test_client(cb);
1629         rdma_disconnect(cb->cm_id);
1630 err2:
1631         krping_free_buffers(cb);
1632 err1:
1633         krping_free_qp(cb);
1634 }
1635
1636 int krping_doit(char *cmd)
1637 {
1638         struct krping_cb *cb;
1639         int op;
1640         int ret = 0;
1641         char *optarg;
1642         unsigned long optint;
1643         debug = 0;
1644
1645         cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1646         if (!cb)
1647                 return ENOMEM;
1648         bzero(cb, sizeof *cb);
1649
1650         mtx_lock(&krping_mutex);
1651         TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1652         mtx_unlock(&krping_mutex);
1653
1654         cb->server = -1;
1655         cb->state = IDLE;
1656         cb->size = 64;
1657         cb->txdepth = RPING_SQ_DEPTH;
1658         cb->use_dmamr = 1;
1659         cb->memlimit = 0;
1660         mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1661
1662         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1663                               &optint)) != 0) {
1664                 switch (op) {
1665                 case 'a':
1666                         cb->addr_str = optarg;
1667                         DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1668                         if (!inet_aton(optarg, &cb->addr)) {
1669                                 log(LOG_ERR, "bad addr string %s\n", optarg);
1670                                 ret = EINVAL;
1671                         }
1672                         break;
1673                 case 'D':
1674                         cb->use_dmamr = 1;
1675                         DEBUG_LOG(PFX "using dma mr\n");
1676                         break;
1677                 case 'p':
1678                         cb->port = htons(optint);
1679                         DEBUG_LOG(PFX "port %d\n", (int)optint);
1680                         break;
1681                 case 'P':
1682                         cb->poll = 1;
1683                         DEBUG_LOG("server\n");
1684                         break;
1685                 case 's':
1686                         cb->server = 1;
1687                         DEBUG_LOG(PFX "server\n");
1688                         break;
1689                 case 'c':
1690                         cb->server = 0;
1691                         DEBUG_LOG(PFX "client\n");
1692                         break;
1693                 case 'S':
1694                         cb->size = optint;
1695                         if ((cb->size < 1) ||
1696                             (cb->size > RPING_BUFSIZE)) {
1697                                 log(LOG_ERR, "Invalid size %d "
1698                                        "(valid range is 1 to %d)\n",
1699                                        cb->size, RPING_BUFSIZE);
1700                                 ret = EINVAL;
1701                         } else
1702                                 DEBUG_LOG(PFX "size %d\n", (int)optint);
1703                         break;
1704                 case 'C':
1705                         cb->count = optint;
1706                         if (cb->count < 0) {
1707                                 log(LOG_ERR, "Invalid count %d\n",
1708                                         cb->count);
1709                                 ret = EINVAL;
1710                         } else
1711                                 DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1712                         break;
1713                 case 'v':
1714                         cb->verbose++;
1715                         DEBUG_LOG(PFX "verbose\n");
1716                         break;
1717                 case 'V':
1718                         cb->validate++;
1719                         DEBUG_LOG(PFX "validate data\n");
1720                         break;
1721                 case 'L':
1722                         cb->rlat++;
1723                         break;
1724                 case 'l':
1725                         cb->wlat++;
1726                         break;
1727                 case 'B':
1728                         cb->bw++;
1729                         break;
1730                 case 't':
1731                         cb->txdepth = optint;
1732                         DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1733                         break;
1734                 case 'd':
1735                         debug++;
1736                         break;
1737                 case 'm':
1738                         cb->memlimit = optint;
1739                         if (cb->memlimit < 1) {
1740                                 log(LOG_ERR, "Invalid memory limit %ju\n",
1741                                     cb->memlimit);
1742                                 ret = EINVAL;
1743                         } else
1744                                 DEBUG_LOG(PFX "memory limit %d\n", (int)optint);
1745                         break;
1746                 default:
1747                         log(LOG_ERR, "unknown opt %s\n", optarg);
1748                         ret = EINVAL;
1749                         break;
1750                 }
1751         }
1752         if (ret)
1753                 goto out;
1754
1755         if (cb->server == -1) {
1756                 log(LOG_ERR, "must be either client or server\n");
1757                 ret = EINVAL;
1758                 goto out;
1759         }
1760         if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1761                 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1762                 ret = EINVAL;
1763                 goto out;
1764         }
1765
1766
1767         cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1768         if (IS_ERR(cb->cm_id)) {
1769                 ret = PTR_ERR(cb->cm_id);
1770                 log(LOG_ERR, "rdma_create_id error %d\n", ret);
1771                 goto out;
1772         }
1773         DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1774         if (cb->server)
1775                 krping_run_server(cb);
1776         else
1777                 krping_run_client(cb);
1778         DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1779         rdma_destroy_id(cb->cm_id);
1780 out:
1781         mtx_lock(&krping_mutex);
1782         TAILQ_REMOVE(&krping_cbs, cb, list);
1783         mtx_unlock(&krping_mutex);
1784         free(cb, M_DEVBUF);
1785         return ret;
1786 }
1787
1788 void krping_init(void)
1789 {
1790         mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1791         TAILQ_INIT(&krping_cbs);
1792 }