]> CyberLeo.Net >> Repos - FreeBSD/releng/8.1.git/blob - sys/contrib/rdma/krping/krping.c
Copy stable/8 to releng/8.1 in preparation for 8.1-RC1.
[FreeBSD/releng/8.1.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/ctype.h>
38
39 #include <sys/param.h>
40 #include <sys/condvar.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/socket.h>
44 #include <sys/module.h>
45 #include <sys/endian.h>
46 #include <sys/limits.h>
47 #include <sys/proc.h>
48 #include <sys/signalvar.h>
49
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/rwlock.h>
53 #include <sys/queue.h>
54 #include <sys/taskqueue.h>
55 #include <sys/syslog.h>
56
57 #include <vm/vm.h>
58 #include <vm/pmap.h>
59
60 #include <contrib/rdma/rdma_cm.h>
61
62 #include "getopt.h"
63 #include "krping.h"
64
#define PFX "krping: "

/* DEBUG_LOG() prints only while this is non-zero. */
static int debug = 0;

/*
 * printf-style debug logging, gated on `debug`.
 *
 * The expansion is wrapped in do { } while (0) so that it always forms
 * exactly one statement.  A bare "if (debug) printf" expansion would
 * capture the `else` of any enclosing unbraced if/else.
 */
#define DEBUG_LOG(...) do { if (debug) printf(__VA_ARGS__); } while (0)
69
70 static const struct krping_option krping_opts[] = {
71         {"count", OPT_INT, 'C'},
72         {"size", OPT_INT, 'S'},
73         {"addr", OPT_STRING, 'a'},
74         {"port", OPT_INT, 'p'},
75         {"verbose", OPT_NOPARAM, 'v'},
76         {"validate", OPT_NOPARAM, 'V'},
77         {"server", OPT_NOPARAM, 's'},
78         {"client", OPT_NOPARAM, 'c'},
79         {"dmamr", OPT_NOPARAM, 'D'},
80         {"debug", OPT_NOPARAM, 'd'},
81         {"wlat", OPT_NOPARAM, 'l'},
82         {"rlat", OPT_NOPARAM, 'L'},
83         {"bw", OPT_NOPARAM, 'B'},
84         {"tx-depth", OPT_INT, 't'},
85         {"poll", OPT_NOPARAM, 'P'},
86         {NULL, 0, 0}
87 };
88
89 struct mtx krping_mutex;
90
91 /*
92  * List of running krping threads.
93  */
94 struct krping_cb_list krping_cbs;
95
96 /*
97  * krping "ping/pong" loop:
98  *      client sends source rkey/addr/len
99  *      server receives source rkey/addr/len
100  *      server rdma reads "ping" data from source
101  *      server sends "go ahead" on rdma read completion
102  *      client sends sink rkey/addr/len
103  *      server receives sink rkey/addr/len
104  *      server rdma writes "pong" data to sink
105  *      server sends "go ahead" on rdma write completion
106  *      <repeat loop>
107  */
108
/*
 * Default max buffer size for IO (128 KiB).  The expression is
 * parenthesized so the macro stays a single operand wherever it is
 * expanded (e.g. "x / RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128*1024)
/* Presumably the default send-queue depth -- confirm against its users. */
#define RPING_SQ_DEPTH 32
114
115 static void krping_wait(struct krping_cb *cb, int state)
116 {
117         int rc;
118         mtx_lock(&cb->lock);
119         while (cb->state < state) {
120                 rc = msleep(cb, &cb->lock, 0, "krping", 0);
121                 if (rc && rc != ERESTART) {
122                         cb->state = ERROR;
123                         break;
124                 }
125         }
126         mtx_unlock(&cb->lock);
127 }
128
129 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
130                                    struct rdma_cm_event *event)
131 {
132         int ret;
133         struct krping_cb *cb = cma_id->context;
134
135         DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
136                   (cma_id == cb->cm_id) ? "parent" : "child");
137
138         mtx_lock(&cb->lock);
139         switch (event->event) {
140         case RDMA_CM_EVENT_ADDR_RESOLVED:
141                 cb->state = ADDR_RESOLVED;
142                 ret = rdma_resolve_route(cma_id, 2000);
143                 if (ret) {
144                         log(LOG_ERR, "rdma_resolve_route error %d\n", 
145                                ret);
146                         wakeup(cb);
147                 }
148                 break;
149
150         case RDMA_CM_EVENT_ROUTE_RESOLVED:
151                 cb->state = ROUTE_RESOLVED;
152                 wakeup(cb);
153                 break;
154
155         case RDMA_CM_EVENT_CONNECT_REQUEST:
156                 cb->state = CONNECT_REQUEST;
157                 cb->child_cm_id = cma_id;
158                 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
159                 wakeup(cb);
160                 break;
161
162         case RDMA_CM_EVENT_ESTABLISHED:
163                 DEBUG_LOG(PFX "ESTABLISHED\n");
164                 if (!cb->server) {
165                         cb->state = CONNECTED;
166                         wakeup(cb);
167                 }
168                 break;
169
170         case RDMA_CM_EVENT_ADDR_ERROR:
171         case RDMA_CM_EVENT_ROUTE_ERROR:
172         case RDMA_CM_EVENT_CONNECT_ERROR:
173         case RDMA_CM_EVENT_UNREACHABLE:
174         case RDMA_CM_EVENT_REJECTED:
175                 log(LOG_ERR, "cma event %d, error %d\n", event->event,
176                        event->status);
177                 cb->state = ERROR;
178                 wakeup(cb);
179                 break;
180
181         case RDMA_CM_EVENT_DISCONNECTED:
182                 DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
183                 cb->state = ERROR;
184                 wakeup(cb);
185                 break;
186
187         case RDMA_CM_EVENT_DEVICE_REMOVAL:
188                 DEBUG_LOG(PFX "cma detected device removal!!!!\n");
189                 break;
190
191         default:
192                 log(LOG_ERR, "oof bad type!\n");
193                 wakeup(cb);
194                 break;
195         }
196         mtx_unlock(&cb->lock);
197         return 0;
198 }
199
200 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
201 {
202         if (wc->byte_len != sizeof(cb->recv_buf)) {
203                 log(LOG_ERR, "Received bogus data, size %d\n", 
204                        wc->byte_len);
205                 return -1;
206         }
207
208         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
209         cb->remote_addr = ntohll(cb->recv_buf.buf);
210         cb->remote_len  = ntohl(cb->recv_buf.size);
211         DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
212                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
213                   cb->remote_len);
214
215         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
216                 cb->state = RDMA_READ_ADV;
217         else
218                 cb->state = RDMA_WRITE_ADV;
219
220         return 0;
221 }
222
223 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
224 {
225         if (wc->byte_len != sizeof(cb->recv_buf)) {
226                 log(LOG_ERR, "Received bogus data, size %d\n", 
227                        wc->byte_len);
228                 return -1;
229         }
230
231         if (cb->state == RDMA_READ_ADV)
232                 cb->state = RDMA_WRITE_ADV;
233         else
234                 cb->state = RDMA_WRITE_COMPLETE;
235
236         return 0;
237 }
238
239 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
240 {
241         struct krping_cb *cb = ctx;
242         struct ib_wc wc;
243         struct ib_recv_wr *bad_wr;
244         int ret;
245
246         mtx_lock(&cb->lock);
247         KASSERT(cb->cq == cq, ("bad condition"));
248         if (cb->state == ERROR) {
249                 log(LOG_ERR,  "cq completion in ERROR state\n");
250                 mtx_unlock(&cb->lock);
251                 return;
252         }
253         if (!cb->wlat && !cb->rlat && !cb->bw)
254                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
255         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
256                 if (wc.status) {
257                         if (wc.status != IB_WC_WR_FLUSH_ERR)
258                                 log(LOG_ERR, "cq completion failed status %d\n",
259                                         wc.status);
260                         goto error;
261                 }
262
263                 switch (wc.opcode) {
264                 case IB_WC_SEND:
265                         DEBUG_LOG(PFX "send completion\n");
266                         cb->stats.send_bytes += cb->send_sgl.length;
267                         cb->stats.send_msgs++;
268                         break;
269
270                 case IB_WC_RDMA_WRITE:
271                         DEBUG_LOG(PFX "rdma write completion\n");
272                         cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
273                         cb->stats.write_msgs++;
274                         cb->state = RDMA_WRITE_COMPLETE;
275                         wakeup(cb);
276                         break;
277
278                 case IB_WC_RDMA_READ:
279                         DEBUG_LOG(PFX "rdma read completion\n");
280                         cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
281                         cb->stats.read_msgs++;
282                         cb->state = RDMA_READ_COMPLETE;
283                         wakeup(cb);
284                         break;
285
286                 case IB_WC_RECV:
287                         DEBUG_LOG(PFX "recv completion\n");
288                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
289                         cb->stats.recv_msgs++;
290                         if (cb->wlat || cb->rlat || cb->bw)
291                                 ret = server_recv(cb, &wc);
292                         else
293                                 ret = cb->server ? server_recv(cb, &wc) :
294                                            client_recv(cb, &wc);
295                         if (ret) {
296                                 log(LOG_ERR, "recv wc error: %d\n", ret);
297                                 goto error;
298                         }
299
300                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
301                         if (ret) {
302                                 log(LOG_ERR, "post recv error: %d\n", 
303                                        ret);
304                                 goto error;
305                         }
306                         wakeup(cb);
307                         break;
308
309                 default:
310                         log(LOG_ERR, "unknown!!!!! completion\n");
311                         goto error;
312                 }
313         }
314         if (ret) {
315                 log(LOG_ERR, "poll error %d\n", ret);
316                 goto error;
317         }
318         mtx_unlock(&cb->lock);
319         return;
320 error:
321         cb->state = ERROR;
322         wakeup(cb);
323         mtx_unlock(&cb->lock);
324 }
325
326 static int krping_accept(struct krping_cb *cb)
327 {
328         struct rdma_conn_param conn_param;
329         int ret;
330
331         DEBUG_LOG(PFX "accepting client connection request\n");
332
333         memset(&conn_param, 0, sizeof conn_param);
334         conn_param.responder_resources = 1;
335         conn_param.initiator_depth = 1;
336
337         ret = rdma_accept(cb->child_cm_id, &conn_param);
338         if (ret) {
339                 log(LOG_ERR, "rdma_accept error: %d\n", ret);
340                 return ret;
341         }
342
343         if (!cb->wlat && !cb->rlat && !cb->bw) {
344                 krping_wait(cb, CONNECTED);
345                 if (cb->state == ERROR) {
346                         log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
347                         return -1;
348                 }
349         }
350         return 0;
351 }
352
353 static void krping_setup_wr(struct krping_cb *cb)
354 {
355         /* XXX X86 only here... not mapping for dma! */
356         cb->recv_sgl.addr = vtophys(&cb->recv_buf);
357         cb->recv_sgl.length = sizeof cb->recv_buf;
358         if (cb->use_dmamr)
359                 cb->recv_sgl.lkey = cb->dma_mr->lkey;
360         else
361                 cb->recv_sgl.lkey = cb->recv_mr->lkey;
362         cb->rq_wr.sg_list = &cb->recv_sgl;
363         cb->rq_wr.num_sge = 1;
364
365         cb->send_sgl.addr = vtophys(&cb->send_buf);
366         cb->send_sgl.length = sizeof cb->send_buf;
367         if (cb->use_dmamr)
368                 cb->send_sgl.lkey = cb->dma_mr->lkey;
369         else
370                 cb->send_sgl.lkey = cb->send_mr->lkey;
371
372         cb->sq_wr.opcode = IB_WR_SEND;
373         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
374         cb->sq_wr.sg_list = &cb->send_sgl;
375         cb->sq_wr.num_sge = 1;
376
377         cb->rdma_addr = vtophys(cb->rdma_buf);
378         cb->rdma_sgl.addr = cb->rdma_addr;
379         if (cb->use_dmamr)
380                 cb->rdma_sgl.lkey = cb->dma_mr->lkey;
381         else
382                 cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
383         cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
384         cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
385         cb->rdma_sq_wr.num_sge = 1;
386
387         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
388                 cb->start_addr = vtophys(cb->start_buf);
389         }
390 }
391
392 static int krping_setup_buffers(struct krping_cb *cb)
393 {
394         int ret;
395         struct ib_phys_buf buf;
396         u64 iovbase;
397
398         DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
399
400         if (cb->use_dmamr) {
401                 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
402                                            IB_ACCESS_REMOTE_READ|
403                                            IB_ACCESS_REMOTE_WRITE);
404                 if (IS_ERR(cb->dma_mr)) {
405                         log(LOG_ERR, "reg_dmamr failed\n");
406                         return PTR_ERR(cb->dma_mr);
407                 }
408         } else {
409
410                 buf.addr = vtophys(&cb->recv_buf);
411                 buf.size = sizeof cb->recv_buf;
412                 iovbase = vtophys(&cb->recv_buf);
413                 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
414                                              IB_ACCESS_LOCAL_WRITE, 
415                                              &iovbase);
416
417                 if (IS_ERR(cb->recv_mr)) {
418                         log(LOG_ERR, "recv_buf reg_mr failed\n");
419                         return PTR_ERR(cb->recv_mr);
420                 }
421
422                 buf.addr = vtophys(&cb->send_buf);
423                 buf.size = sizeof cb->send_buf;
424                 iovbase = vtophys(&cb->send_buf);
425                 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
426                                              0, &iovbase);
427
428                 if (IS_ERR(cb->send_mr)) {
429                         log(LOG_ERR, "send_buf reg_mr failed\n");
430                         ib_dereg_mr(cb->recv_mr);
431                         return PTR_ERR(cb->send_mr);
432                 }
433         }
434
435         cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
436                 PAGE_SIZE, 0);
437
438         if (!cb->rdma_buf) {
439                 log(LOG_ERR, "rdma_buf malloc failed\n");
440                 ret = ENOMEM;
441                 goto err1;
442         }
443         if (!cb->use_dmamr) {
444
445                 buf.addr = vtophys(cb->rdma_buf);
446                 buf.size = cb->size;
447                 iovbase = vtophys(cb->rdma_buf);
448                 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
449                                              IB_ACCESS_REMOTE_READ| 
450                                              IB_ACCESS_REMOTE_WRITE, 
451                                              &iovbase);
452
453                 if (IS_ERR(cb->rdma_mr)) {
454                         log(LOG_ERR, "rdma_buf reg_mr failed\n");
455                         ret = PTR_ERR(cb->rdma_mr);
456                         goto err2;
457                 }
458         }
459
460         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
461                 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
462                         0, -1UL, PAGE_SIZE, 0);
463                 if (!cb->start_buf) {
464                         log(LOG_ERR, "start_buf malloc failed\n");
465                         ret = ENOMEM;
466                         goto err2;
467                 }
468                 if (!cb->use_dmamr) {
469                         unsigned flags = IB_ACCESS_REMOTE_READ;
470
471                         if (cb->wlat || cb->rlat || cb->bw) 
472                                 flags |= IB_ACCESS_REMOTE_WRITE;
473                         buf.addr = vtophys(cb->start_buf);
474                         buf.size = cb->size;
475                         iovbase = vtophys(cb->start_buf);
476                         cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
477                                              flags,
478                                              &iovbase);
479
480                         if (IS_ERR(cb->start_mr)) {
481                                 log(LOG_ERR, "start_buf reg_mr failed\n");
482                                 ret = PTR_ERR(cb->start_mr);
483                                 goto err3;
484                         }
485                 }
486         }
487
488         krping_setup_wr(cb);
489         DEBUG_LOG(PFX "allocated & registered buffers...\n");
490         return 0;
491 err3:
492         contigfree(cb->start_buf, cb->size, M_DEVBUF);
493
494         if (!cb->use_dmamr)
495                 ib_dereg_mr(cb->rdma_mr);
496 err2:
497         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
498 err1:
499         if (cb->use_dmamr)
500                 ib_dereg_mr(cb->dma_mr);
501         else {
502                 ib_dereg_mr(cb->recv_mr);
503                 ib_dereg_mr(cb->send_mr);
504         }
505         return ret;
506 }
507
508 static void krping_free_buffers(struct krping_cb *cb)
509 {
510         DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
511         
512 #if 0
513         dma_unmap_single(cb->pd->device->dma_device,
514                          pci_unmap_addr(cb, recv_mapping),
515                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
516         dma_unmap_single(cb->pd->device->dma_device,
517                          pci_unmap_addr(cb, send_mapping),
518                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
519         dma_unmap_single(cb->pd->device->dma_device,
520                          pci_unmap_addr(cb, rdma_mapping),
521                          cb->size, DMA_BIDIRECTIONAL);
522 #endif
523         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
524         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
525 #if 0
526                 dma_unmap_single(cb->pd->device->dma_device,
527                          pci_unmap_addr(cb, start_mapping),
528                          cb->size, DMA_BIDIRECTIONAL);
529 #endif
530                 contigfree(cb->start_buf, cb->size, M_DEVBUF);
531         }
532         if (cb->use_dmamr)
533                 ib_dereg_mr(cb->dma_mr);
534         else {
535                 ib_dereg_mr(cb->send_mr);
536                 ib_dereg_mr(cb->recv_mr);
537                 ib_dereg_mr(cb->rdma_mr);
538                 if (!cb->server)
539                         ib_dereg_mr(cb->start_mr);
540         }
541 }
542
543 static int krping_create_qp(struct krping_cb *cb)
544 {
545         struct ib_qp_init_attr init_attr;
546         int ret;
547
548         memset(&init_attr, 0, sizeof(init_attr));
549         init_attr.cap.max_send_wr = cb->txdepth;
550         init_attr.cap.max_recv_wr = 2;
551         init_attr.cap.max_recv_sge = 1;
552         init_attr.cap.max_send_sge = 1;
553         init_attr.qp_type = IB_QPT_RC;
554         init_attr.send_cq = cb->cq;
555         init_attr.recv_cq = cb->cq;
556
557         if (cb->server) {
558                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
559                 if (!ret)
560                         cb->qp = cb->child_cm_id->qp;
561         } else {
562                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
563                 if (!ret)
564                         cb->qp = cb->cm_id->qp;
565         }
566
567         return ret;
568 }
569
570 static void krping_free_qp(struct krping_cb *cb)
571 {
572         ib_destroy_qp(cb->qp);
573         ib_destroy_cq(cb->cq);
574         ib_dealloc_pd(cb->pd);
575 }
576
577 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
578 {
579         int ret;
580         cb->pd = ib_alloc_pd(cm_id->device);
581         if (IS_ERR(cb->pd)) {
582                 log(LOG_ERR, "ib_alloc_pd failed\n");
583                 return PTR_ERR(cb->pd);
584         }
585         DEBUG_LOG(PFX "created pd %p\n", cb->pd);
586
587         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
588                               cb, cb->txdepth * 2, 0);
589         if (IS_ERR(cb->cq)) {
590                 log(LOG_ERR, "ib_create_cq failed\n");
591                 ret = PTR_ERR(cb->cq);
592                 goto err1;
593         }
594         DEBUG_LOG(PFX "created cq %p\n", cb->cq);
595
596         if (!cb->wlat && !cb->rlat && !cb->bw) {
597                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
598                 if (ret) {
599                         log(LOG_ERR, "ib_create_cq failed\n");
600                         goto err2;
601                 }
602         }
603
604         ret = krping_create_qp(cb);
605         if (ret) {
606                 log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
607                 goto err2;
608         }
609         DEBUG_LOG(PFX "created qp %p\n", cb->qp);
610         return 0;
611 err2:
612         ib_destroy_cq(cb->cq);
613 err1:
614         ib_dealloc_pd(cb->pd);
615         return ret;
616 }
617
618 static void krping_format_send(struct krping_cb *cb, u64 buf, 
619                                struct ib_mr *mr)
620 {
621         struct krping_rdma_info *info = &cb->send_buf;
622
623         info->buf = htonll(buf);
624         info->rkey = htonl(mr->rkey);
625         info->size = htonl(cb->size);
626
627         DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
628                   (unsigned long long)buf, mr->rkey, cb->size);
629 }
630
631 static void krping_test_server(struct krping_cb *cb)
632 {
633         struct ib_send_wr *bad_wr;
634         int ret;
635
636         while (1) {
637                 /* Wait for client's Start STAG/TO/Len */
638                 krping_wait(cb, RDMA_READ_ADV);
639                 if (cb->state != RDMA_READ_ADV) {
640                         DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
641                                 cb->state);
642                         break;
643                 }
644
645                 DEBUG_LOG(PFX "server received sink adv\n");
646
647                 /* Issue RDMA Read. */
648                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
649                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
650                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
651                 cb->rdma_sq_wr.sg_list->length = cb->remote_len;
652
653                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
654                 if (ret) {
655                         log(LOG_ERR, "post send error %d\n", ret);
656                         break;
657                 }
658                 DEBUG_LOG(PFX "server posted rdma read req \n");
659
660                 /* Wait for read completion */
661                 krping_wait(cb, RDMA_READ_COMPLETE);
662                 if (cb->state != RDMA_READ_COMPLETE) {
663                         log(LOG_ERR,  
664                                "wait for RDMA_READ_COMPLETE state %d\n",
665                                cb->state);
666                         break;
667                 }
668                 DEBUG_LOG(PFX "server received read complete\n");
669
670                 /* Display data in recv buf */
671                 if (cb->verbose)
672                         DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
673
674                 /* Tell client to continue */
675                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
676                 if (ret) {
677                         log(LOG_ERR, "post send error %d\n", ret);
678                         break;
679                 }
680                 DEBUG_LOG(PFX "server posted go ahead\n");
681
682                 /* Wait for client's RDMA STAG/TO/Len */
683                 krping_wait(cb, RDMA_WRITE_ADV);
684                 if (cb->state != RDMA_WRITE_ADV) {
685                         log(LOG_ERR,  
686                                "wait for RDMA_WRITE_ADV state %d\n",
687                                cb->state);
688                         break;
689                 }
690                 DEBUG_LOG(PFX "server received sink adv\n");
691
692                 /* RDMA Write echo data */
693                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
694                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
695                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
696                 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
697                 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
698                           cb->rdma_sq_wr.sg_list->lkey,
699                           (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
700                           cb->rdma_sq_wr.sg_list->length);
701
702                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
703                 if (ret) {
704                         log(LOG_ERR, "post send error %d\n", ret);
705                         break;
706                 }
707
708                 /* Wait for completion */
709                 krping_wait(cb, RDMA_WRITE_COMPLETE);
710                 if (cb->state != RDMA_WRITE_COMPLETE) {
711                         log(LOG_ERR,  
712                                "wait for RDMA_WRITE_COMPLETE state %d\n",
713                                cb->state);
714                         break;
715                 }
716                 DEBUG_LOG(PFX "server rdma write complete \n");
717
718                 cb->state = CONNECTED;
719
720                 /* Tell client to begin again */
721                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
722                 if (ret) {
723                         log(LOG_ERR, "post send error %d\n", ret);
724                         break;
725                 }
726                 DEBUG_LOG(PFX "server posted go ahead\n");
727         }
728 }
729
730 static void rlat_test(struct krping_cb *cb)
731 {
732         int scnt;
733         int iters = cb->count;
734         struct timeval start_tv, stop_tv;
735         int ret;
736         struct ib_wc wc;
737         struct ib_send_wr *bad_wr;
738         int ne;
739
740         scnt = 0;
741         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
742         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
743         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
744         cb->rdma_sq_wr.sg_list->length = cb->size;
745
746         microtime(&start_tv);
747         if (!cb->poll) {
748                 cb->state = RDMA_READ_ADV;
749                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
750         }
751         while (scnt < iters) {
752
753                 cb->state = RDMA_READ_ADV;
754                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
755                 if (ret) {
756                         log(LOG_ERR,  
757                                 "Couldn't post send: ret=%d scnt %d\n",
758                                 ret, scnt);
759                         return;
760                 }
761
762                 do {
763                         if (!cb->poll) {
764                                 krping_wait(cb, RDMA_READ_COMPLETE);
765                                 if (cb->state == RDMA_READ_COMPLETE) {
766                                         ne = 1;
767                                         ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
768                                 } else {
769                                         ne = -1;
770                                 }
771                         } else
772                                 ne = ib_poll_cq(cb->cq, 1, &wc);
773                         if (cb->state == ERROR) {
774                                 log(LOG_ERR, 
775                                        "state == ERROR...bailing scnt %d\n", scnt);
776                                 return;
777                         }
778                 } while (ne == 0);
779
780                 if (ne < 0) {
781                         log(LOG_ERR, "poll CQ failed %d\n", ne);
782                         return;
783                 }
784                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
785                         log(LOG_ERR, "Completion wth error at %s:\n",
786                                 cb->server ? "server" : "client");
787                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
788                                 wc.status, (int) wc.wr_id);
789                         return;
790                 }
791                 ++scnt;
792         }
793         microtime(&stop_tv);
794
795         if (stop_tv.tv_usec < start_tv.tv_usec) {
796                 stop_tv.tv_usec += 1000000;
797                 stop_tv.tv_sec  -= 1;
798         }
799
800         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
801                 stop_tv.tv_sec - start_tv.tv_sec, 
802                 stop_tv.tv_usec - start_tv.tv_usec,
803                 scnt, cb->size);
804 }
805
806 static int alloc_cycle_mem(int cycle_iters,
807                                 cycles_t **post_cycles_start,
808                                 cycles_t **post_cycles_stop,
809                                 cycles_t **poll_cycles_start,
810                                 cycles_t **poll_cycles_stop,
811                                 cycles_t **last_poll_cycles_start)
812 {
813         *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
814         if (!*post_cycles_start) {
815                 goto fail1;
816         }
817         *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
818         if (!*post_cycles_stop) {
819                 goto fail2;
820         }
821         *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
822         if (!*poll_cycles_start) {
823                 goto fail3;
824         }
825         *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
826         if (!*poll_cycles_stop) {
827                 goto fail4;
828         }
829         *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
830         if (!*last_poll_cycles_start) {
831                 goto fail5;
832         }
833         return 0;
834 fail5:
835         free(*poll_cycles_stop, M_DEVBUF);
836 fail4:
837         free(*poll_cycles_start, M_DEVBUF);
838 fail3:
839         free(*post_cycles_stop, M_DEVBUF);
840 fail2:
841         free(*post_cycles_start, M_DEVBUF);
842 fail1:
843         log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
844         return ENOMEM;
845 }
846
847 static void free_cycle_mem(cycles_t *post_cycles_start,
848                                 cycles_t *post_cycles_stop,
849                                 cycles_t *poll_cycles_start,
850                                 cycles_t *poll_cycles_stop,
851                                 cycles_t *last_poll_cycles_start)
852 {
853         free(last_poll_cycles_start, M_DEVBUF);
854         free(poll_cycles_stop, M_DEVBUF);
855         free(poll_cycles_start, M_DEVBUF);
856         free(post_cycles_stop, M_DEVBUF);
857         free(post_cycles_start, M_DEVBUF);
858 }
859
860 static void wlat_test(struct krping_cb *cb)
861 {
862         int ccnt, scnt, rcnt;
863         int iters=cb->count;
864         volatile char *poll_buf = (char *) cb->start_buf;
865         char *buf = (char *)cb->rdma_buf;
866         ccnt = 0;
867         scnt = 0;
868         rcnt = 0;
869         struct timeval start_tv, stop_tv;
870         cycles_t *post_cycles_start, *post_cycles_stop;
871         cycles_t *poll_cycles_start, *poll_cycles_stop;
872         cycles_t *last_poll_cycles_start;
873         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
874         int i;
875         int cycle_iters = 1000;
876         int err;
877
878         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
879                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
880                           
881         if (err) {
882                 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
883                 return;
884         }
885
886         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
887         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
888         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
889         cb->rdma_sq_wr.sg_list->length = cb->size;
890
891         if (cycle_iters > iters)
892                 cycle_iters = iters;
893         microtime(&start_tv);
894         while (scnt < iters || ccnt < iters || rcnt < iters) {
895
896                 /* Wait till buffer changes. */
897                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
898                         ++rcnt;
899                         while (*poll_buf != (char)rcnt) {
900                                 if (cb->state == ERROR) {
901                                         log(LOG_ERR, "state = ERROR, bailing\n");
902                                         return;
903                                 }
904                         }
905                 }
906
907                 if (scnt < iters) {
908                         struct ib_send_wr *bad_wr;
909
910                         *buf = (char)scnt+1;
911                         if (scnt < cycle_iters)
912                                 post_cycles_start[scnt] = get_cycles();
913                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
914                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
915                                         scnt);
916                                 return;
917                         }
918                         if (scnt < cycle_iters)
919                                 post_cycles_stop[scnt] = get_cycles();
920                         scnt++;
921                 }
922
923                 if (ccnt < iters) {
924                         struct ib_wc wc;
925                         int ne;
926
927                         if (ccnt < cycle_iters)
928                                 poll_cycles_start[ccnt] = get_cycles();
929                         do {
930                                 if (ccnt < cycle_iters)
931                                         last_poll_cycles_start[ccnt] = get_cycles();
932                                 ne = ib_poll_cq(cb->cq, 1, &wc);
933                         } while (ne == 0);
934                         if (ccnt < cycle_iters)
935                                 poll_cycles_stop[ccnt] = get_cycles();
936                         ++ccnt;
937
938                         if (ne < 0) {
939                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
940                                 return;
941                         }
942                         if (wc.status != IB_WC_SUCCESS) {
943                                 log(LOG_ERR, "Completion wth error at %s:\n",
944                                         cb->server ? "server" : "client");
945                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
946                                         wc.status, (int) wc.wr_id);
947                                 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
948                                         scnt, rcnt, ccnt);
949                                 return;
950                         }
951                 }
952         }
953         microtime(&stop_tv);
954
955         if (stop_tv.tv_usec < start_tv.tv_usec) {
956                 stop_tv.tv_usec += 1000000;
957                 stop_tv.tv_sec  -= 1;
958         }
959
960         for (i=0; i < cycle_iters; i++) {
961                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
962                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
963                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
964         }
965
966         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
967                 stop_tv.tv_sec - start_tv.tv_sec, 
968                 stop_tv.tv_usec - start_tv.tv_usec,
969                 scnt, cb->size, cycle_iters, 
970                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
971                 (unsigned long long)sum_last_poll);
972
973         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
974                         poll_cycles_stop, last_poll_cycles_start);
975 }
976
977 static void bw_test(struct krping_cb *cb)
978 {
979         int ccnt, scnt, rcnt;
980         int iters=cb->count;
981         ccnt = 0;
982         scnt = 0;
983         rcnt = 0;
984         struct timeval start_tv, stop_tv;
985         cycles_t *post_cycles_start, *post_cycles_stop;
986         cycles_t *poll_cycles_start, *poll_cycles_stop;
987         cycles_t *last_poll_cycles_start;
988         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
989         int i;
990         int cycle_iters = 1000;
991         int err;
992
993         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
994                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
995                           
996         if (err) {
997                 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
998                 return;
999         }
1000
1001         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1002         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1003         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1004         cb->rdma_sq_wr.sg_list->length = cb->size;
1005
1006         if (cycle_iters > iters)
1007                 cycle_iters = iters;
1008         microtime(&start_tv);
1009         while (scnt < iters || ccnt < iters) {
1010
1011                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1012                         struct ib_send_wr *bad_wr;
1013
1014                         if (scnt < cycle_iters)
1015                                 post_cycles_start[scnt] = get_cycles();
1016                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1017                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1018                                         scnt);
1019                                 return;
1020                         }
1021                         if (scnt < cycle_iters)
1022                                 post_cycles_stop[scnt] = get_cycles();
1023                         ++scnt;
1024                 }
1025
1026                 if (ccnt < iters) {
1027                         int ne;
1028                         struct ib_wc wc;
1029
1030                         if (ccnt < cycle_iters)
1031                                 poll_cycles_start[ccnt] = get_cycles();
1032                         do {
1033                                 if (ccnt < cycle_iters)
1034                                         last_poll_cycles_start[ccnt] = get_cycles();
1035                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1036                         } while (ne == 0);
1037                         if (ccnt < cycle_iters)
1038                                 poll_cycles_stop[ccnt] = get_cycles();
1039                         ccnt += 1;
1040
1041                         if (ne < 0) {
1042                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
1043                                 return;
1044                         }
1045                         if (wc.status != IB_WC_SUCCESS) {
1046                                 log(LOG_ERR, "Completion wth error at %s:\n",
1047                                         cb->server ? "server" : "client");
1048                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
1049                                         wc.status, (int) wc.wr_id);
1050                                 return;
1051                         }
1052                 }
1053         }
1054         microtime(&stop_tv);
1055
1056         if (stop_tv.tv_usec < start_tv.tv_usec) {
1057                 stop_tv.tv_usec += 1000000;
1058                 stop_tv.tv_sec  -= 1;
1059         }
1060
1061         for (i=0; i < cycle_iters; i++) {
1062                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1063                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1064                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1065         }
1066
1067         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1068                 stop_tv.tv_sec - start_tv.tv_sec, 
1069                 stop_tv.tv_usec - start_tv.tv_usec,
1070                 scnt, cb->size, cycle_iters, 
1071                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1072                 (unsigned long long)sum_last_poll);
1073
1074         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
1075                         poll_cycles_stop, last_poll_cycles_start);
1076 }
1077
1078 static void krping_rlat_test_server(struct krping_cb *cb)
1079 {
1080         struct ib_send_wr *bad_wr;
1081         struct ib_wc wc;
1082         int ret;
1083
1084         /* Spin waiting for client's Start STAG/TO/Len */
1085         while (cb->state < RDMA_READ_ADV) {
1086                 krping_cq_event_handler(cb->cq, cb);
1087         }
1088
1089         /* Send STAG/TO/Len to client */
1090         if (cb->dma_mr)
1091                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1092         else
1093                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1094         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1095         if (ret) {
1096                 log(LOG_ERR, "post send error %d\n", ret);
1097                 return;
1098         }
1099
1100         /* Spin waiting for send completion */
1101         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1102         if (ret < 0) {
1103                 log(LOG_ERR, "poll error %d\n", ret);
1104                 return;
1105         }
1106         if (wc.status) {
1107                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1108                 return;
1109         }
1110
1111         krping_wait(cb, ERROR);
1112 }
1113
1114 static void krping_wlat_test_server(struct krping_cb *cb)
1115 {
1116         struct ib_send_wr *bad_wr;
1117         struct ib_wc wc;
1118         int ret;
1119
1120         /* Spin waiting for client's Start STAG/TO/Len */
1121         while (cb->state < RDMA_READ_ADV) {
1122                 krping_cq_event_handler(cb->cq, cb);
1123         }
1124
1125         /* Send STAG/TO/Len to client */
1126         if (cb->dma_mr)
1127                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1128         else
1129                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1130         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1131         if (ret) {
1132                 log(LOG_ERR, "post send error %d\n", ret);
1133                 return;
1134         }
1135
1136         /* Spin waiting for send completion */
1137         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1138         if (ret < 0) {
1139                 log(LOG_ERR, "poll error %d\n", ret);
1140                 return;
1141         }
1142         if (wc.status) {
1143                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1144                 return;
1145         }
1146
1147         wlat_test(cb);
1148
1149 }
1150
1151 static void krping_bw_test_server(struct krping_cb *cb)
1152 {
1153         struct ib_send_wr *bad_wr;
1154         struct ib_wc wc;
1155         int ret;
1156
1157         /* Spin waiting for client's Start STAG/TO/Len */
1158         while (cb->state < RDMA_READ_ADV) {
1159                 krping_cq_event_handler(cb->cq, cb);
1160         }
1161
1162         /* Send STAG/TO/Len to client */
1163         if (cb->dma_mr)
1164                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1165         else
1166                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1167         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1168         if (ret) {
1169                 log(LOG_ERR, "post send error %d\n", ret);
1170                 return;
1171         }
1172
1173         /* Spin waiting for send completion */
1174         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1175         if (ret < 0) {
1176                 log(LOG_ERR, "poll error %d\n", ret);
1177                 return;
1178         }
1179         if (wc.status) {
1180                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1181                 return;
1182         }
1183
1184         if (cb->duplex)
1185                 bw_test(cb);
1186         krping_wait(cb, ERROR);
1187 }
1188
1189 static int krping_bind_server(struct krping_cb *cb)
1190 {
1191         struct sockaddr_in sin;
1192         int ret;
1193
1194         memset(&sin, 0, sizeof(sin));
1195         sin.sin_len = sizeof sin;
1196         sin.sin_family = AF_INET;
1197         sin.sin_addr.s_addr = cb->addr.s_addr;
1198         sin.sin_port = cb->port;
1199
1200         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1201         if (ret) {
1202                 log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1203                 return ret;
1204         }
1205         DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1206
1207         DEBUG_LOG(PFX "rdma_listen\n");
1208         ret = rdma_listen(cb->cm_id, 3);
1209         if (ret) {
1210                 log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1211                 return ret;
1212         }
1213
1214         krping_wait(cb, CONNECT_REQUEST);
1215         if (cb->state != CONNECT_REQUEST) {
1216                 log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1217                         cb->state);
1218                 return -1;
1219         }
1220
1221         return 0;
1222 }
1223
1224 static void krping_run_server(struct krping_cb *cb)
1225 {
1226         struct ib_recv_wr *bad_wr;
1227         int ret;
1228
1229         ret = krping_bind_server(cb);
1230         if (ret)
1231                 return;
1232
1233         ret = krping_setup_qp(cb, cb->child_cm_id);
1234         if (ret) {
1235                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1236                 return;
1237         }
1238
1239         ret = krping_setup_buffers(cb);
1240         if (ret) {
1241                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1242                 goto err1;
1243         }
1244
1245         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1246         if (ret) {
1247                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1248                 goto err2;
1249         }
1250
1251         ret = krping_accept(cb);
1252         if (ret) {
1253                 log(LOG_ERR, "connect error %d\n", ret);
1254                 goto err2;
1255         }
1256
1257         if (cb->wlat)
1258                 krping_wlat_test_server(cb);
1259         else if (cb->rlat)
1260                 krping_rlat_test_server(cb);
1261         else if (cb->bw)
1262                 krping_bw_test_server(cb);
1263         else
1264                 krping_test_server(cb);
1265
1266         rdma_disconnect(cb->child_cm_id);
1267         rdma_destroy_id(cb->child_cm_id);
1268 err2:
1269         krping_free_buffers(cb);
1270 err1:
1271         krping_free_qp(cb);
1272 }
1273
1274 static void krping_test_client(struct krping_cb *cb)
1275 {
1276         int ping, start, cc, i, ret;
1277         struct ib_send_wr *bad_wr;
1278         unsigned char c;
1279
1280         start = 65;
1281         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1282                 cb->state = RDMA_READ_ADV;
1283
1284                 /* Put some ascii text in the buffer. */
1285                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1286                 for (i = cc, c = start; i < cb->size; i++) {
1287                         cb->start_buf[i] = c;
1288                         c++;
1289                         if (c > 122)
1290                                 c = 65;
1291                 }
1292                 start++;
1293                 if (start > 122)
1294                         start = 65;
1295                 cb->start_buf[cb->size - 1] = 0;
1296
1297                 if (cb->dma_mr)
1298                         krping_format_send(cb, cb->start_addr, cb->dma_mr);
1299                 else
1300                         krping_format_send(cb, cb->start_addr, cb->start_mr);
1301
1302                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1303                 if (ret) {
1304                         log(LOG_ERR, "post send error %d\n", ret);
1305                         break;
1306                 }
1307
1308                 /* Wait for server to ACK */
1309                 krping_wait(cb, RDMA_WRITE_ADV);
1310                 if (cb->state != RDMA_WRITE_ADV) {
1311                         log(LOG_ERR,  
1312                                "wait for RDMA_WRITE_ADV state %d\n",
1313                                cb->state);
1314                         break;
1315                 }
1316
1317                 if (cb->dma_mr)
1318                         krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1319                 else
1320                         krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1321
1322                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1323                 if (ret) {
1324                         log(LOG_ERR, "post send error %d\n", ret);
1325                         break;
1326                 }
1327
1328                 /* Wait for the server to say the RDMA Write is complete. */
1329                 krping_wait(cb, RDMA_WRITE_COMPLETE);
1330                 if (cb->state != RDMA_WRITE_COMPLETE) {
1331                         log(LOG_ERR,  
1332                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1333                                cb->state);
1334                         break;
1335                 }
1336
1337                 if (cb->validate)
1338                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1339                                 log(LOG_ERR, "data mismatch!\n");
1340                                 break;
1341                         }
1342
1343                 if (cb->verbose)
1344                         DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1345         }
1346 }
1347
1348 static void krping_rlat_test_client(struct krping_cb *cb)
1349 {
1350         struct ib_send_wr *bad_wr;
1351         struct ib_wc wc;
1352         int ret;
1353
1354         cb->state = RDMA_READ_ADV;
1355
1356         /* Send STAG/TO/Len to client */
1357         if (cb->dma_mr)
1358                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1359         else
1360                 krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1361         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1362         if (ret) {
1363                 log(LOG_ERR, "post send error %d\n", ret);
1364                 return;
1365         }
1366
1367         /* Spin waiting for send completion */
1368         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1369         if (ret < 0) {
1370                 log(LOG_ERR, "poll error %d\n", ret);
1371                 return;
1372         }
1373         if (wc.status) {
1374                 log(LOG_ERR, "send completion error %d\n", wc.status);
1375                 return;
1376         }
1377
1378         /* Spin waiting for server's Start STAG/TO/Len */
1379         while (cb->state < RDMA_WRITE_ADV) {
1380                 krping_cq_event_handler(cb->cq, cb);
1381         }
1382
1383 #if 0
1384 {
1385         int i;
1386         struct timeval start, stop;
1387         time_t sec;
1388         suseconds_t usec;
1389         unsigned long long elapsed;
1390         struct ib_wc wc;
1391         struct ib_send_wr *bad_wr;
1392         int ne;
1393         
1394         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1395         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1396         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1397         cb->rdma_sq_wr.sg_list->length = 0;
1398         cb->rdma_sq_wr.num_sge = 0;
1399
1400         microtime(&start);
1401         for (i=0; i < 100000; i++) {
1402                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1403                         log(LOG_ERR,  "Couldn't post send\n");
1404                         return;
1405                 }
1406                 do {
1407                         ne = ib_poll_cq(cb->cq, 1, &wc);
1408                 } while (ne == 0);
1409                 if (ne < 0) {
1410                         log(LOG_ERR, "poll CQ failed %d\n", ne);
1411                         return;
1412                 }
1413                 if (wc.status != IB_WC_SUCCESS) {
1414                         log(LOG_ERR, "Completion wth error at %s:\n",
1415                                 cb->server ? "server" : "client");
1416                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
1417                                 wc.status, (int) wc.wr_id);
1418                         return;
1419                 }
1420         }
1421         microtime(&stop);
1422         
1423         if (stop.tv_usec < start.tv_usec) {
1424                 stop.tv_usec += 1000000;
1425                 stop.tv_sec  -= 1;
1426         }
1427         sec     = stop.tv_sec - start.tv_sec;
1428         usec    = stop.tv_usec - start.tv_usec;
1429         elapsed = sec * 1000000 + usec;
1430         log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1431 }
1432 #endif
1433
1434         rlat_test(cb);
1435 }
1436
1437 static void krping_wlat_test_client(struct krping_cb *cb)
1438 {
1439         struct ib_send_wr *bad_wr;
1440         struct ib_wc wc;
1441         int ret;
1442
1443         cb->state = RDMA_READ_ADV;
1444
1445         /* Send STAG/TO/Len to client */
1446         if (cb->dma_mr)
1447                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1448         else
1449                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1450         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1451         if (ret) {
1452                 log(LOG_ERR, "post send error %d\n", ret);
1453                 return;
1454         }
1455
1456         /* Spin waiting for send completion */
1457         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1458         if (ret < 0) {
1459                 log(LOG_ERR, "poll error %d\n", ret);
1460                 return;
1461         }
1462         if (wc.status) {
1463                 log(LOG_ERR, "send completion error %d\n", wc.status);
1464                 return;
1465         }
1466
1467         /* Spin waiting for server's Start STAG/TO/Len */
1468         while (cb->state < RDMA_WRITE_ADV) {
1469                 krping_cq_event_handler(cb->cq, cb);
1470         }
1471
1472         wlat_test(cb);
1473 }
1474
1475 static void krping_bw_test_client(struct krping_cb *cb)
1476 {
1477         struct ib_send_wr *bad_wr;
1478         struct ib_wc wc;
1479         int ret;
1480
1481         cb->state = RDMA_READ_ADV;
1482
1483         /* Send STAG/TO/Len to client */
1484         if (cb->dma_mr)
1485                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1486         else
1487                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1488         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1489         if (ret) {
1490                 log(LOG_ERR, "post send error %d\n", ret);
1491                 return;
1492         }
1493
1494         /* Spin waiting for send completion */
1495         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1496         if (ret < 0) {
1497                 log(LOG_ERR, "poll error %d\n", ret);
1498                 return;
1499         }
1500         if (wc.status) {
1501                 log(LOG_ERR, "send completion error %d\n", wc.status);
1502                 return;
1503         }
1504
1505         /* Spin waiting for server's Start STAG/TO/Len */
1506         while (cb->state < RDMA_WRITE_ADV) {
1507                 krping_cq_event_handler(cb->cq, cb);
1508         }
1509
1510         bw_test(cb);
1511 }
1512
1513 static int krping_connect_client(struct krping_cb *cb)
1514 {
1515         struct rdma_conn_param conn_param;
1516         int ret;
1517
1518         memset(&conn_param, 0, sizeof conn_param);
1519         conn_param.responder_resources = 1;
1520         conn_param.initiator_depth = 1;
1521         conn_param.retry_count = 10;
1522
1523         ret = rdma_connect(cb->cm_id, &conn_param);
1524         if (ret) {
1525                 log(LOG_ERR, "rdma_connect error %d\n", ret);
1526                 return ret;
1527         }
1528
1529         krping_wait(cb, CONNECTED);
1530         if (cb->state == ERROR) {
1531                 log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1532                 return -1;
1533         }
1534
1535         DEBUG_LOG(PFX "rdma_connect successful\n");
1536         return 0;
1537 }
1538
1539 static int krping_bind_client(struct krping_cb *cb)
1540 {
1541         struct sockaddr_in sin;
1542         int ret;
1543
1544         memset(&sin, 0, sizeof(sin));
1545         sin.sin_len = sizeof sin;
1546         sin.sin_family = AF_INET;
1547         sin.sin_addr.s_addr = cb->addr.s_addr;
1548         sin.sin_port = cb->port;
1549
1550         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1551                                 2000);
1552         if (ret) {
1553                 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1554                 return ret;
1555         }
1556
1557         krping_wait(cb, ROUTE_RESOLVED);
1558         if (cb->state != ROUTE_RESOLVED) {
1559                 log(LOG_ERR,  
1560                        "addr/route resolution did not resolve: state %d\n",
1561                        cb->state);
1562                 return EINTR;
1563         }
1564
1565         DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1566         return 0;
1567 }
1568
1569 static void krping_run_client(struct krping_cb *cb)
1570 {
1571         struct ib_recv_wr *bad_wr;
1572         int ret;
1573
1574         ret = krping_bind_client(cb);
1575         if (ret)
1576                 return;
1577
1578         ret = krping_setup_qp(cb, cb->cm_id);
1579         if (ret) {
1580                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1581                 return;
1582         }
1583
1584         ret = krping_setup_buffers(cb);
1585         if (ret) {
1586                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1587                 goto err1;
1588         }
1589
1590         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1591         if (ret) {
1592                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1593                 goto err2;
1594         }
1595
1596         ret = krping_connect_client(cb);
1597         if (ret) {
1598                 log(LOG_ERR, "connect error %d\n", ret);
1599                 goto err2;
1600         }
1601
1602         if (cb->wlat)
1603                 krping_wlat_test_client(cb);
1604         else if (cb->rlat)
1605                 krping_rlat_test_client(cb);
1606         else if (cb->bw)
1607                 krping_bw_test_client(cb);
1608         else
1609                 krping_test_client(cb);
1610         rdma_disconnect(cb->cm_id);
1611 err2:
1612         krping_free_buffers(cb);
1613 err1:
1614         krping_free_qp(cb);
1615 }
1616
1617 int krping_doit(char *cmd)
1618 {
1619         struct krping_cb *cb;
1620         int op;
1621         int ret = 0;
1622         char *optarg;
1623         unsigned long optint;
1624         debug = 0;
1625
1626         cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1627         if (!cb)
1628                 return ENOMEM;
1629         bzero(cb, sizeof *cb);
1630
1631         mtx_lock(&krping_mutex);
1632         TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1633         mtx_unlock(&krping_mutex);
1634
1635         cb->server = -1;
1636         cb->state = IDLE;
1637         cb->size = 64;
1638         cb->txdepth = RPING_SQ_DEPTH;
1639         mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1640
1641         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1642                               &optint)) != 0) {
1643                 switch (op) {
1644                 case 'a':
1645                         cb->addr_str = optarg;
1646                         DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1647                         if (!inet_aton(optarg, &cb->addr)) {
1648                                 log(LOG_ERR, "bad addr string %s\n", optarg);
1649                                 ret = EINVAL;
1650                         }
1651                         break;
1652                 case 'D':
1653                         cb->use_dmamr = 1;
1654                         DEBUG_LOG(PFX "using dma mr\n");
1655                         break;
1656                 case 'p':
1657                         cb->port = htons(optint);
1658                         DEBUG_LOG(PFX "port %d\n", (int)optint);
1659                         break;
1660                 case 'P':
1661                         cb->poll = 1;
1662                         DEBUG_LOG("server\n");
1663                         break;
1664                 case 's':
1665                         cb->server = 1;
1666                         DEBUG_LOG(PFX "server\n");
1667                         break;
1668                 case 'c':
1669                         cb->server = 0;
1670                         DEBUG_LOG(PFX "client\n");
1671                         break;
1672                 case 'S':
1673                         cb->size = optint;
1674                         if ((cb->size < 1) ||
1675                             (cb->size > RPING_BUFSIZE)) {
1676                                 log(LOG_ERR, "Invalid size %d "
1677                                        "(valid range is 1 to %d)\n",
1678                                        cb->size, RPING_BUFSIZE);
1679                                 ret = EINVAL;
1680                         } else
1681                                 DEBUG_LOG(PFX "size %d\n", (int)optint);
1682                         break;
1683                 case 'C':
1684                         cb->count = optint;
1685                         if (cb->count < 0) {
1686                                 log(LOG_ERR, "Invalid count %d\n",
1687                                         cb->count);
1688                                 ret = EINVAL;
1689                         } else
1690                                 DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1691                         break;
1692                 case 'v':
1693                         cb->verbose++;
1694                         DEBUG_LOG(PFX "verbose\n");
1695                         break;
1696                 case 'V':
1697                         cb->validate++;
1698                         DEBUG_LOG(PFX "validate data\n");
1699                         break;
1700                 case 'L':
1701                         cb->rlat++;
1702                         break;
1703                 case 'l':
1704                         cb->wlat++;
1705                         break;
1706                 case 'B':
1707                         cb->bw++;
1708                         break;
1709                 case 't':
1710                         cb->txdepth = optint;
1711                         DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1712                         break;
1713                 case 'd':
1714                         debug++;
1715                         break;
1716                 default:
1717                         log(LOG_ERR, "unknown opt %s\n", optarg);
1718                         ret = EINVAL;
1719                         break;
1720                 }
1721         }
1722         if (ret)
1723                 goto out;
1724
1725         if (cb->server == -1) {
1726                 log(LOG_ERR, "must be either client or server\n");
1727                 ret = EINVAL;
1728                 goto out;
1729         }
1730         if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1731                 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1732                 ret = EINVAL;
1733                 goto out;
1734         }
1735
1736
1737         cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1738         if (IS_ERR(cb->cm_id)) {
1739                 ret = PTR_ERR(cb->cm_id);
1740                 log(LOG_ERR, "rdma_create_id error %d\n", ret);
1741                 goto out;
1742         }
1743         DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1744         if (cb->server)
1745                 krping_run_server(cb);
1746         else
1747                 krping_run_client(cb);
1748         DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1749         rdma_destroy_id(cb->cm_id);
1750 out:
1751         mtx_lock(&krping_mutex);
1752         TAILQ_REMOVE(&krping_cbs, cb, list);
1753         mtx_unlock(&krping_mutex);
1754         free(cb, M_DEVBUF);
1755         return ret;
1756 }
1757
1758 void krping_init(void)
1759 {
1760         mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1761         TAILQ_INIT(&krping_cbs);
1762 }