]> CyberLeo.Net >> Repos - FreeBSD/releng/7.2.git/blob - sys/contrib/rdma/krping/krping.c
Create releng/7.2 from stable/7 in preparation for 7.2-RELEASE.
[FreeBSD/releng/7.2.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/ctype.h>
38
39 #include <sys/param.h>
40 #include <sys/condvar.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/socket.h>
44 #include <sys/module.h>
45 #include <sys/endian.h>
46 #include <sys/limits.h>
47 #include <sys/proc.h>
48 #include <sys/signalvar.h>
49
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/rwlock.h>
53 #include <sys/queue.h>
54 #include <sys/taskqueue.h>
55 #include <sys/syslog.h>
56
57 #include <vm/vm.h>
58 #include <vm/pmap.h>
59
60 #include <contrib/rdma/rdma_cm.h>
61
62 #include "getopt.h"
63 #include "krping.h"
64
65 #define PFX "krping: "
66
/* Non-zero enables the DEBUG_LOG() trace output below. */
static int debug = 0;

/*
 * printf-style trace macro that only prints while `debug' is set.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement:
 * a bare "if (debug) printf" expansion would capture the `else' of any
 * surrounding if/else that uses DEBUG_LOG() as its unbraced body.
 */
#define DEBUG_LOG(...) do { if (debug) printf(__VA_ARGS__); } while (0)
69
/*
 * Option table for krping commands.  Each entry gives the long option
 * name, the kind of argument it takes (OPT_INT, OPT_STRING or
 * OPT_NOPARAM) and the single-character key associated with it.
 */
static const struct krping_option krping_opts[] = {
	{"count", OPT_INT, 'C'},
	{"size", OPT_INT, 'S'},
	{"addr", OPT_STRING, 'a'},
	{"port", OPT_INT, 'p'},
	{"verbose", OPT_NOPARAM, 'v'},
	{"validate", OPT_NOPARAM, 'V'},
	{"server", OPT_NOPARAM, 's'},
	{"client", OPT_NOPARAM, 'c'},
	{"dmamr", OPT_NOPARAM, 'D'},
	{"debug", OPT_NOPARAM, 'd'},
	{"wlat", OPT_NOPARAM, 'l'},
	{"rlat", OPT_NOPARAM, 'L'},
	{"bw", OPT_NOPARAM, 'B'},
	{"tx-depth", OPT_INT, 't'},
	{"poll", OPT_NOPARAM, 'P'},
	/*
	 * All-zero entry; presumably the end-of-table sentinel expected by
	 * the parser declared in getopt.h -- TODO confirm.
	 */
	{NULL, 0, 0}
};
88
/*
 * Module-wide mutex.  NOTE(review): presumably serializes access to
 * krping_cbs below; no user is visible in this part of the file, so
 * confirm against the code that takes it.
 */
struct mtx krping_mutex;

/*
 * List of running krping threads.
 */
struct krping_cb_list krping_cbs;
95
96 /*
97  * krping "ping/pong" loop:
98  *      client sends source rkey/addr/len
99  *      server receives source rkey/addr/len
100  *      server rdma reads "ping" data from source
101  *      server sends "go ahead" on rdma read completion
102  *      client sends sink rkey/addr/len
103  *      server receives sink rkey/addr/len
104  *      server rdma writes "pong" data to sink
105  *      server sends "go ahead" on rdma write completion
106  *      <repeat loop>
107  */
108
/*
 * Default max buffer size for IO...
 *
 * The expression is parenthesized so the macro expands safely inside
 * larger expressions (e.g. "x / RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 32
114
115
/*
 * Parse a textual IPv4 address into network byte order.
 *
 * lifted from netinet/libalias/alias_proxy.c
 *
 * Accepted forms are "a" (32 bits), "a.b" (8.24 bits), "a.b.c"
 * (8.8.16 bits) and "a.b.c.d" (8.8.8.8 bits).  Each number is parsed by
 * strtoul() with base 0, so decimal, octal ("0...") and hexadecimal
 * ("0x...") notations are all recognized.  Parsing stops successfully at
 * the end of the string or at the first whitespace character following a
 * number.
 *
 * cp:   NUL-terminated string to parse.
 * addr: receives the address in network byte order on success; may be
 *       NULL to only validate the string.  Left untouched on failure.
 *
 * Returns 1 if the string is a valid address, 0 otherwise.
 */
static int
inet_aton(const char *cp, struct in_addr *addr)
{
	unsigned long parts[4];
	in_addr_t val;
	const char *c;
	char *endptr;
	int gotend, n;

	c = cp;
	n = 0;
	val = 0;
	/*
	 * Run through the string, grabbing numbers until
	 * the end of the string, or some error
	 */
	gotend = 0;
	while (!gotend) {
		unsigned long l;

		l = strtoul(c, &endptr, 0);

		/*
		 * If no digits were consumed, endptr will equal c; this
		 * rejects the empty string as well as things like '.12'
		 * or '1..2'.
		 */
		if (endptr == c)
			return (0);

		/*
		 * Reject overflow.  ULONG_MAX is strtoul()'s overflow
		 * indication; the explicit 32-bit bound matters where
		 * unsigned long is wider than in_addr_t, since the value
		 * would otherwise be silently truncated below.
		 */
		if (l == ULONG_MAX || l > 0xffffffffUL)
			return (0);

		val = (in_addr_t)l;
		parts[n] = l;
		c = endptr;

		/* Check the next character past the previous number's end */
		switch (*c) {
		case '.' :
			/* Make sure we only do 3 dots .. */
			if (n == 3)	/* Whoops. Quit. */
				return (0);
			n++;
			c++;
			break;

		case '\0':
			gotend = 1;
			break;

		default:
			if (isspace((unsigned char)*c)) {
				gotend = 1;
				break;
			} else
				return (0);	/* Invalid character, so fail */
		}

	}

	/*
	 * Concoct the address according to
	 * the number of parts specified.  `val' holds the last number
	 * parsed, parts[0..n-1] the ones before it.
	 */

	switch (n) {
	case 0:				/* a -- 32 bits */
		/*
		 * Nothing is necessary here.  The range check was
		 * already done right after strtoul().
		 */
		break;
	case 1:				/* a.b -- 8.24 bits */
		if (val > 0xffffff || parts[0] > 0xff)
			return (0);
		val |= parts[0] << 24;
		break;

	case 2:				/* a.b.c -- 8.8.16 bits */
		if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff)
			return (0);
		val |= (parts[0] << 24) | (parts[1] << 16);
		break;

	case 3:				/* a.b.c.d -- 8.8.8.8 bits */
		if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff ||
		    parts[2] > 0xff)
			return (0);
		val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
		break;
	}

	if (addr != NULL)
		addr->s_addr = htonl(val);
	return (1);
}
216
217
218 static void krping_wait(struct krping_cb *cb, int state)
219 {
220         int rc;
221         mtx_lock(&cb->lock);
222         while (cb->state < state) {
223                 rc = msleep(cb, &cb->lock, 0, "krping", 0);
224                 if (rc && rc != ERESTART) {
225                         cb->state = ERROR;
226                         break;
227                 }
228         }
229         mtx_unlock(&cb->lock);
230 }
231
232 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
233                                    struct rdma_cm_event *event)
234 {
235         int ret;
236         struct krping_cb *cb = cma_id->context;
237
238         DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
239                   (cma_id == cb->cm_id) ? "parent" : "child");
240
241         mtx_lock(&cb->lock);
242         switch (event->event) {
243         case RDMA_CM_EVENT_ADDR_RESOLVED:
244                 cb->state = ADDR_RESOLVED;
245                 ret = rdma_resolve_route(cma_id, 2000);
246                 if (ret) {
247                         log(LOG_ERR, "rdma_resolve_route error %d\n", 
248                                ret);
249                         wakeup(cb);
250                 }
251                 break;
252
253         case RDMA_CM_EVENT_ROUTE_RESOLVED:
254                 cb->state = ROUTE_RESOLVED;
255                 wakeup(cb);
256                 break;
257
258         case RDMA_CM_EVENT_CONNECT_REQUEST:
259                 cb->state = CONNECT_REQUEST;
260                 cb->child_cm_id = cma_id;
261                 DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
262                 wakeup(cb);
263                 break;
264
265         case RDMA_CM_EVENT_ESTABLISHED:
266                 DEBUG_LOG(PFX "ESTABLISHED\n");
267                 if (!cb->server) {
268                         cb->state = CONNECTED;
269                         wakeup(cb);
270                 }
271                 break;
272
273         case RDMA_CM_EVENT_ADDR_ERROR:
274         case RDMA_CM_EVENT_ROUTE_ERROR:
275         case RDMA_CM_EVENT_CONNECT_ERROR:
276         case RDMA_CM_EVENT_UNREACHABLE:
277         case RDMA_CM_EVENT_REJECTED:
278                 log(LOG_ERR, "cma event %d, error %d\n", event->event,
279                        event->status);
280                 cb->state = ERROR;
281                 wakeup(cb);
282                 break;
283
284         case RDMA_CM_EVENT_DISCONNECTED:
285                 DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
286                 cb->state = ERROR;
287                 wakeup(cb);
288                 break;
289
290         case RDMA_CM_EVENT_DEVICE_REMOVAL:
291                 DEBUG_LOG(PFX "cma detected device removal!!!!\n");
292                 break;
293
294         default:
295                 log(LOG_ERR, "oof bad type!\n");
296                 wakeup(cb);
297                 break;
298         }
299         mtx_unlock(&cb->lock);
300         return 0;
301 }
302
303 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
304 {
305         if (wc->byte_len != sizeof(cb->recv_buf)) {
306                 log(LOG_ERR, "Received bogus data, size %d\n", 
307                        wc->byte_len);
308                 return -1;
309         }
310
311         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
312         cb->remote_addr = ntohll(cb->recv_buf.buf);
313         cb->remote_len  = ntohl(cb->recv_buf.size);
314         DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
315                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
316                   cb->remote_len);
317
318         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
319                 cb->state = RDMA_READ_ADV;
320         else
321                 cb->state = RDMA_WRITE_ADV;
322
323         return 0;
324 }
325
326 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
327 {
328         if (wc->byte_len != sizeof(cb->recv_buf)) {
329                 log(LOG_ERR, "Received bogus data, size %d\n", 
330                        wc->byte_len);
331                 return -1;
332         }
333
334         if (cb->state == RDMA_READ_ADV)
335                 cb->state = RDMA_WRITE_ADV;
336         else
337                 cb->state = RDMA_WRITE_COMPLETE;
338
339         return 0;
340 }
341
342 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
343 {
344         struct krping_cb *cb = ctx;
345         struct ib_wc wc;
346         struct ib_recv_wr *bad_wr;
347         int ret;
348
349         mtx_lock(&cb->lock);
350         KASSERT(cb->cq == cq, ("bad condition"));
351         if (cb->state == ERROR) {
352                 log(LOG_ERR,  "cq completion in ERROR state\n");
353                 mtx_unlock(&cb->lock);
354                 return;
355         }
356         if (!cb->wlat && !cb->rlat && !cb->bw)
357                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
358         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
359                 if (wc.status) {
360                         if (wc.status != IB_WC_WR_FLUSH_ERR)
361                                 log(LOG_ERR, "cq completion failed status %d\n",
362                                         wc.status);
363                         goto error;
364                 }
365
366                 switch (wc.opcode) {
367                 case IB_WC_SEND:
368                         DEBUG_LOG(PFX "send completion\n");
369                         cb->stats.send_bytes += cb->send_sgl.length;
370                         cb->stats.send_msgs++;
371                         break;
372
373                 case IB_WC_RDMA_WRITE:
374                         DEBUG_LOG(PFX "rdma write completion\n");
375                         cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
376                         cb->stats.write_msgs++;
377                         cb->state = RDMA_WRITE_COMPLETE;
378                         wakeup(cb);
379                         break;
380
381                 case IB_WC_RDMA_READ:
382                         DEBUG_LOG(PFX "rdma read completion\n");
383                         cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
384                         cb->stats.read_msgs++;
385                         cb->state = RDMA_READ_COMPLETE;
386                         wakeup(cb);
387                         break;
388
389                 case IB_WC_RECV:
390                         DEBUG_LOG(PFX "recv completion\n");
391                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
392                         cb->stats.recv_msgs++;
393                         if (cb->wlat || cb->rlat || cb->bw)
394                                 ret = server_recv(cb, &wc);
395                         else
396                                 ret = cb->server ? server_recv(cb, &wc) :
397                                            client_recv(cb, &wc);
398                         if (ret) {
399                                 log(LOG_ERR, "recv wc error: %d\n", ret);
400                                 goto error;
401                         }
402
403                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
404                         if (ret) {
405                                 log(LOG_ERR, "post recv error: %d\n", 
406                                        ret);
407                                 goto error;
408                         }
409                         wakeup(cb);
410                         break;
411
412                 default:
413                         log(LOG_ERR, "unknown!!!!! completion\n");
414                         goto error;
415                 }
416         }
417         if (ret) {
418                 log(LOG_ERR, "poll error %d\n", ret);
419                 goto error;
420         }
421         mtx_unlock(&cb->lock);
422         return;
423 error:
424         cb->state = ERROR;
425         wakeup(cb);
426         mtx_unlock(&cb->lock);
427 }
428
429 static int krping_accept(struct krping_cb *cb)
430 {
431         struct rdma_conn_param conn_param;
432         int ret;
433
434         DEBUG_LOG(PFX "accepting client connection request\n");
435
436         memset(&conn_param, 0, sizeof conn_param);
437         conn_param.responder_resources = 1;
438         conn_param.initiator_depth = 1;
439
440         ret = rdma_accept(cb->child_cm_id, &conn_param);
441         if (ret) {
442                 log(LOG_ERR, "rdma_accept error: %d\n", ret);
443                 return ret;
444         }
445
446         if (!cb->wlat && !cb->rlat && !cb->bw) {
447                 krping_wait(cb, CONNECTED);
448                 if (cb->state == ERROR) {
449                         log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
450                         return -1;
451                 }
452         }
453         return 0;
454 }
455
456 static void krping_setup_wr(struct krping_cb *cb)
457 {
458         /* XXX X86 only here... not mapping for dma! */
459         cb->recv_sgl.addr = vtophys(&cb->recv_buf);
460         cb->recv_sgl.length = sizeof cb->recv_buf;
461         if (cb->use_dmamr)
462                 cb->recv_sgl.lkey = cb->dma_mr->lkey;
463         else
464                 cb->recv_sgl.lkey = cb->recv_mr->lkey;
465         cb->rq_wr.sg_list = &cb->recv_sgl;
466         cb->rq_wr.num_sge = 1;
467
468         cb->send_sgl.addr = vtophys(&cb->send_buf);
469         cb->send_sgl.length = sizeof cb->send_buf;
470         if (cb->use_dmamr)
471                 cb->send_sgl.lkey = cb->dma_mr->lkey;
472         else
473                 cb->send_sgl.lkey = cb->send_mr->lkey;
474
475         cb->sq_wr.opcode = IB_WR_SEND;
476         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
477         cb->sq_wr.sg_list = &cb->send_sgl;
478         cb->sq_wr.num_sge = 1;
479
480         cb->rdma_addr = vtophys(cb->rdma_buf);
481         cb->rdma_sgl.addr = cb->rdma_addr;
482         if (cb->use_dmamr)
483                 cb->rdma_sgl.lkey = cb->dma_mr->lkey;
484         else
485                 cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
486         cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
487         cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
488         cb->rdma_sq_wr.num_sge = 1;
489
490         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
491                 cb->start_addr = vtophys(cb->start_buf);
492         }
493 }
494
495 static int krping_setup_buffers(struct krping_cb *cb)
496 {
497         int ret;
498         struct ib_phys_buf buf;
499         u64 iovbase;
500
501         DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
502
503         if (cb->use_dmamr) {
504                 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
505                                            IB_ACCESS_REMOTE_READ|
506                                            IB_ACCESS_REMOTE_WRITE);
507                 if (IS_ERR(cb->dma_mr)) {
508                         log(LOG_ERR, "reg_dmamr failed\n");
509                         return PTR_ERR(cb->dma_mr);
510                 }
511         } else {
512
513                 buf.addr = vtophys(&cb->recv_buf);
514                 buf.size = sizeof cb->recv_buf;
515                 iovbase = vtophys(&cb->recv_buf);
516                 cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
517                                              IB_ACCESS_LOCAL_WRITE, 
518                                              &iovbase);
519
520                 if (IS_ERR(cb->recv_mr)) {
521                         log(LOG_ERR, "recv_buf reg_mr failed\n");
522                         return PTR_ERR(cb->recv_mr);
523                 }
524
525                 buf.addr = vtophys(&cb->send_buf);
526                 buf.size = sizeof cb->send_buf;
527                 iovbase = vtophys(&cb->send_buf);
528                 cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
529                                              0, &iovbase);
530
531                 if (IS_ERR(cb->send_mr)) {
532                         log(LOG_ERR, "send_buf reg_mr failed\n");
533                         ib_dereg_mr(cb->recv_mr);
534                         return PTR_ERR(cb->send_mr);
535                 }
536         }
537
538         cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
539                 PAGE_SIZE, 0);
540
541         if (!cb->rdma_buf) {
542                 log(LOG_ERR, "rdma_buf malloc failed\n");
543                 ret = ENOMEM;
544                 goto err1;
545         }
546         if (!cb->use_dmamr) {
547
548                 buf.addr = vtophys(cb->rdma_buf);
549                 buf.size = cb->size;
550                 iovbase = vtophys(cb->rdma_buf);
551                 cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
552                                              IB_ACCESS_REMOTE_READ| 
553                                              IB_ACCESS_REMOTE_WRITE, 
554                                              &iovbase);
555
556                 if (IS_ERR(cb->rdma_mr)) {
557                         log(LOG_ERR, "rdma_buf reg_mr failed\n");
558                         ret = PTR_ERR(cb->rdma_mr);
559                         goto err2;
560                 }
561         }
562
563         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
564                 cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
565                         0, -1UL, PAGE_SIZE, 0);
566                 if (!cb->start_buf) {
567                         log(LOG_ERR, "start_buf malloc failed\n");
568                         ret = ENOMEM;
569                         goto err2;
570                 }
571                 if (!cb->use_dmamr) {
572                         unsigned flags = IB_ACCESS_REMOTE_READ;
573
574                         if (cb->wlat || cb->rlat || cb->bw) 
575                                 flags |= IB_ACCESS_REMOTE_WRITE;
576                         buf.addr = vtophys(cb->start_buf);
577                         buf.size = cb->size;
578                         iovbase = vtophys(cb->start_buf);
579                         cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
580                                              flags,
581                                              &iovbase);
582
583                         if (IS_ERR(cb->start_mr)) {
584                                 log(LOG_ERR, "start_buf reg_mr failed\n");
585                                 ret = PTR_ERR(cb->start_mr);
586                                 goto err3;
587                         }
588                 }
589         }
590
591         krping_setup_wr(cb);
592         DEBUG_LOG(PFX "allocated & registered buffers...\n");
593         return 0;
594 err3:
595         contigfree(cb->start_buf, cb->size, M_DEVBUF);
596
597         if (!cb->use_dmamr)
598                 ib_dereg_mr(cb->rdma_mr);
599 err2:
600         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
601 err1:
602         if (cb->use_dmamr)
603                 ib_dereg_mr(cb->dma_mr);
604         else {
605                 ib_dereg_mr(cb->recv_mr);
606                 ib_dereg_mr(cb->send_mr);
607         }
608         return ret;
609 }
610
611 static void krping_free_buffers(struct krping_cb *cb)
612 {
613         DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
614         
615 #if 0
616         dma_unmap_single(cb->pd->device->dma_device,
617                          pci_unmap_addr(cb, recv_mapping),
618                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
619         dma_unmap_single(cb->pd->device->dma_device,
620                          pci_unmap_addr(cb, send_mapping),
621                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
622         dma_unmap_single(cb->pd->device->dma_device,
623                          pci_unmap_addr(cb, rdma_mapping),
624                          cb->size, DMA_BIDIRECTIONAL);
625 #endif
626         contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
627         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
628 #if 0
629                 dma_unmap_single(cb->pd->device->dma_device,
630                          pci_unmap_addr(cb, start_mapping),
631                          cb->size, DMA_BIDIRECTIONAL);
632 #endif
633                 contigfree(cb->start_buf, cb->size, M_DEVBUF);
634         }
635         if (cb->use_dmamr)
636                 ib_dereg_mr(cb->dma_mr);
637         else {
638                 ib_dereg_mr(cb->send_mr);
639                 ib_dereg_mr(cb->recv_mr);
640                 ib_dereg_mr(cb->rdma_mr);
641                 if (!cb->server)
642                         ib_dereg_mr(cb->start_mr);
643         }
644 }
645
646 static int krping_create_qp(struct krping_cb *cb)
647 {
648         struct ib_qp_init_attr init_attr;
649         int ret;
650
651         memset(&init_attr, 0, sizeof(init_attr));
652         init_attr.cap.max_send_wr = cb->txdepth;
653         init_attr.cap.max_recv_wr = 2;
654         init_attr.cap.max_recv_sge = 1;
655         init_attr.cap.max_send_sge = 1;
656         init_attr.qp_type = IB_QPT_RC;
657         init_attr.send_cq = cb->cq;
658         init_attr.recv_cq = cb->cq;
659
660         if (cb->server) {
661                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
662                 if (!ret)
663                         cb->qp = cb->child_cm_id->qp;
664         } else {
665                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
666                 if (!ret)
667                         cb->qp = cb->cm_id->qp;
668         }
669
670         return ret;
671 }
672
/*
 * Tear down what krping_setup_qp() created: the queue pair first, then
 * the completion queue it was attached to, and finally the protection
 * domain.
 */
static void krping_free_qp(struct krping_cb *cb)
{
	ib_destroy_qp(cb->qp);
	ib_destroy_cq(cb->cq);
	ib_dealloc_pd(cb->pd);
}
679
680 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
681 {
682         int ret;
683         cb->pd = ib_alloc_pd(cm_id->device);
684         if (IS_ERR(cb->pd)) {
685                 log(LOG_ERR, "ib_alloc_pd failed\n");
686                 return PTR_ERR(cb->pd);
687         }
688         DEBUG_LOG(PFX "created pd %p\n", cb->pd);
689
690         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
691                               cb, cb->txdepth * 2, 0);
692         if (IS_ERR(cb->cq)) {
693                 log(LOG_ERR, "ib_create_cq failed\n");
694                 ret = PTR_ERR(cb->cq);
695                 goto err1;
696         }
697         DEBUG_LOG(PFX "created cq %p\n", cb->cq);
698
699         if (!cb->wlat && !cb->rlat && !cb->bw) {
700                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
701                 if (ret) {
702                         log(LOG_ERR, "ib_create_cq failed\n");
703                         goto err2;
704                 }
705         }
706
707         ret = krping_create_qp(cb);
708         if (ret) {
709                 log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
710                 goto err2;
711         }
712         DEBUG_LOG(PFX "created qp %p\n", cb->qp);
713         return 0;
714 err2:
715         ib_destroy_cq(cb->cq);
716 err1:
717         ib_dealloc_pd(cb->pd);
718         return ret;
719 }
720
721 static void krping_format_send(struct krping_cb *cb, u64 buf, 
722                                struct ib_mr *mr)
723 {
724         struct krping_rdma_info *info = &cb->send_buf;
725
726         info->buf = htonll(buf);
727         info->rkey = htonl(mr->rkey);
728         info->size = htonl(cb->size);
729
730         DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
731                   (unsigned long long)buf, mr->rkey, cb->size);
732 }
733
734 static void krping_test_server(struct krping_cb *cb)
735 {
736         struct ib_send_wr *bad_wr;
737         int ret;
738
739         while (1) {
740                 /* Wait for client's Start STAG/TO/Len */
741                 krping_wait(cb, RDMA_READ_ADV);
742                 if (cb->state != RDMA_READ_ADV) {
743                         DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
744                                 cb->state);
745                         break;
746                 }
747
748                 DEBUG_LOG(PFX "server received sink adv\n");
749
750                 /* Issue RDMA Read. */
751                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
752                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
753                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
754                 cb->rdma_sq_wr.sg_list->length = cb->remote_len;
755
756                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
757                 if (ret) {
758                         log(LOG_ERR, "post send error %d\n", ret);
759                         break;
760                 }
761                 DEBUG_LOG(PFX "server posted rdma read req \n");
762
763                 /* Wait for read completion */
764                 krping_wait(cb, RDMA_READ_COMPLETE);
765                 if (cb->state != RDMA_READ_COMPLETE) {
766                         log(LOG_ERR,  
767                                "wait for RDMA_READ_COMPLETE state %d\n",
768                                cb->state);
769                         break;
770                 }
771                 DEBUG_LOG(PFX "server received read complete\n");
772
773                 /* Display data in recv buf */
774                 if (cb->verbose)
775                         DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
776
777                 /* Tell client to continue */
778                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
779                 if (ret) {
780                         log(LOG_ERR, "post send error %d\n", ret);
781                         break;
782                 }
783                 DEBUG_LOG(PFX "server posted go ahead\n");
784
785                 /* Wait for client's RDMA STAG/TO/Len */
786                 krping_wait(cb, RDMA_WRITE_ADV);
787                 if (cb->state != RDMA_WRITE_ADV) {
788                         log(LOG_ERR,  
789                                "wait for RDMA_WRITE_ADV state %d\n",
790                                cb->state);
791                         break;
792                 }
793                 DEBUG_LOG(PFX "server received sink adv\n");
794
795                 /* RDMA Write echo data */
796                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
797                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
798                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
799                 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
800                 DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
801                           cb->rdma_sq_wr.sg_list->lkey,
802                           (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
803                           cb->rdma_sq_wr.sg_list->length);
804
805                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
806                 if (ret) {
807                         log(LOG_ERR, "post send error %d\n", ret);
808                         break;
809                 }
810
811                 /* Wait for completion */
812                 krping_wait(cb, RDMA_WRITE_COMPLETE);
813                 if (cb->state != RDMA_WRITE_COMPLETE) {
814                         log(LOG_ERR,  
815                                "wait for RDMA_WRITE_COMPLETE state %d\n",
816                                cb->state);
817                         break;
818                 }
819                 DEBUG_LOG(PFX "server rdma write complete \n");
820
821                 cb->state = CONNECTED;
822
823                 /* Tell client to begin again */
824                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
825                 if (ret) {
826                         log(LOG_ERR, "post send error %d\n", ret);
827                         break;
828                 }
829                 DEBUG_LOG(PFX "server posted go ahead\n");
830         }
831 }
832
833 static void rlat_test(struct krping_cb *cb)
834 {
835         int scnt;
836         int iters = cb->count;
837         struct timeval start_tv, stop_tv;
838         int ret;
839         struct ib_wc wc;
840         struct ib_send_wr *bad_wr;
841         int ne;
842
843         scnt = 0;
844         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
845         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
846         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
847         cb->rdma_sq_wr.sg_list->length = cb->size;
848
849         microtime(&start_tv);
850         if (!cb->poll) {
851                 cb->state = RDMA_READ_ADV;
852                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
853         }
854         while (scnt < iters) {
855
856                 cb->state = RDMA_READ_ADV;
857                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
858                 if (ret) {
859                         log(LOG_ERR,  
860                                 "Couldn't post send: ret=%d scnt %d\n",
861                                 ret, scnt);
862                         return;
863                 }
864
865                 do {
866                         if (!cb->poll) {
867                                 krping_wait(cb, RDMA_READ_COMPLETE);
868                                 if (cb->state == RDMA_READ_COMPLETE) {
869                                         ne = 1;
870                                         ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
871                                 } else {
872                                         ne = -1;
873                                 }
874                         } else
875                                 ne = ib_poll_cq(cb->cq, 1, &wc);
876                         if (cb->state == ERROR) {
877                                 log(LOG_ERR, 
878                                        "state == ERROR...bailing scnt %d\n", scnt);
879                                 return;
880                         }
881                 } while (ne == 0);
882
883                 if (ne < 0) {
884                         log(LOG_ERR, "poll CQ failed %d\n", ne);
885                         return;
886                 }
887                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
888                         log(LOG_ERR, "Completion wth error at %s:\n",
889                                 cb->server ? "server" : "client");
890                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
891                                 wc.status, (int) wc.wr_id);
892                         return;
893                 }
894                 ++scnt;
895         }
896         microtime(&stop_tv);
897
898         if (stop_tv.tv_usec < start_tv.tv_usec) {
899                 stop_tv.tv_usec += 1000000;
900                 stop_tv.tv_sec  -= 1;
901         }
902
903         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
904                 stop_tv.tv_sec - start_tv.tv_sec, 
905                 stop_tv.tv_usec - start_tv.tv_usec,
906                 scnt, cb->size);
907 }
908
909 static int alloc_cycle_mem(int cycle_iters,
910                                 cycles_t **post_cycles_start,
911                                 cycles_t **post_cycles_stop,
912                                 cycles_t **poll_cycles_start,
913                                 cycles_t **poll_cycles_stop,
914                                 cycles_t **last_poll_cycles_start)
915 {
916         *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
917         if (!*post_cycles_start) {
918                 goto fail1;
919         }
920         *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
921         if (!*post_cycles_stop) {
922                 goto fail2;
923         }
924         *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
925         if (!*poll_cycles_start) {
926                 goto fail3;
927         }
928         *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
929         if (!*poll_cycles_stop) {
930                 goto fail4;
931         }
932         *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
933         if (!*last_poll_cycles_start) {
934                 goto fail5;
935         }
936         return 0;
937 fail5:
938         free(*poll_cycles_stop, M_DEVBUF);
939 fail4:
940         free(*poll_cycles_start, M_DEVBUF);
941 fail3:
942         free(*post_cycles_stop, M_DEVBUF);
943 fail2:
944         free(*post_cycles_start, M_DEVBUF);
945 fail1:
946         log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
947         return ENOMEM;
948 }
949
950 static void free_cycle_mem(cycles_t *post_cycles_start,
951                                 cycles_t *post_cycles_stop,
952                                 cycles_t *poll_cycles_start,
953                                 cycles_t *poll_cycles_stop,
954                                 cycles_t *last_poll_cycles_start)
955 {
956         free(last_poll_cycles_start, M_DEVBUF);
957         free(poll_cycles_stop, M_DEVBUF);
958         free(poll_cycles_start, M_DEVBUF);
959         free(post_cycles_stop, M_DEVBUF);
960         free(post_cycles_start, M_DEVBUF);
961 }
962
963 static void wlat_test(struct krping_cb *cb)
964 {
965         int ccnt, scnt, rcnt;
966         int iters=cb->count;
967         volatile char *poll_buf = (char *) cb->start_buf;
968         char *buf = (char *)cb->rdma_buf;
969         ccnt = 0;
970         scnt = 0;
971         rcnt = 0;
972         struct timeval start_tv, stop_tv;
973         cycles_t *post_cycles_start, *post_cycles_stop;
974         cycles_t *poll_cycles_start, *poll_cycles_stop;
975         cycles_t *last_poll_cycles_start;
976         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
977         int i;
978         int cycle_iters = 1000;
979         int err;
980
981         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
982                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
983                           
984         if (err) {
985                 log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
986                 return;
987         }
988
989         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
990         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
991         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
992         cb->rdma_sq_wr.sg_list->length = cb->size;
993
994         if (cycle_iters > iters)
995                 cycle_iters = iters;
996         microtime(&start_tv);
997         while (scnt < iters || ccnt < iters || rcnt < iters) {
998
999                 /* Wait till buffer changes. */
1000                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1001                         ++rcnt;
1002                         while (*poll_buf != (char)rcnt) {
1003                                 if (cb->state == ERROR) {
1004                                         log(LOG_ERR, "state = ERROR, bailing\n");
1005                                         return;
1006                                 }
1007                         }
1008                 }
1009
1010                 if (scnt < iters) {
1011                         struct ib_send_wr *bad_wr;
1012
1013                         *buf = (char)scnt+1;
1014                         if (scnt < cycle_iters)
1015                                 post_cycles_start[scnt] = get_cycles();
1016                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1017                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1018                                         scnt);
1019                                 return;
1020                         }
1021                         if (scnt < cycle_iters)
1022                                 post_cycles_stop[scnt] = get_cycles();
1023                         scnt++;
1024                 }
1025
1026                 if (ccnt < iters) {
1027                         struct ib_wc wc;
1028                         int ne;
1029
1030                         if (ccnt < cycle_iters)
1031                                 poll_cycles_start[ccnt] = get_cycles();
1032                         do {
1033                                 if (ccnt < cycle_iters)
1034                                         last_poll_cycles_start[ccnt] = get_cycles();
1035                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1036                         } while (ne == 0);
1037                         if (ccnt < cycle_iters)
1038                                 poll_cycles_stop[ccnt] = get_cycles();
1039                         ++ccnt;
1040
1041                         if (ne < 0) {
1042                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
1043                                 return;
1044                         }
1045                         if (wc.status != IB_WC_SUCCESS) {
1046                                 log(LOG_ERR, "Completion wth error at %s:\n",
1047                                         cb->server ? "server" : "client");
1048                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
1049                                         wc.status, (int) wc.wr_id);
1050                                 log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
1051                                         scnt, rcnt, ccnt);
1052                                 return;
1053                         }
1054                 }
1055         }
1056         microtime(&stop_tv);
1057
1058         if (stop_tv.tv_usec < start_tv.tv_usec) {
1059                 stop_tv.tv_usec += 1000000;
1060                 stop_tv.tv_sec  -= 1;
1061         }
1062
1063         for (i=0; i < cycle_iters; i++) {
1064                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1065                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1066                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1067         }
1068
1069         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1070                 stop_tv.tv_sec - start_tv.tv_sec, 
1071                 stop_tv.tv_usec - start_tv.tv_usec,
1072                 scnt, cb->size, cycle_iters, 
1073                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1074                 (unsigned long long)sum_last_poll);
1075
1076         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
1077                         poll_cycles_stop, last_poll_cycles_start);
1078 }
1079
1080 static void bw_test(struct krping_cb *cb)
1081 {
1082         int ccnt, scnt, rcnt;
1083         int iters=cb->count;
1084         ccnt = 0;
1085         scnt = 0;
1086         rcnt = 0;
1087         struct timeval start_tv, stop_tv;
1088         cycles_t *post_cycles_start, *post_cycles_stop;
1089         cycles_t *poll_cycles_start, *poll_cycles_stop;
1090         cycles_t *last_poll_cycles_start;
1091         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1092         int i;
1093         int cycle_iters = 1000;
1094         int err;
1095
1096         err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
1097                                 &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
1098                           
1099         if (err) {
1100                 log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
1101                 return;
1102         }
1103
1104         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1105         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1106         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1107         cb->rdma_sq_wr.sg_list->length = cb->size;
1108
1109         if (cycle_iters > iters)
1110                 cycle_iters = iters;
1111         microtime(&start_tv);
1112         while (scnt < iters || ccnt < iters) {
1113
1114                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1115                         struct ib_send_wr *bad_wr;
1116
1117                         if (scnt < cycle_iters)
1118                                 post_cycles_start[scnt] = get_cycles();
1119                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1120                                 log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1121                                         scnt);
1122                                 return;
1123                         }
1124                         if (scnt < cycle_iters)
1125                                 post_cycles_stop[scnt] = get_cycles();
1126                         ++scnt;
1127                 }
1128
1129                 if (ccnt < iters) {
1130                         int ne;
1131                         struct ib_wc wc;
1132
1133                         if (ccnt < cycle_iters)
1134                                 poll_cycles_start[ccnt] = get_cycles();
1135                         do {
1136                                 if (ccnt < cycle_iters)
1137                                         last_poll_cycles_start[ccnt] = get_cycles();
1138                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1139                         } while (ne == 0);
1140                         if (ccnt < cycle_iters)
1141                                 poll_cycles_stop[ccnt] = get_cycles();
1142                         ccnt += 1;
1143
1144                         if (ne < 0) {
1145                                 log(LOG_ERR, "poll CQ failed %d\n", ne);
1146                                 return;
1147                         }
1148                         if (wc.status != IB_WC_SUCCESS) {
1149                                 log(LOG_ERR, "Completion wth error at %s:\n",
1150                                         cb->server ? "server" : "client");
1151                                 log(LOG_ERR, "Failed status %d: wr_id %d\n",
1152                                         wc.status, (int) wc.wr_id);
1153                                 return;
1154                         }
1155                 }
1156         }
1157         microtime(&stop_tv);
1158
1159         if (stop_tv.tv_usec < start_tv.tv_usec) {
1160                 stop_tv.tv_usec += 1000000;
1161                 stop_tv.tv_sec  -= 1;
1162         }
1163
1164         for (i=0; i < cycle_iters; i++) {
1165                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1166                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1167                 sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1168         }
1169
1170         log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1171                 stop_tv.tv_sec - start_tv.tv_sec, 
1172                 stop_tv.tv_usec - start_tv.tv_usec,
1173                 scnt, cb->size, cycle_iters, 
1174                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1175                 (unsigned long long)sum_last_poll);
1176
1177         free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, 
1178                         poll_cycles_stop, last_poll_cycles_start);
1179 }
1180
1181 static void krping_rlat_test_server(struct krping_cb *cb)
1182 {
1183         struct ib_send_wr *bad_wr;
1184         struct ib_wc wc;
1185         int ret;
1186
1187         /* Spin waiting for client's Start STAG/TO/Len */
1188         while (cb->state < RDMA_READ_ADV) {
1189                 krping_cq_event_handler(cb->cq, cb);
1190         }
1191
1192         /* Send STAG/TO/Len to client */
1193         if (cb->dma_mr)
1194                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1195         else
1196                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1197         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1198         if (ret) {
1199                 log(LOG_ERR, "post send error %d\n", ret);
1200                 return;
1201         }
1202
1203         /* Spin waiting for send completion */
1204         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1205         if (ret < 0) {
1206                 log(LOG_ERR, "poll error %d\n", ret);
1207                 return;
1208         }
1209         if (wc.status) {
1210                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1211                 return;
1212         }
1213
1214         krping_wait(cb, ERROR);
1215 }
1216
1217 static void krping_wlat_test_server(struct krping_cb *cb)
1218 {
1219         struct ib_send_wr *bad_wr;
1220         struct ib_wc wc;
1221         int ret;
1222
1223         /* Spin waiting for client's Start STAG/TO/Len */
1224         while (cb->state < RDMA_READ_ADV) {
1225                 krping_cq_event_handler(cb->cq, cb);
1226         }
1227
1228         /* Send STAG/TO/Len to client */
1229         if (cb->dma_mr)
1230                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1231         else
1232                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1233         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1234         if (ret) {
1235                 log(LOG_ERR, "post send error %d\n", ret);
1236                 return;
1237         }
1238
1239         /* Spin waiting for send completion */
1240         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1241         if (ret < 0) {
1242                 log(LOG_ERR, "poll error %d\n", ret);
1243                 return;
1244         }
1245         if (wc.status) {
1246                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1247                 return;
1248         }
1249
1250         wlat_test(cb);
1251
1252 }
1253
1254 static void krping_bw_test_server(struct krping_cb *cb)
1255 {
1256         struct ib_send_wr *bad_wr;
1257         struct ib_wc wc;
1258         int ret;
1259
1260         /* Spin waiting for client's Start STAG/TO/Len */
1261         while (cb->state < RDMA_READ_ADV) {
1262                 krping_cq_event_handler(cb->cq, cb);
1263         }
1264
1265         /* Send STAG/TO/Len to client */
1266         if (cb->dma_mr)
1267                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1268         else
1269                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1270         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1271         if (ret) {
1272                 log(LOG_ERR, "post send error %d\n", ret);
1273                 return;
1274         }
1275
1276         /* Spin waiting for send completion */
1277         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1278         if (ret < 0) {
1279                 log(LOG_ERR, "poll error %d\n", ret);
1280                 return;
1281         }
1282         if (wc.status) {
1283                 log(LOG_ERR, "send completiong error %d\n", wc.status);
1284                 return;
1285         }
1286
1287         if (cb->duplex)
1288                 bw_test(cb);
1289         krping_wait(cb, ERROR);
1290 }
1291
1292 static int krping_bind_server(struct krping_cb *cb)
1293 {
1294         struct sockaddr_in sin;
1295         int ret;
1296
1297         memset(&sin, 0, sizeof(sin));
1298         sin.sin_len = sizeof sin;
1299         sin.sin_family = AF_INET;
1300         sin.sin_addr.s_addr = cb->addr.s_addr;
1301         sin.sin_port = cb->port;
1302
1303         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1304         if (ret) {
1305                 log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1306                 return ret;
1307         }
1308         DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1309
1310         DEBUG_LOG(PFX "rdma_listen\n");
1311         ret = rdma_listen(cb->cm_id, 3);
1312         if (ret) {
1313                 log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1314                 return ret;
1315         }
1316
1317         krping_wait(cb, CONNECT_REQUEST);
1318         if (cb->state != CONNECT_REQUEST) {
1319                 log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1320                         cb->state);
1321                 return -1;
1322         }
1323
1324         return 0;
1325 }
1326
1327 static void krping_run_server(struct krping_cb *cb)
1328 {
1329         struct ib_recv_wr *bad_wr;
1330         int ret;
1331
1332         ret = krping_bind_server(cb);
1333         if (ret)
1334                 return;
1335
1336         ret = krping_setup_qp(cb, cb->child_cm_id);
1337         if (ret) {
1338                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1339                 return;
1340         }
1341
1342         ret = krping_setup_buffers(cb);
1343         if (ret) {
1344                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1345                 goto err1;
1346         }
1347
1348         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1349         if (ret) {
1350                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1351                 goto err2;
1352         }
1353
1354         ret = krping_accept(cb);
1355         if (ret) {
1356                 log(LOG_ERR, "connect error %d\n", ret);
1357                 goto err2;
1358         }
1359
1360         if (cb->wlat)
1361                 krping_wlat_test_server(cb);
1362         else if (cb->rlat)
1363                 krping_rlat_test_server(cb);
1364         else if (cb->bw)
1365                 krping_bw_test_server(cb);
1366         else
1367                 krping_test_server(cb);
1368
1369         rdma_disconnect(cb->child_cm_id);
1370         rdma_destroy_id(cb->child_cm_id);
1371 err2:
1372         krping_free_buffers(cb);
1373 err1:
1374         krping_free_qp(cb);
1375 }
1376
1377 static void krping_test_client(struct krping_cb *cb)
1378 {
1379         int ping, start, cc, i, ret;
1380         struct ib_send_wr *bad_wr;
1381         unsigned char c;
1382
1383         start = 65;
1384         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1385                 cb->state = RDMA_READ_ADV;
1386
1387                 /* Put some ascii text in the buffer. */
1388                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1389                 for (i = cc, c = start; i < cb->size; i++) {
1390                         cb->start_buf[i] = c;
1391                         c++;
1392                         if (c > 122)
1393                                 c = 65;
1394                 }
1395                 start++;
1396                 if (start > 122)
1397                         start = 65;
1398                 cb->start_buf[cb->size - 1] = 0;
1399
1400                 if (cb->dma_mr)
1401                         krping_format_send(cb, cb->start_addr, cb->dma_mr);
1402                 else
1403                         krping_format_send(cb, cb->start_addr, cb->start_mr);
1404
1405                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1406                 if (ret) {
1407                         log(LOG_ERR, "post send error %d\n", ret);
1408                         break;
1409                 }
1410
1411                 /* Wait for server to ACK */
1412                 krping_wait(cb, RDMA_WRITE_ADV);
1413                 if (cb->state != RDMA_WRITE_ADV) {
1414                         log(LOG_ERR,  
1415                                "wait for RDMA_WRITE_ADV state %d\n",
1416                                cb->state);
1417                         break;
1418                 }
1419
1420                 if (cb->dma_mr)
1421                         krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1422                 else
1423                         krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1424
1425                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1426                 if (ret) {
1427                         log(LOG_ERR, "post send error %d\n", ret);
1428                         break;
1429                 }
1430
1431                 /* Wait for the server to say the RDMA Write is complete. */
1432                 krping_wait(cb, RDMA_WRITE_COMPLETE);
1433                 if (cb->state != RDMA_WRITE_COMPLETE) {
1434                         log(LOG_ERR,  
1435                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1436                                cb->state);
1437                         break;
1438                 }
1439
1440                 if (cb->validate)
1441                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1442                                 log(LOG_ERR, "data mismatch!\n");
1443                                 break;
1444                         }
1445
1446                 if (cb->verbose)
1447                         DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1448         }
1449 }
1450
1451 static void krping_rlat_test_client(struct krping_cb *cb)
1452 {
1453         struct ib_send_wr *bad_wr;
1454         struct ib_wc wc;
1455         int ret;
1456
1457         cb->state = RDMA_READ_ADV;
1458
1459         /* Send STAG/TO/Len to client */
1460         if (cb->dma_mr)
1461                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1462         else
1463                 krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1464         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1465         if (ret) {
1466                 log(LOG_ERR, "post send error %d\n", ret);
1467                 return;
1468         }
1469
1470         /* Spin waiting for send completion */
1471         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1472         if (ret < 0) {
1473                 log(LOG_ERR, "poll error %d\n", ret);
1474                 return;
1475         }
1476         if (wc.status) {
1477                 log(LOG_ERR, "send completion error %d\n", wc.status);
1478                 return;
1479         }
1480
1481         /* Spin waiting for server's Start STAG/TO/Len */
1482         while (cb->state < RDMA_WRITE_ADV) {
1483                 krping_cq_event_handler(cb->cq, cb);
1484         }
1485
1486 #if 0
1487 {
1488         int i;
1489         struct timeval start, stop;
1490         time_t sec;
1491         suseconds_t usec;
1492         unsigned long long elapsed;
1493         struct ib_wc wc;
1494         struct ib_send_wr *bad_wr;
1495         int ne;
1496         
1497         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1498         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1499         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1500         cb->rdma_sq_wr.sg_list->length = 0;
1501         cb->rdma_sq_wr.num_sge = 0;
1502
1503         microtime(&start);
1504         for (i=0; i < 100000; i++) {
1505                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1506                         log(LOG_ERR,  "Couldn't post send\n");
1507                         return;
1508                 }
1509                 do {
1510                         ne = ib_poll_cq(cb->cq, 1, &wc);
1511                 } while (ne == 0);
1512                 if (ne < 0) {
1513                         log(LOG_ERR, "poll CQ failed %d\n", ne);
1514                         return;
1515                 }
1516                 if (wc.status != IB_WC_SUCCESS) {
1517                         log(LOG_ERR, "Completion wth error at %s:\n",
1518                                 cb->server ? "server" : "client");
1519                         log(LOG_ERR, "Failed status %d: wr_id %d\n",
1520                                 wc.status, (int) wc.wr_id);
1521                         return;
1522                 }
1523         }
1524         microtime(&stop);
1525         
1526         if (stop.tv_usec < start.tv_usec) {
1527                 stop.tv_usec += 1000000;
1528                 stop.tv_sec  -= 1;
1529         }
1530         sec     = stop.tv_sec - start.tv_sec;
1531         usec    = stop.tv_usec - start.tv_usec;
1532         elapsed = sec * 1000000 + usec;
1533         log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1534 }
1535 #endif
1536
1537         rlat_test(cb);
1538 }
1539
1540 static void krping_wlat_test_client(struct krping_cb *cb)
1541 {
1542         struct ib_send_wr *bad_wr;
1543         struct ib_wc wc;
1544         int ret;
1545
1546         cb->state = RDMA_READ_ADV;
1547
1548         /* Send STAG/TO/Len to client */
1549         if (cb->dma_mr)
1550                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1551         else
1552                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1553         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1554         if (ret) {
1555                 log(LOG_ERR, "post send error %d\n", ret);
1556                 return;
1557         }
1558
1559         /* Spin waiting for send completion */
1560         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1561         if (ret < 0) {
1562                 log(LOG_ERR, "poll error %d\n", ret);
1563                 return;
1564         }
1565         if (wc.status) {
1566                 log(LOG_ERR, "send completion error %d\n", wc.status);
1567                 return;
1568         }
1569
1570         /* Spin waiting for server's Start STAG/TO/Len */
1571         while (cb->state < RDMA_WRITE_ADV) {
1572                 krping_cq_event_handler(cb->cq, cb);
1573         }
1574
1575         wlat_test(cb);
1576 }
1577
1578 static void krping_bw_test_client(struct krping_cb *cb)
1579 {
1580         struct ib_send_wr *bad_wr;
1581         struct ib_wc wc;
1582         int ret;
1583
1584         cb->state = RDMA_READ_ADV;
1585
1586         /* Send STAG/TO/Len to client */
1587         if (cb->dma_mr)
1588                 krping_format_send(cb, cb->start_addr, cb->dma_mr);
1589         else
1590                 krping_format_send(cb, cb->start_addr, cb->start_mr);
1591         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1592         if (ret) {
1593                 log(LOG_ERR, "post send error %d\n", ret);
1594                 return;
1595         }
1596
1597         /* Spin waiting for send completion */
1598         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1599         if (ret < 0) {
1600                 log(LOG_ERR, "poll error %d\n", ret);
1601                 return;
1602         }
1603         if (wc.status) {
1604                 log(LOG_ERR, "send completion error %d\n", wc.status);
1605                 return;
1606         }
1607
1608         /* Spin waiting for server's Start STAG/TO/Len */
1609         while (cb->state < RDMA_WRITE_ADV) {
1610                 krping_cq_event_handler(cb->cq, cb);
1611         }
1612
1613         bw_test(cb);
1614 }
1615
1616 static int krping_connect_client(struct krping_cb *cb)
1617 {
1618         struct rdma_conn_param conn_param;
1619         int ret;
1620
1621         memset(&conn_param, 0, sizeof conn_param);
1622         conn_param.responder_resources = 1;
1623         conn_param.initiator_depth = 1;
1624         conn_param.retry_count = 10;
1625
1626         ret = rdma_connect(cb->cm_id, &conn_param);
1627         if (ret) {
1628                 log(LOG_ERR, "rdma_connect error %d\n", ret);
1629                 return ret;
1630         }
1631
1632         krping_wait(cb, CONNECTED);
1633         if (cb->state == ERROR) {
1634                 log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1635                 return -1;
1636         }
1637
1638         DEBUG_LOG(PFX "rdma_connect successful\n");
1639         return 0;
1640 }
1641
1642 static int krping_bind_client(struct krping_cb *cb)
1643 {
1644         struct sockaddr_in sin;
1645         int ret;
1646
1647         memset(&sin, 0, sizeof(sin));
1648         sin.sin_len = sizeof sin;
1649         sin.sin_family = AF_INET;
1650         sin.sin_addr.s_addr = cb->addr.s_addr;
1651         sin.sin_port = cb->port;
1652
1653         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1654                                 2000);
1655         if (ret) {
1656                 log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1657                 return ret;
1658         }
1659
1660         krping_wait(cb, ROUTE_RESOLVED);
1661         if (cb->state != ROUTE_RESOLVED) {
1662                 log(LOG_ERR,  
1663                        "addr/route resolution did not resolve: state %d\n",
1664                        cb->state);
1665                 return EINTR;
1666         }
1667
1668         DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1669         return 0;
1670 }
1671
1672 static void krping_run_client(struct krping_cb *cb)
1673 {
1674         struct ib_recv_wr *bad_wr;
1675         int ret;
1676
1677         ret = krping_bind_client(cb);
1678         if (ret)
1679                 return;
1680
1681         ret = krping_setup_qp(cb, cb->cm_id);
1682         if (ret) {
1683                 log(LOG_ERR, "setup_qp failed: %d\n", ret);
1684                 return;
1685         }
1686
1687         ret = krping_setup_buffers(cb);
1688         if (ret) {
1689                 log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1690                 goto err1;
1691         }
1692
1693         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1694         if (ret) {
1695                 log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1696                 goto err2;
1697         }
1698
1699         ret = krping_connect_client(cb);
1700         if (ret) {
1701                 log(LOG_ERR, "connect error %d\n", ret);
1702                 goto err2;
1703         }
1704
1705         if (cb->wlat)
1706                 krping_wlat_test_client(cb);
1707         else if (cb->rlat)
1708                 krping_rlat_test_client(cb);
1709         else if (cb->bw)
1710                 krping_bw_test_client(cb);
1711         else
1712                 krping_test_client(cb);
1713         rdma_disconnect(cb->cm_id);
1714 err2:
1715         krping_free_buffers(cb);
1716 err1:
1717         krping_free_qp(cb);
1718 }
1719
1720 int krping_doit(char *cmd)
1721 {
1722         struct krping_cb *cb;
1723         int op;
1724         int ret = 0;
1725         char *optarg;
1726         unsigned long optint;
1727         debug = 0;
1728
1729         cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1730         if (!cb)
1731                 return ENOMEM;
1732         bzero(cb, sizeof *cb);
1733
1734         mtx_lock(&krping_mutex);
1735         TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1736         mtx_unlock(&krping_mutex);
1737
1738         cb->server = -1;
1739         cb->state = IDLE;
1740         cb->size = 64;
1741         cb->txdepth = RPING_SQ_DEPTH;
1742         mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1743
1744         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1745                               &optint)) != 0) {
1746                 switch (op) {
1747                 case 'a':
1748                         cb->addr_str = optarg;
1749                         DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1750                         if (!inet_aton(optarg, &cb->addr)) {
1751                                 log(LOG_ERR, "bad addr string %s\n", optarg);
1752                                 ret = EINVAL;
1753                         }
1754                         break;
1755                 case 'D':
1756                         cb->use_dmamr = 1;
1757                         DEBUG_LOG(PFX "using dma mr\n");
1758                         break;
1759                 case 'p':
1760                         cb->port = htons(optint);
1761                         DEBUG_LOG(PFX "port %d\n", (int)optint);
1762                         break;
1763                 case 'P':
1764                         cb->poll = 1;
1765                         DEBUG_LOG("server\n");
1766                         break;
1767                 case 's':
1768                         cb->server = 1;
1769                         DEBUG_LOG(PFX "server\n");
1770                         break;
1771                 case 'c':
1772                         cb->server = 0;
1773                         DEBUG_LOG(PFX "client\n");
1774                         break;
1775                 case 'S':
1776                         cb->size = optint;
1777                         if ((cb->size < 1) ||
1778                             (cb->size > RPING_BUFSIZE)) {
1779                                 log(LOG_ERR, "Invalid size %d "
1780                                        "(valid range is 1 to %d)\n",
1781                                        cb->size, RPING_BUFSIZE);
1782                                 ret = EINVAL;
1783                         } else
1784                                 DEBUG_LOG(PFX "size %d\n", (int)optint);
1785                         break;
1786                 case 'C':
1787                         cb->count = optint;
1788                         if (cb->count < 0) {
1789                                 log(LOG_ERR, "Invalid count %d\n",
1790                                         cb->count);
1791                                 ret = EINVAL;
1792                         } else
1793                                 DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1794                         break;
1795                 case 'v':
1796                         cb->verbose++;
1797                         DEBUG_LOG(PFX "verbose\n");
1798                         break;
1799                 case 'V':
1800                         cb->validate++;
1801                         DEBUG_LOG(PFX "validate data\n");
1802                         break;
1803                 case 'L':
1804                         cb->rlat++;
1805                         break;
1806                 case 'l':
1807                         cb->wlat++;
1808                         break;
1809                 case 'B':
1810                         cb->bw++;
1811                         break;
1812                 case 't':
1813                         cb->txdepth = optint;
1814                         DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1815                         break;
1816                 case 'd':
1817                         debug++;
1818                         break;
1819                 default:
1820                         log(LOG_ERR, "unknown opt %s\n", optarg);
1821                         ret = EINVAL;
1822                         break;
1823                 }
1824         }
1825         if (ret)
1826                 goto out;
1827
1828         if (cb->server == -1) {
1829                 log(LOG_ERR, "must be either client or server\n");
1830                 ret = EINVAL;
1831                 goto out;
1832         }
1833         if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1834                 log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1835                 ret = EINVAL;
1836                 goto out;
1837         }
1838
1839
1840         cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1841         if (IS_ERR(cb->cm_id)) {
1842                 ret = PTR_ERR(cb->cm_id);
1843                 log(LOG_ERR, "rdma_create_id error %d\n", ret);
1844                 goto out;
1845         }
1846         DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1847         if (cb->server)
1848                 krping_run_server(cb);
1849         else
1850                 krping_run_client(cb);
1851         DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1852         rdma_destroy_id(cb->cm_id);
1853 out:
1854         mtx_lock(&krping_mutex);
1855         TAILQ_REMOVE(&krping_cbs, cb, list);
1856         mtx_unlock(&krping_mutex);
1857         free(cb, M_DEVBUF);
1858         return ret;
1859 }
1860
1861 void krping_init(void)
1862 {
1863         mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1864         TAILQ_INIT(&krping_cbs);
1865 }