]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/contrib/rdma/krping/krping.c
Update our copy of the Linux dts files to be in sync with Linux 4.5-rc1. We
[FreeBSD/FreeBSD.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/slab.h>
40 #include <linux/err.h>
41 #include <linux/string.h>
42 #include <linux/list.h>
43 #include <linux/in.h>
44 #include <linux/device.h>
45 #include <linux/pci.h>
46 #include <linux/sched.h>
47
48 #include <asm/atomic.h>
49
50 #include <rdma/ib_verbs.h>
51 #include <rdma/rdma_cm.h>
52
53 #include "krping.h"
54 #include "getopt.h"
55
56 extern int krping_debug;
/*
 * Logging helpers.  Both hand a printf-style format plus arguments to
 * krping_printf() together with the control block's cookie.  DEBUG_LOG
 * only produces output while the krping_debug flag is non-zero.
 *
 * DEBUG_LOG is wrapped in do { } while (0) so that it always behaves as a
 * single statement; a bare "if" expansion would silently capture an
 * "else" that follows the macro invocation.
 */
#define DEBUG_LOG(cb, x...) do {                                        \
        if (krping_debug)                                               \
                krping_printf((cb)->cookie, x);                         \
} while (0)
#define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
59
60 MODULE_AUTHOR("Steve Wise");
61 MODULE_DESCRIPTION("RDMA ping client/server");
62 MODULE_LICENSE("Dual BSD/GPL");
63 MODULE_VERSION(krping, 1);
64 MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
65
/*
 * Read the CPU time-stamp counter using the x86 "rdtsc" instruction and
 * return it as a single 64-bit value: the "d" output register supplies
 * the upper 32 bits, the "a" output register the lower 32 bits.
 */
static __inline uint64_t
get_cycles(void)
{
        uint32_t lo_word, hi_word;
        uint64_t tsc;

        __asm __volatile("rdtsc" : "=a" (lo_word), "=d" (hi_word));
        tsc = (uint64_t)hi_word << 32;
        tsc |= lo_word;
        return (tsc);
}

/* Type used for values obtained from get_cycles(). */
typedef uint64_t cycles_t;
75
/*
 * Memory registration mode used for the test buffers (krping_cb.mem).
 * Numbering starts at 1 and continues sequentially.
 */
enum mem_type {
        DMA = 1,        /* single DMA MR obtained with ib_get_dma_mr() */
        FASTREG,        /* fast-register page list + fast-reg MR */
        MW,             /* memory window on top of a registered MR */
        MR              /* physical MRs from ib_reg_phys_mr() */
};
82
/*
 * Table of options understood by krping.  The list ends with an all-zero
 * sentinel entry.
 *
 * NOTE(review): each entry appears to be {option name, argument kind,
 * one-character option id}; struct krping_option is declared in getopt.h,
 * which is not visible here -- confirm the field meaning there.
 */
static const struct krping_option krping_opts[] = {
        {"count", OPT_INT, 'C'},
        {"size", OPT_INT, 'S'},
        {"addr", OPT_STRING, 'a'},
        {"port", OPT_INT, 'p'},
        {"verbose", OPT_NOPARAM, 'v'},
        {"validate", OPT_NOPARAM, 'V'},
        {"server", OPT_NOPARAM, 's'},
        {"client", OPT_NOPARAM, 'c'},
        {"mem_mode", OPT_STRING, 'm'},
        {"server_inv", OPT_NOPARAM, 'I'},
        {"wlat", OPT_NOPARAM, 'l'},
        {"rlat", OPT_NOPARAM, 'L'},
        {"bw", OPT_NOPARAM, 'B'},
        {"duplex", OPT_NOPARAM, 'd'},
        {"txdepth", OPT_INT, 'T'},
        {"poll", OPT_NOPARAM, 'P'},
        {"local_dma_lkey", OPT_NOPARAM, 'Z'},
        {"read_inv", OPT_NOPARAM, 'R'},
        {"fr", OPT_NOPARAM, 'f'},
        {NULL, 0, 0}    /* sentinel */
};
105
/*
 * 64-bit counterparts of htonl()/ntohl(): convert between host byte order
 * and big-endian (network) byte order.  ntohll() uses the be64 -> cpu
 * direction so that the conversion is spelled correctly; the resulting
 * byte swap is the same operation in either direction.
 */
#define htonll(x) cpu_to_be64((x))
#define ntohll(x) be64_to_cpu((x))
108
/*
 * NOTE(review): presumably serializes access to the krping_cbs list below;
 * no user of this mutex is visible in this part of the file -- confirm.
 */
static struct mutex krping_mutex;

/*
 * List of running krping threads.
 */
static LIST_HEAD(krping_cbs);
115
116 /*
117  * krping "ping/pong" loop:
118  *      client sends source rkey/addr/len
119  *      server receives source rkey/addr/len
120  *      server rdma reads "ping" data from source
121  *      server sends "go ahead" on rdma read completion
122  *      client sends sink rkey/addr/len
123  *      server receives sink rkey/addr/len
124  *      server rdma writes "pong" data to sink
125  *      server sends "go ahead" on rdma write completion
126  *      <repeat loop>
127  */
128
129 /*
130  * These states are used to signal events between the completion handler
131  * and the main client or server thread.
132  *
133  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
134  * and RDMA_WRITE_COMPLETE for each ping.
135  */
/*
 * Connection / test progress states.  The numeric order is significant:
 * code in this file compares states with <= and >= (for example
 * "cb->state >= CONNECTED"), and ERROR is the largest value so that it
 * satisfies every such ">=" wait condition.
 */
enum test_state {
        IDLE                    = 1,
        CONNECT_REQUEST         = 2,
        ADDR_RESOLVED           = 3,
        ROUTE_RESOLVED          = 4,
        CONNECTED               = 5,
        RDMA_READ_ADV           = 6,
        RDMA_READ_COMPLETE      = 7,
        RDMA_WRITE_ADV          = 8,
        RDMA_WRITE_COMPLETE     = 9,
        ERROR                   = 10
};
148
/*
 * Buffer advertisement exchanged in send/recv messages.  The receive path
 * (server_recv) converts every field with ntohl()/ntohll() before use, so
 * the fields are carried in network byte order.
 */
struct krping_rdma_info {
        uint64_t buf;           /* buffer address being advertised */
        uint32_t rkey;          /* rkey for that buffer */
        uint32_t size;          /* buffer length */
};
154
155 /*
156  * Default max buffer size for IO...
157  */
/*
 * RPING_BUFSIZE is parenthesized so that it expands as one value inside
 * larger expressions (e.g. "x / RPING_BUFSIZE" or "x % RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 64
160
161 /*
162  * Control block struct.
163  */
struct krping_cb {
        void *cookie;                   /* passed to krping_printf() */
        int server;                     /* 0 iff client */
        struct ib_cq *cq;               /* CQ used for both send and recv */
        struct ib_pd *pd;
        struct ib_qp *qp;

        enum mem_type mem;              /* buffer registration mode */
        struct ib_mr *dma_mr;           /* only set when mem == DMA */

        /* Fast-register state, used when mem == FASTREG. */
        struct ib_fast_reg_page_list *page_list;
        int page_list_len;
        struct ib_send_wr fastreg_wr;
        struct ib_send_wr invalidate_wr;
        struct ib_mr *fastreg_mr;
        int server_invalidate;
        int read_inv;
        u8 key;

        /* Memory-window state, used when mem == MW. */
        struct ib_mw *mw;
        struct ib_mw_bind bind_attr;

        struct ib_recv_wr rq_wr;        /* recv work request record */
        struct ib_sge recv_sgl;         /* recv single SGE */
        struct krping_rdma_info recv_buf;/* recv buffer, embedded in the cb */
        u64 recv_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(recv_mapping)
        struct ib_mr *recv_mr;

        struct ib_send_wr sq_wr;        /* send work request record */
        struct ib_sge send_sgl;
        struct krping_rdma_info send_buf;/* single send buf, embedded */
        u64 send_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(send_mapping)
        struct ib_mr *send_mr;

        struct ib_send_wr rdma_sq_wr;   /* rdma work request record */
        struct ib_sge rdma_sgl;         /* rdma single SGE */
        char *rdma_buf;                 /* used as rdma sink */
        u64  rdma_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
        struct ib_mr *rdma_mr;

        uint32_t remote_rkey;           /* remote guys RKEY */
        uint64_t remote_addr;           /* remote guys TO */
        uint32_t remote_len;            /* remote guys LEN */

        char *start_buf;                /* rdma read src */
        u64  start_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(start_mapping)
        struct ib_mr *start_mr;

        enum test_state state;          /* used for cond/signalling */
        wait_queue_head_t sem;          /* waiters sleep here on state */
        struct krping_stats stats;

        uint16_t port;                  /* dst port in NBO */
        struct in_addr addr;            /* dst addr in NBO */
        char *addr_str;                 /* dst addr string */
        int verbose;                    /* verbose logging */
        int count;                      /* ping count */
        int size;                       /* ping data size */
        int validate;                   /* validate ping data */
        int wlat;                       /* run wlat test */
        int rlat;                       /* run rlat test */
        int bw;                         /* run bw test */
        int duplex;                     /* run bw full duplex test */
        int poll;                       /* poll or block for rlat test */
        int txdepth;                    /* SQ depth */
        int local_dma_lkey;             /* use 0 for lkey */
        int frtest;                     /* fastreg test */

        /* CM stuff */
        struct rdma_cm_id *cm_id;       /* connection on client side,*/
                                        /* listener on server side. */
        struct rdma_cm_id *child_cm_id; /* connection on server side */
        struct list_head list;
};
242
243 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
244                                    struct rdma_cm_event *event)
245 {
246         int ret;
247         struct krping_cb *cb = cma_id->context;
248
249         DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
250             cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
251
252         switch (event->event) {
253         case RDMA_CM_EVENT_ADDR_RESOLVED:
254                 cb->state = ADDR_RESOLVED;
255                 ret = rdma_resolve_route(cma_id, 2000);
256                 if (ret) {
257                         PRINTF(cb, "rdma_resolve_route error %d\n", ret);
258                         wake_up_interruptible(&cb->sem);
259                 }
260                 break;
261
262         case RDMA_CM_EVENT_ROUTE_RESOLVED:
263                 cb->state = ROUTE_RESOLVED;
264                 wake_up_interruptible(&cb->sem);
265                 break;
266
267         case RDMA_CM_EVENT_CONNECT_REQUEST:
268                 cb->state = CONNECT_REQUEST;
269                 cb->child_cm_id = cma_id;
270                 DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
271                 wake_up_interruptible(&cb->sem);
272                 break;
273
274         case RDMA_CM_EVENT_ESTABLISHED:
275                 DEBUG_LOG(cb, "ESTABLISHED\n");
276                 if (!cb->server) {
277                         cb->state = CONNECTED;
278                 }
279                 wake_up_interruptible(&cb->sem);
280                 break;
281
282         case RDMA_CM_EVENT_ADDR_ERROR:
283         case RDMA_CM_EVENT_ROUTE_ERROR:
284         case RDMA_CM_EVENT_CONNECT_ERROR:
285         case RDMA_CM_EVENT_UNREACHABLE:
286         case RDMA_CM_EVENT_REJECTED:
287                 PRINTF(cb, "cma event %d, error %d\n", event->event,
288                        event->status);
289                 cb->state = ERROR;
290                 wake_up_interruptible(&cb->sem);
291                 break;
292
293         case RDMA_CM_EVENT_DISCONNECTED:
294                 PRINTF(cb, "DISCONNECT EVENT...\n");
295                 cb->state = ERROR;
296                 wake_up_interruptible(&cb->sem);
297                 break;
298
299         case RDMA_CM_EVENT_DEVICE_REMOVAL:
300                 PRINTF(cb, "cma detected device removal!!!!\n");
301                 break;
302
303         default:
304                 PRINTF(cb, "oof bad type!\n");
305                 wake_up_interruptible(&cb->sem);
306                 break;
307         }
308         return 0;
309 }
310
311 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
312 {
313         if (wc->byte_len != sizeof(cb->recv_buf)) {
314                 PRINTF(cb, "Received bogus data, size %d\n", 
315                        wc->byte_len);
316                 return -1;
317         }
318
319         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
320         cb->remote_addr = ntohll(cb->recv_buf.buf);
321         cb->remote_len  = ntohl(cb->recv_buf.size);
322         DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
323                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
324                   cb->remote_len);
325
326         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
327                 cb->state = RDMA_READ_ADV;
328         else
329                 cb->state = RDMA_WRITE_ADV;
330
331         return 0;
332 }
333
334 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
335 {
336         if (wc->byte_len != sizeof(cb->recv_buf)) {
337                 PRINTF(cb, "Received bogus data, size %d\n", 
338                        wc->byte_len);
339                 return -1;
340         }
341
342         if (cb->state == RDMA_READ_ADV)
343                 cb->state = RDMA_WRITE_ADV;
344         else
345                 cb->state = RDMA_WRITE_COMPLETE;
346
347         return 0;
348 }
349
350 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
351 {
352         struct krping_cb *cb = ctx;
353         struct ib_wc wc;
354         struct ib_recv_wr *bad_wr;
355         int ret;
356
357         BUG_ON(cb->cq != cq);
358         if (cb->state == ERROR) {
359                 PRINTF(cb, "cq completion in ERROR state\n");
360                 return;
361         }
362         if (cb->frtest) {
363                 PRINTF(cb, "cq completion event in frtest!\n");
364                 return;
365         }
366         if (!cb->wlat && !cb->rlat && !cb->bw)
367                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
368         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
369                 if (wc.status) {
370                         if (wc.status == IB_WC_WR_FLUSH_ERR) {
371                                 DEBUG_LOG(cb, "cq flushed\n");
372                                 continue;
373                         } else {
374                                 PRINTF(cb, "cq completion failed with "
375                                        "wr_id %Lx status %d opcode %d vender_err %x\n",
376                                         wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
377                                 goto error;
378                         }
379                 }
380
381                 switch (wc.opcode) {
382                 case IB_WC_SEND:
383                         DEBUG_LOG(cb, "send completion\n");
384                         cb->stats.send_bytes += cb->send_sgl.length;
385                         cb->stats.send_msgs++;
386                         break;
387
388                 case IB_WC_RDMA_WRITE:
389                         DEBUG_LOG(cb, "rdma write completion\n");
390                         cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
391                         cb->stats.write_msgs++;
392                         cb->state = RDMA_WRITE_COMPLETE;
393                         wake_up_interruptible(&cb->sem);
394                         break;
395
396                 case IB_WC_RDMA_READ:
397                         DEBUG_LOG(cb, "rdma read completion\n");
398                         cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
399                         cb->stats.read_msgs++;
400                         cb->state = RDMA_READ_COMPLETE;
401                         wake_up_interruptible(&cb->sem);
402                         break;
403
404                 case IB_WC_RECV:
405                         DEBUG_LOG(cb, "recv completion\n");
406                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
407                         cb->stats.recv_msgs++;
408                         if (cb->wlat || cb->rlat || cb->bw)
409                                 ret = server_recv(cb, &wc);
410                         else
411                                 ret = cb->server ? server_recv(cb, &wc) :
412                                                    client_recv(cb, &wc);
413                         if (ret) {
414                                 PRINTF(cb, "recv wc error: %d\n", ret);
415                                 goto error;
416                         }
417
418                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
419                         if (ret) {
420                                 PRINTF(cb, "post recv error: %d\n", 
421                                        ret);
422                                 goto error;
423                         }
424                         wake_up_interruptible(&cb->sem);
425                         break;
426
427                 default:
428                         PRINTF(cb, 
429                                "%s:%d Unexpected opcode %d, Shutting down\n",
430                                __func__, __LINE__, wc.opcode);
431                         goto error;
432                 }
433         }
434         if (ret) {
435                 PRINTF(cb, "poll error %d\n", ret);
436                 goto error;
437         }
438         return;
439 error:
440         cb->state = ERROR;
441         wake_up_interruptible(&cb->sem);
442 }
443
444 static int krping_accept(struct krping_cb *cb)
445 {
446         struct rdma_conn_param conn_param;
447         int ret;
448
449         DEBUG_LOG(cb, "accepting client connection request\n");
450
451         memset(&conn_param, 0, sizeof conn_param);
452         conn_param.responder_resources = 1;
453         conn_param.initiator_depth = 1;
454
455         ret = rdma_accept(cb->child_cm_id, &conn_param);
456         if (ret) {
457                 PRINTF(cb, "rdma_accept error: %d\n", ret);
458                 return ret;
459         }
460
461         if (!cb->wlat && !cb->rlat && !cb->bw) {
462                 wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
463                 if (cb->state == ERROR) {
464                         PRINTF(cb, "wait for CONNECTED state %d\n", 
465                                 cb->state);
466                         return -1;
467                 }
468         }
469         return 0;
470 }
471
472 static void krping_setup_wr(struct krping_cb *cb)
473 {
474         cb->recv_sgl.addr = cb->recv_dma_addr;
475         cb->recv_sgl.length = sizeof cb->recv_buf;
476         if (cb->local_dma_lkey)
477                 cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
478         else if (cb->mem == DMA)
479                 cb->recv_sgl.lkey = cb->dma_mr->lkey;
480         else
481                 cb->recv_sgl.lkey = cb->recv_mr->lkey;
482         cb->rq_wr.sg_list = &cb->recv_sgl;
483         cb->rq_wr.num_sge = 1;
484
485         cb->send_sgl.addr = cb->send_dma_addr;
486         cb->send_sgl.length = sizeof cb->send_buf;
487         if (cb->local_dma_lkey)
488                 cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
489         else if (cb->mem == DMA)
490                 cb->send_sgl.lkey = cb->dma_mr->lkey;
491         else
492                 cb->send_sgl.lkey = cb->send_mr->lkey;
493
494         cb->sq_wr.opcode = IB_WR_SEND;
495         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
496         cb->sq_wr.sg_list = &cb->send_sgl;
497         cb->sq_wr.num_sge = 1;
498
499         if (cb->server || cb->wlat || cb->rlat || cb->bw) {
500                 cb->rdma_sgl.addr = cb->rdma_dma_addr;
501                 if (cb->mem == MR)
502                         cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
503                 cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
504                 cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
505                 cb->rdma_sq_wr.num_sge = 1;
506         }
507
508         switch(cb->mem) {
509         case FASTREG:
510
511                 /* 
512                  * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR.
513                  * both unsignaled.  The client uses them to reregister
514                  * the rdma buffers with a new key each iteration.
515                  */
516                 cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
517                 cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
518                 cb->fastreg_wr.wr.fast_reg.length = cb->size;
519                 cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
520                 cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
521
522                 cb->invalidate_wr.next = &cb->fastreg_wr;
523                 cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
524                 break;
525         case MW:
526                 cb->bind_attr.wr_id = 0xabbaabba;
527                 cb->bind_attr.send_flags = 0; /* unsignaled */
528                 cb->bind_attr.bind_info.length = cb->size;
529                 break;
530         default:
531                 break;
532         }
533 }
534
535 static int krping_setup_buffers(struct krping_cb *cb)
536 {
537         int ret;
538         struct ib_phys_buf buf;
539         u64 iovbase;
540
541         DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
542
543         cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, 
544                                    &cb->recv_buf, 
545                                    sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
546         pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
547         cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, 
548                                            &cb->send_buf, sizeof(cb->send_buf),
549                                            DMA_BIDIRECTIONAL);
550         pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
551
552         if (cb->mem == DMA) {
553                 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
554                                            IB_ACCESS_REMOTE_READ|
555                                            IB_ACCESS_REMOTE_WRITE);
556                 if (IS_ERR(cb->dma_mr)) {
557                         DEBUG_LOG(cb, "reg_dmamr failed\n");
558                         ret = PTR_ERR(cb->dma_mr);
559                         goto bail;
560                 }
561         } else {
562                 if (!cb->local_dma_lkey) {
563                         buf.addr = cb->recv_dma_addr;
564                         buf.size = sizeof cb->recv_buf;
565                         DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr, 
566                                 (int)buf.size);
567                         iovbase = cb->recv_dma_addr;
568                         cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
569                                                      IB_ACCESS_LOCAL_WRITE, 
570                                                      &iovbase);
571
572                         if (IS_ERR(cb->recv_mr)) {
573                                 DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
574                                 ret = PTR_ERR(cb->recv_mr);
575                                 goto bail;
576                         }
577
578                         buf.addr = cb->send_dma_addr;
579                         buf.size = sizeof cb->send_buf;
580                         DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr, 
581                                 (int)buf.size);
582                         iovbase = cb->send_dma_addr;
583                         cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
584                                                      0, &iovbase);
585
586                         if (IS_ERR(cb->send_mr)) {
587                                 DEBUG_LOG(cb, "send_buf reg_mr failed\n");
588                                 ret = PTR_ERR(cb->send_mr);
589                                 goto bail;
590                         }
591                 }
592         }
593
594         cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
595         if (!cb->rdma_buf) {
596                 DEBUG_LOG(cb, "rdma_buf malloc failed\n");
597                 ret = -ENOMEM;
598                 goto bail;
599         }
600
601         cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, 
602                                cb->rdma_buf, cb->size, 
603                                DMA_BIDIRECTIONAL);
604         pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
605         if (cb->mem != DMA) {
606                 switch (cb->mem) {
607                 case FASTREG:
608                         cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
609                                 PAGE_SIZE) >> PAGE_SHIFT;
610                         cb->page_list = ib_alloc_fast_reg_page_list(
611                                                 cb->pd->device, 
612                                                 cb->page_list_len);
613                         if (IS_ERR(cb->page_list)) {
614                                 DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
615                                 ret = PTR_ERR(cb->page_list);
616                                 goto bail;
617                         }
618                         cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, 
619                                         cb->page_list->max_page_list_len);
620                         if (IS_ERR(cb->fastreg_mr)) {
621                                 DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
622                                 ret = PTR_ERR(cb->fastreg_mr);
623                                 goto bail;
624                         }
625                         DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
626                                 " page_list_len %u\n", cb->fastreg_mr->rkey, 
627                                 cb->page_list, cb->page_list_len);
628                         break;
629                 case MW:
630                         cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1);
631                         if (IS_ERR(cb->mw)) {
632                                 DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
633                                 ret = PTR_ERR(cb->mw);
634                                 goto bail;
635                         }
636                         DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
637                         /*FALLTHROUGH*/
638                 case MR:
639                         buf.addr = cb->rdma_dma_addr;
640                         buf.size = cb->size;
641                         iovbase = cb->rdma_dma_addr;
642                         cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
643                                              IB_ACCESS_LOCAL_WRITE|
644                                              IB_ACCESS_REMOTE_READ| 
645                                              IB_ACCESS_REMOTE_WRITE, 
646                                              &iovbase);
647                         if (IS_ERR(cb->rdma_mr)) {
648                                 DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
649                                 ret = PTR_ERR(cb->rdma_mr);
650                                 goto bail;
651                         }
652                         DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n", 
653                                 buf.addr, (int)buf.size, cb->rdma_mr->rkey);
654                         break;
655                 default:
656                         ret = -EINVAL;
657                         goto bail;
658                         break;
659                 }
660         }
661
662         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
663
664                 cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
665                 if (!cb->start_buf) {
666                         DEBUG_LOG(cb, "start_buf malloc failed\n");
667                         ret = -ENOMEM;
668                         goto bail;
669                 }
670
671                 cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, 
672                                                    cb->start_buf, cb->size, 
673                                                    DMA_BIDIRECTIONAL);
674                 pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
675
676                 if (cb->mem == MR || cb->mem == MW) {
677                         unsigned flags = IB_ACCESS_REMOTE_READ;
678
679                         if (cb->wlat || cb->rlat || cb->bw) {
680                                 flags |= IB_ACCESS_LOCAL_WRITE |
681                                     IB_ACCESS_REMOTE_WRITE;
682                         }
683
684                         buf.addr = cb->start_dma_addr;
685                         buf.size = cb->size;
686                         DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n", 
687                                 buf.addr, (int)buf.size);
688                         iovbase = cb->start_dma_addr;
689                         cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
690                                              flags,
691                                              &iovbase);
692
693                         if (IS_ERR(cb->start_mr)) {
694                                 DEBUG_LOG(cb, "start_buf reg_mr failed\n");
695                                 ret = PTR_ERR(cb->start_mr);
696                                 goto bail;
697                         }
698                 }
699         }
700
701         krping_setup_wr(cb);
702         DEBUG_LOG(cb, "allocated & registered buffers...\n");
703         return 0;
704 bail:
705         if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
706                 ib_dereg_mr(cb->fastreg_mr);
707         if (cb->mw && !IS_ERR(cb->mw))
708                 ib_dealloc_mw(cb->mw);
709         if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
710                 ib_dereg_mr(cb->rdma_mr);
711         if (cb->page_list && !IS_ERR(cb->page_list))
712                 ib_free_fast_reg_page_list(cb->page_list);
713         if (cb->dma_mr && !IS_ERR(cb->dma_mr))
714                 ib_dereg_mr(cb->dma_mr);
715         if (cb->recv_mr && !IS_ERR(cb->recv_mr))
716                 ib_dereg_mr(cb->recv_mr);
717         if (cb->send_mr && !IS_ERR(cb->send_mr))
718                 ib_dereg_mr(cb->send_mr);
719         if (cb->rdma_buf)
720                 kfree(cb->rdma_buf);
721         if (cb->start_buf)
722                 kfree(cb->start_buf);
723         return ret;
724 }
725
726 static void krping_free_buffers(struct krping_cb *cb)
727 {
728         DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
729         
730         if (cb->dma_mr)
731                 ib_dereg_mr(cb->dma_mr);
732         if (cb->send_mr)
733                 ib_dereg_mr(cb->send_mr);
734         if (cb->recv_mr)
735                 ib_dereg_mr(cb->recv_mr);
736         if (cb->rdma_mr)
737                 ib_dereg_mr(cb->rdma_mr);
738         if (cb->start_mr)
739                 ib_dereg_mr(cb->start_mr);
740         if (cb->fastreg_mr)
741                 ib_dereg_mr(cb->fastreg_mr);
742         if (cb->mw)
743                 ib_dealloc_mw(cb->mw);
744
745         dma_unmap_single(cb->pd->device->dma_device,
746                          pci_unmap_addr(cb, recv_mapping),
747                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
748         dma_unmap_single(cb->pd->device->dma_device,
749                          pci_unmap_addr(cb, send_mapping),
750                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
751         dma_unmap_single(cb->pd->device->dma_device,
752                          pci_unmap_addr(cb, rdma_mapping),
753                          cb->size, DMA_BIDIRECTIONAL);
754         kfree(cb->rdma_buf);
755         if (cb->start_buf) {
756                 dma_unmap_single(cb->pd->device->dma_device,
757                          pci_unmap_addr(cb, start_mapping),
758                          cb->size, DMA_BIDIRECTIONAL);
759                 kfree(cb->start_buf);
760         }
761 }
762
763 static int krping_create_qp(struct krping_cb *cb)
764 {
765         struct ib_qp_init_attr init_attr;
766         int ret;
767
768         memset(&init_attr, 0, sizeof(init_attr));
769         init_attr.cap.max_send_wr = cb->txdepth;
770         init_attr.cap.max_recv_wr = 2;
771         init_attr.cap.max_recv_sge = 1;
772         init_attr.cap.max_send_sge = 1;
773         init_attr.qp_type = IB_QPT_RC;
774         init_attr.send_cq = cb->cq;
775         init_attr.recv_cq = cb->cq;
776         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
777
778         if (cb->server) {
779                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
780                 if (!ret)
781                         cb->qp = cb->child_cm_id->qp;
782         } else {
783                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
784                 if (!ret)
785                         cb->qp = cb->cm_id->qp;
786         }
787
788         return ret;
789 }
790
791 static void krping_free_qp(struct krping_cb *cb)
792 {
793         ib_destroy_qp(cb->qp);
794         ib_destroy_cq(cb->cq);
795         ib_dealloc_pd(cb->pd);
796 }
797
798 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
799 {
800         int ret;
801         cb->pd = ib_alloc_pd(cm_id->device);
802         if (IS_ERR(cb->pd)) {
803                 PRINTF(cb, "ib_alloc_pd failed\n");
804                 return PTR_ERR(cb->pd);
805         }
806         DEBUG_LOG(cb, "created pd %p\n", cb->pd);
807
808         strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
809
810         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
811                               cb, cb->txdepth * 2, 0);
812         if (IS_ERR(cb->cq)) {
813                 PRINTF(cb, "ib_create_cq failed\n");
814                 ret = PTR_ERR(cb->cq);
815                 goto err1;
816         }
817         DEBUG_LOG(cb, "created cq %p\n", cb->cq);
818
819         if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
820                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
821                 if (ret) {
822                         PRINTF(cb, "ib_create_cq failed\n");
823                         goto err2;
824                 }
825         }
826
827         ret = krping_create_qp(cb);
828         if (ret) {
829                 PRINTF(cb, "krping_create_qp failed: %d\n", ret);
830                 goto err2;
831         }
832         DEBUG_LOG(cb, "created qp %p\n", cb->qp);
833         return 0;
834 err2:
835         ib_destroy_cq(cb->cq);
836 err1:
837         ib_dealloc_pd(cb->pd);
838         return ret;
839 }
840
841 /*
842  * return the (possibly rebound) rkey for the rdma buffer.
843  * FASTREG mode: invalidate and rebind via fastreg wr.
844  * MW mode: rebind the MW.
845  * other modes: just return the mr rkey.
846  */
847 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
848 {
849         u32 rkey = 0xffffffff;
850         u64 p;
851         struct ib_send_wr *bad_wr;
852         int i;
853         int ret;
854
855         switch (cb->mem) {
856         case FASTREG:
857                 cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
858
859                 /*
860                  * Update the fastreg key.
861                  */
862                 ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
863                 cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
864
865                 /*
866                  * Update the fastreg WR with new buf info.
867                  */
868                 if (buf == (u64)cb->start_dma_addr)
869                         cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
870                 else
871                         cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
872                 cb->fastreg_wr.wr.fast_reg.iova_start = buf;
873                 p = (u64)(buf & PAGE_MASK);
874                 for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; 
875                      i++, p += PAGE_SIZE) {
876                         cb->page_list->page_list[i] = p;
877                         DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
878                 }
879
880                 DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
881                         " iova_start %llx page_list_len %u\n",
882                         post_inv,
883                         cb->fastreg_wr.wr.fast_reg.rkey,
884                         cb->fastreg_wr.wr.fast_reg.page_shift,
885                         cb->fastreg_wr.wr.fast_reg.length,
886                         cb->fastreg_wr.wr.fast_reg.iova_start,
887                         cb->fastreg_wr.wr.fast_reg.page_list_len);
888
889                 if (post_inv)
890                         ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
891                 else
892                         ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
893                 if (ret) {
894                         PRINTF(cb, "post send error %d\n", ret);
895                         cb->state = ERROR;
896                 }
897                 rkey = cb->fastreg_mr->rkey;
898                 break;
899         case MW:
900                 /*
901                  * Update the MW with new buf info.
902                  */
903                 if (buf == (u64)cb->start_dma_addr) {
904                         cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
905                         cb->bind_attr.bind_info.mr = cb->start_mr;
906                 } else {
907                         cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
908                         cb->bind_attr.bind_info.mr = cb->rdma_mr;
909                 }
910                 cb->bind_attr.bind_info.addr = buf;
911                 DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
912                         cb->mw->rkey, buf, cb->bind_attr.bind_info.mr->rkey);
913                 ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
914                 if (ret) {
915                         PRINTF(cb, "bind mw error %d\n", ret);
916                         cb->state = ERROR;
917                 } else
918                         rkey = cb->mw->rkey;
919                 break;
920         case MR:
921                 if (buf == (u64)cb->start_dma_addr)
922                         rkey = cb->start_mr->rkey;
923                 else
924                         rkey = cb->rdma_mr->rkey;
925                 break;
926         case DMA:
927                 rkey = cb->dma_mr->rkey;
928                 break;
929         default:
930                 PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
931                 cb->state = ERROR;
932                 break;
933         }
934         return rkey;
935 }
936
937 static void krping_format_send(struct krping_cb *cb, u64 buf)
938 {
939         struct krping_rdma_info *info = &cb->send_buf;
940         u32 rkey;
941
942         /*
943          * Client side will do fastreg or mw bind before
944          * advertising the rdma buffer.  Server side
945          * sends have no data.
946          */
947         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
948                 rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
949                 info->buf = htonll(buf);
950                 info->rkey = htonl(rkey);
951                 info->size = htonl(cb->size);
952                 DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
953                           (unsigned long long)buf, rkey, cb->size);
954         }
955 }
956
957 static void krping_test_server(struct krping_cb *cb)
958 {
959         struct ib_send_wr *bad_wr, inv;
960         int ret;
961
962         while (1) {
963                 /* Wait for client's Start STAG/TO/Len */
964                 wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
965                 if (cb->state != RDMA_READ_ADV) {
966                         PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
967                                 cb->state);
968                         break;
969                 }
970
971                 DEBUG_LOG(cb, "server received sink adv\n");
972
973                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
974                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
975                 cb->rdma_sq_wr.sg_list->length = cb->remote_len;
976                 cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
977                 cb->rdma_sq_wr.next = NULL;
978
979                 /* Issue RDMA Read. */
980                 if (cb->read_inv)
981                         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
982                 else {
983
984                         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
985                         if (cb->mem == FASTREG) {
986                                 /* 
987                                  * Immediately follow the read with a 
988                                  * fenced LOCAL_INV.
989                                  */
990                                 cb->rdma_sq_wr.next = &inv;
991                                 memset(&inv, 0, sizeof inv);
992                                 inv.opcode = IB_WR_LOCAL_INV;
993                                 inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
994                                 inv.send_flags = IB_SEND_FENCE;
995                         }
996                 }
997
998                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
999                 if (ret) {
1000                         PRINTF(cb, "post send error %d\n", ret);
1001                         break;
1002                 }
1003                 cb->rdma_sq_wr.next = NULL;
1004
1005                 DEBUG_LOG(cb, "server posted rdma read req \n");
1006
1007                 /* Wait for read completion */
1008                 wait_event_interruptible(cb->sem, 
1009                                          cb->state >= RDMA_READ_COMPLETE);
1010                 if (cb->state != RDMA_READ_COMPLETE) {
1011                         PRINTF(cb, 
1012                                "wait for RDMA_READ_COMPLETE state %d\n",
1013                                cb->state);
1014                         break;
1015                 }
1016                 DEBUG_LOG(cb, "server received read complete\n");
1017
1018                 /* Display data in recv buf */
1019                 if (cb->verbose) {
1020                         if (strlen(cb->rdma_buf) > 128) {
1021                                 char msgbuf[128];
1022
1023                                 strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
1024                                 PRINTF(cb, "server ping data stripped: %s\n",
1025                                        msgbuf);
1026                         } else
1027                                 PRINTF(cb, "server ping data: %s\n",
1028                                        cb->rdma_buf);
1029                 }
1030
1031                 /* Tell client to continue */
1032                 if (cb->server && cb->server_invalidate) {
1033                         cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1034                         cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1035                         DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1036                 } 
1037                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1038                 if (ret) {
1039                         PRINTF(cb, "post send error %d\n", ret);
1040                         break;
1041                 }
1042                 DEBUG_LOG(cb, "server posted go ahead\n");
1043
1044                 /* Wait for client's RDMA STAG/TO/Len */
1045                 wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1046                 if (cb->state != RDMA_WRITE_ADV) {
1047                         PRINTF(cb, 
1048                                "wait for RDMA_WRITE_ADV state %d\n",
1049                                cb->state);
1050                         break;
1051                 }
1052                 DEBUG_LOG(cb, "server received sink adv\n");
1053
1054                 /* RDMA Write echo data */
1055                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1056                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1057                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1058                 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1059                 if (cb->local_dma_lkey)
1060                         cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1061                 else 
1062                         cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1063                         
1064                 DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1065                           cb->rdma_sq_wr.sg_list->lkey,
1066                           (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1067                           cb->rdma_sq_wr.sg_list->length);
1068
1069                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1070                 if (ret) {
1071                         PRINTF(cb, "post send error %d\n", ret);
1072                         break;
1073                 }
1074
1075                 /* Wait for completion */
1076                 ret = wait_event_interruptible(cb->sem, cb->state >= 
1077                                                          RDMA_WRITE_COMPLETE);
1078                 if (cb->state != RDMA_WRITE_COMPLETE) {
1079                         PRINTF(cb, 
1080                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1081                                cb->state);
1082                         break;
1083                 }
1084                 DEBUG_LOG(cb, "server rdma write complete \n");
1085
1086                 cb->state = CONNECTED;
1087
1088                 /* Tell client to begin again */
1089                 if (cb->server && cb->server_invalidate) {
1090                         cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1091                         cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1092                         DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1093                 } 
1094                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1095                 if (ret) {
1096                         PRINTF(cb, "post send error %d\n", ret);
1097                         break;
1098                 }
1099                 DEBUG_LOG(cb, "server posted go ahead\n");
1100         }
1101 }
1102
1103 static void rlat_test(struct krping_cb *cb)
1104 {
1105         int scnt;
1106         int iters = cb->count;
1107         struct timeval start_tv, stop_tv;
1108         int ret;
1109         struct ib_wc wc;
1110         struct ib_send_wr *bad_wr;
1111         int ne;
1112
1113         scnt = 0;
1114         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1115         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1116         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1117         cb->rdma_sq_wr.sg_list->length = cb->size;
1118
1119         microtime(&start_tv);
1120         if (!cb->poll) {
1121                 cb->state = RDMA_READ_ADV;
1122                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1123         }
1124         while (scnt < iters) {
1125
1126                 cb->state = RDMA_READ_ADV;
1127                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1128                 if (ret) {
1129                         PRINTF(cb, 
1130                                 "Couldn't post send: ret=%d scnt %d\n",
1131                                 ret, scnt);
1132                         return;
1133                 }
1134
1135                 do {
1136                         if (!cb->poll) {
1137                                 wait_event_interruptible(cb->sem, 
1138                                         cb->state != RDMA_READ_ADV);
1139                                 if (cb->state == RDMA_READ_COMPLETE) {
1140                                         ne = 1;
1141                                         ib_req_notify_cq(cb->cq, 
1142                                                 IB_CQ_NEXT_COMP);
1143                                 } else {
1144                                         ne = -1;
1145                                 }
1146                         } else
1147                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1148                         if (cb->state == ERROR) {
1149                                 PRINTF(cb, 
1150                                         "state == ERROR...bailing scnt %d\n", 
1151                                         scnt);
1152                                 return;
1153                         }
1154                 } while (ne == 0);
1155
1156                 if (ne < 0) {
1157                         PRINTF(cb, "poll CQ failed %d\n", ne);
1158                         return;
1159                 }
1160                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
1161                         PRINTF(cb, "Completion wth error at %s:\n",
1162                                 cb->server ? "server" : "client");
1163                         PRINTF(cb, "Failed status %d: wr_id %d\n",
1164                                 wc.status, (int) wc.wr_id);
1165                         return;
1166                 }
1167                 ++scnt;
1168         }
1169         microtime(&stop_tv);
1170
1171         if (stop_tv.tv_usec < start_tv.tv_usec) {
1172                 stop_tv.tv_usec += 1000000;
1173                 stop_tv.tv_sec  -= 1;
1174         }
1175
1176         PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1177                 stop_tv.tv_sec - start_tv.tv_sec, 
1178                 stop_tv.tv_usec - start_tv.tv_usec,
1179                 scnt, cb->size);
1180 }
1181
1182 static void wlat_test(struct krping_cb *cb)
1183 {
1184         int ccnt, scnt, rcnt;
1185         int iters=cb->count;
1186         volatile char *poll_buf = (char *) cb->start_buf;
1187         char *buf = (char *)cb->rdma_buf;
1188         struct timeval start_tv, stop_tv;
1189         cycles_t *post_cycles_start, *post_cycles_stop;
1190         cycles_t *poll_cycles_start, *poll_cycles_stop;
1191         cycles_t *last_poll_cycles_start;
1192         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1193         int i;
1194         int cycle_iters = 1000;
1195
1196         ccnt = 0;
1197         scnt = 0;
1198         rcnt = 0;
1199
1200         post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1201         if (!post_cycles_start) {
1202                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1203                 return;
1204         }
1205         post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1206         if (!post_cycles_stop) {
1207                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1208                 return;
1209         }
1210         poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1211         if (!poll_cycles_start) {
1212                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1213                 return;
1214         }
1215         poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1216         if (!poll_cycles_stop) {
1217                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1218                 return;
1219         }
1220         last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
1221                 GFP_KERNEL);
1222         if (!last_poll_cycles_start) {
1223                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1224                 return;
1225         }
1226         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1227         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1228         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1229         cb->rdma_sq_wr.sg_list->length = cb->size;
1230
1231         if (cycle_iters > iters)
1232                 cycle_iters = iters;
1233         microtime(&start_tv);
1234         while (scnt < iters || ccnt < iters || rcnt < iters) {
1235
1236                 /* Wait till buffer changes. */
1237                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1238                         ++rcnt;
1239                         while (*poll_buf != (char)rcnt) {
1240                                 if (cb->state == ERROR) {
1241                                         PRINTF(cb, 
1242                                                 "state = ERROR, bailing\n");
1243                                         return;
1244                                 }
1245                         }
1246                 }
1247
1248                 if (scnt < iters) {
1249                         struct ib_send_wr *bad_wr;
1250
1251                         *buf = (char)scnt+1;
1252                         if (scnt < cycle_iters)
1253                                 post_cycles_start[scnt] = get_cycles();
1254                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1255                                 PRINTF(cb, 
1256                                         "Couldn't post send: scnt=%d\n",
1257                                         scnt);
1258                                 return;
1259                         }
1260                         if (scnt < cycle_iters)
1261                                 post_cycles_stop[scnt] = get_cycles();
1262                         scnt++;
1263                 }
1264
1265                 if (ccnt < iters) {
1266                         struct ib_wc wc;
1267                         int ne;
1268
1269                         if (ccnt < cycle_iters)
1270                                 poll_cycles_start[ccnt] = get_cycles();
1271                         do {
1272                                 if (ccnt < cycle_iters)
1273                                         last_poll_cycles_start[ccnt] = 
1274                                                 get_cycles();
1275                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1276                         } while (ne == 0);
1277                         if (ccnt < cycle_iters)
1278                                 poll_cycles_stop[ccnt] = get_cycles();
1279                         ++ccnt;
1280
1281                         if (ne < 0) {
1282                                 PRINTF(cb, "poll CQ failed %d\n", ne);
1283                                 return;
1284                         }
1285                         if (wc.status != IB_WC_SUCCESS) {
1286                                 PRINTF(cb, 
1287                                         "Completion wth error at %s:\n",
1288                                         cb->server ? "server" : "client");
1289                                 PRINTF(cb, 
1290                                         "Failed status %d: wr_id %d\n",
1291                                         wc.status, (int) wc.wr_id);
1292                                 PRINTF(cb, 
1293                                         "scnt=%d, rcnt=%d, ccnt=%d\n",
1294                                         scnt, rcnt, ccnt);
1295                                 return;
1296                         }
1297                 }
1298         }
1299         microtime(&stop_tv);
1300
1301         if (stop_tv.tv_usec < start_tv.tv_usec) {
1302                 stop_tv.tv_usec += 1000000;
1303                 stop_tv.tv_sec  -= 1;
1304         }
1305
1306         for (i=0; i < cycle_iters; i++) {
1307                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1308                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1309                 sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1310         }
1311         PRINTF(cb, 
1312                 "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1313                 " sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1314                 stop_tv.tv_sec - start_tv.tv_sec, 
1315                 stop_tv.tv_usec - start_tv.tv_usec,
1316                 scnt, cb->size, cycle_iters, 
1317                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1318                 (unsigned long long)sum_last_poll);
1319         kfree(post_cycles_start);
1320         kfree(post_cycles_stop);
1321         kfree(poll_cycles_start);
1322         kfree(poll_cycles_stop);
1323         kfree(last_poll_cycles_start);
1324 }
1325
1326 static void bw_test(struct krping_cb *cb)
1327 {
1328         int ccnt, scnt, rcnt;
1329         int iters=cb->count;
1330         struct timeval start_tv, stop_tv;
1331         cycles_t *post_cycles_start, *post_cycles_stop;
1332         cycles_t *poll_cycles_start, *poll_cycles_stop;
1333         cycles_t *last_poll_cycles_start;
1334         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1335         int i;
1336         int cycle_iters = 1000;
1337
1338         ccnt = 0;
1339         scnt = 0;
1340         rcnt = 0;
1341
1342         post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1343         if (!post_cycles_start) {
1344                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1345                 return;
1346         }
1347         post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1348         if (!post_cycles_stop) {
1349                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1350                 return;
1351         }
1352         poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1353         if (!poll_cycles_start) {
1354                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1355                 return;
1356         }
1357         poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1358         if (!poll_cycles_stop) {
1359                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1360                 return;
1361         }
1362         last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
1363                 GFP_KERNEL);
1364         if (!last_poll_cycles_start) {
1365                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1366                 return;
1367         }
1368         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1369         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1370         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1371         cb->rdma_sq_wr.sg_list->length = cb->size;
1372
1373         if (cycle_iters > iters)
1374                 cycle_iters = iters;
1375         microtime(&start_tv);
1376         while (scnt < iters || ccnt < iters) {
1377
1378                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1379                         struct ib_send_wr *bad_wr;
1380
1381                         if (scnt < cycle_iters)
1382                                 post_cycles_start[scnt] = get_cycles();
1383                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1384                                 PRINTF(cb, 
1385                                         "Couldn't post send: scnt=%d\n",
1386                                         scnt);
1387                                 return;
1388                         }
1389                         if (scnt < cycle_iters)
1390                                 post_cycles_stop[scnt] = get_cycles();
1391                         ++scnt;
1392                 }
1393
1394                 if (ccnt < iters) {
1395                         int ne;
1396                         struct ib_wc wc;
1397
1398                         if (ccnt < cycle_iters)
1399                                 poll_cycles_start[ccnt] = get_cycles();
1400                         do {
1401                                 if (ccnt < cycle_iters)
1402                                         last_poll_cycles_start[ccnt] = 
1403                                                 get_cycles();
1404                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1405                         } while (ne == 0);
1406                         if (ccnt < cycle_iters)
1407                                 poll_cycles_stop[ccnt] = get_cycles();
1408                         ccnt += 1;
1409
1410                         if (ne < 0) {
1411                                 PRINTF(cb, "poll CQ failed %d\n", ne);
1412                                 return;
1413                         }
1414                         if (wc.status != IB_WC_SUCCESS) {
1415                                 PRINTF(cb, 
1416                                         "Completion wth error at %s:\n",
1417                                         cb->server ? "server" : "client");
1418                                 PRINTF(cb, 
1419                                         "Failed status %d: wr_id %d\n",
1420                                         wc.status, (int) wc.wr_id);
1421                                 return;
1422                         }
1423                 }
1424         }
1425         microtime(&stop_tv);
1426
1427         if (stop_tv.tv_usec < start_tv.tv_usec) {
1428                 stop_tv.tv_usec += 1000000;
1429                 stop_tv.tv_sec  -= 1;
1430         }
1431
1432         for (i=0; i < cycle_iters; i++) {
1433                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1434                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1435                 sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1436         }
1437         PRINTF(cb, 
1438                 "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1439                 " sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1440                 stop_tv.tv_sec - start_tv.tv_sec, 
1441                 stop_tv.tv_usec - start_tv.tv_usec,
1442                 scnt, cb->size, cycle_iters, 
1443                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1444                 (unsigned long long)sum_last_poll);
1445         kfree(post_cycles_start);
1446         kfree(post_cycles_stop);
1447         kfree(poll_cycles_start);
1448         kfree(poll_cycles_stop);
1449         kfree(last_poll_cycles_start);
1450 }
1451
1452 static void krping_rlat_test_server(struct krping_cb *cb)
1453 {
1454         struct ib_send_wr *bad_wr;
1455         struct ib_wc wc;
1456         int ret;
1457
1458         /* Spin waiting for client's Start STAG/TO/Len */
1459         while (cb->state < RDMA_READ_ADV) {
1460                 krping_cq_event_handler(cb->cq, cb);
1461         }
1462
1463         /* Send STAG/TO/Len to client */
1464         krping_format_send(cb, cb->start_dma_addr);
1465         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1466         if (ret) {
1467                 PRINTF(cb, "post send error %d\n", ret);
1468                 return;
1469         }
1470
1471         /* Spin waiting for send completion */
1472         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1473         if (ret < 0) {
1474                 PRINTF(cb, "poll error %d\n", ret);
1475                 return;
1476         }
1477         if (wc.status) {
1478                 PRINTF(cb, "send completiong error %d\n", wc.status);
1479                 return;
1480         }
1481
1482         wait_event_interruptible(cb->sem, cb->state == ERROR);
1483 }
1484
1485 static void krping_wlat_test_server(struct krping_cb *cb)
1486 {
1487         struct ib_send_wr *bad_wr;
1488         struct ib_wc wc;
1489         int ret;
1490
1491         /* Spin waiting for client's Start STAG/TO/Len */
1492         while (cb->state < RDMA_READ_ADV) {
1493                 krping_cq_event_handler(cb->cq, cb);
1494         }
1495
1496         /* Send STAG/TO/Len to client */
1497         krping_format_send(cb, cb->start_dma_addr);
1498         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1499         if (ret) {
1500                 PRINTF(cb, "post send error %d\n", ret);
1501                 return;
1502         }
1503
1504         /* Spin waiting for send completion */
1505         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1506         if (ret < 0) {
1507                 PRINTF(cb, "poll error %d\n", ret);
1508                 return;
1509         }
1510         if (wc.status) {
1511                 PRINTF(cb, "send completiong error %d\n", wc.status);
1512                 return;
1513         }
1514
1515         wlat_test(cb);
1516         wait_event_interruptible(cb->sem, cb->state == ERROR);
1517 }
1518
1519 static void krping_bw_test_server(struct krping_cb *cb)
1520 {
1521         struct ib_send_wr *bad_wr;
1522         struct ib_wc wc;
1523         int ret;
1524
1525         /* Spin waiting for client's Start STAG/TO/Len */
1526         while (cb->state < RDMA_READ_ADV) {
1527                 krping_cq_event_handler(cb->cq, cb);
1528         }
1529
1530         /* Send STAG/TO/Len to client */
1531         krping_format_send(cb, cb->start_dma_addr);
1532         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1533         if (ret) {
1534                 PRINTF(cb, "post send error %d\n", ret);
1535                 return;
1536         }
1537
1538         /* Spin waiting for send completion */
1539         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1540         if (ret < 0) {
1541                 PRINTF(cb, "poll error %d\n", ret);
1542                 return;
1543         }
1544         if (wc.status) {
1545                 PRINTF(cb, "send completiong error %d\n", wc.status);
1546                 return;
1547         }
1548
1549         if (cb->duplex)
1550                 bw_test(cb);
1551         wait_event_interruptible(cb->sem, cb->state == ERROR);
1552 }
1553
1554 static int fastreg_supported(struct krping_cb *cb)
1555 {
1556         struct ib_device *dev = cb->child_cm_id->device;
1557         struct ib_device_attr attr;
1558         int ret;
1559
1560         ret = ib_query_device(dev, &attr);
1561         if (ret) {
1562                 PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1563                 return 0;
1564         }
1565         if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1566                 PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1567                     attr.device_cap_flags);
1568                 return 0;
1569         }
1570         DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1571                 attr.device_cap_flags);
1572         return 1;
1573 }
1574
1575 static int krping_bind_server(struct krping_cb *cb)
1576 {
1577         struct sockaddr_in sin;
1578         int ret;
1579
1580         memset(&sin, 0, sizeof(sin));
1581         sin.sin_len = sizeof sin;
1582         sin.sin_family = AF_INET;
1583         sin.sin_addr.s_addr = cb->addr.s_addr;
1584         sin.sin_port = cb->port;
1585
1586         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1587         if (ret) {
1588                 PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1589                 return ret;
1590         }
1591         DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1592
1593         DEBUG_LOG(cb, "rdma_listen\n");
1594         ret = rdma_listen(cb->cm_id, 3);
1595         if (ret) {
1596                 PRINTF(cb, "rdma_listen failed: %d\n", ret);
1597                 return ret;
1598         }
1599
1600         wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1601         if (cb->state != CONNECT_REQUEST) {
1602                 PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1603                         cb->state);
1604                 return -1;
1605         }
1606
1607         if (cb->mem == FASTREG && !fastreg_supported(cb))
1608                 return -EINVAL;
1609
1610         return 0;
1611 }
1612
1613 static void krping_run_server(struct krping_cb *cb)
1614 {
1615         struct ib_recv_wr *bad_wr;
1616         int ret;
1617
1618         ret = krping_bind_server(cb);
1619         if (ret)
1620                 return;
1621
1622         ret = krping_setup_qp(cb, cb->child_cm_id);
1623         if (ret) {
1624                 PRINTF(cb, "setup_qp failed: %d\n", ret);
1625                 goto err0;
1626         }
1627
1628         ret = krping_setup_buffers(cb);
1629         if (ret) {
1630                 PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1631                 goto err1;
1632         }
1633
1634         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1635         if (ret) {
1636                 PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1637                 goto err2;
1638         }
1639
1640         ret = krping_accept(cb);
1641         if (ret) {
1642                 PRINTF(cb, "connect error %d\n", ret);
1643                 goto err2;
1644         }
1645
1646         if (cb->wlat)
1647                 krping_wlat_test_server(cb);
1648         else if (cb->rlat)
1649                 krping_rlat_test_server(cb);
1650         else if (cb->bw)
1651                 krping_bw_test_server(cb);
1652         else
1653                 krping_test_server(cb);
1654         rdma_disconnect(cb->child_cm_id);
1655 err2:
1656         krping_free_buffers(cb);
1657 err1:
1658         krping_free_qp(cb);
1659 err0:
1660         rdma_destroy_id(cb->child_cm_id);
1661 }
1662
1663 static void krping_test_client(struct krping_cb *cb)
1664 {
1665         int ping, start, cc, i, ret;
1666         struct ib_send_wr *bad_wr;
1667         unsigned char c;
1668
1669         start = 65;
1670         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1671                 cb->state = RDMA_READ_ADV;
1672
1673                 /* Put some ascii text in the buffer. */
1674                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1675                 for (i = cc, c = start; i < cb->size; i++) {
1676                         cb->start_buf[i] = c;
1677                         c++;
1678                         if (c > 122)
1679                                 c = 65;
1680                 }
1681                 start++;
1682                 if (start > 122)
1683                         start = 65;
1684                 cb->start_buf[cb->size - 1] = 0;
1685
1686                 krping_format_send(cb, cb->start_dma_addr);
1687                 if (cb->state == ERROR) {
1688                         PRINTF(cb, "krping_format_send failed\n");
1689                         break;
1690                 }
1691                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1692                 if (ret) {
1693                         PRINTF(cb, "post send error %d\n", ret);
1694                         break;
1695                 }
1696
1697                 /* Wait for server to ACK */
1698                 wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1699                 if (cb->state != RDMA_WRITE_ADV) {
1700                         PRINTF(cb, 
1701                                "wait for RDMA_WRITE_ADV state %d\n",
1702                                cb->state);
1703                         break;
1704                 }
1705
1706                 krping_format_send(cb, cb->rdma_dma_addr);
1707                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1708                 if (ret) {
1709                         PRINTF(cb, "post send error %d\n", ret);
1710                         break;
1711                 }
1712
1713                 /* Wait for the server to say the RDMA Write is complete. */
1714                 wait_event_interruptible(cb->sem, 
1715                                          cb->state >= RDMA_WRITE_COMPLETE);
1716                 if (cb->state != RDMA_WRITE_COMPLETE) {
1717                         PRINTF(cb, 
1718                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1719                                cb->state);
1720                         break;
1721                 }
1722
1723                 if (cb->validate)
1724                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1725                                 PRINTF(cb, "data mismatch!\n");
1726                                 break;
1727                         }
1728
1729                 if (cb->verbose) {
1730                         if (strlen(cb->rdma_buf) > 128) {
1731                                 char msgbuf[128];
1732
1733                                 strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
1734                                 PRINTF(cb, "ping data stripped: %s\n",
1735                                        msgbuf);
1736                         } else
1737                                 PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1738                 }
1739 #ifdef SLOW_KRPING
1740                 wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1741 #endif
1742         }
1743 }
1744
1745 static void krping_rlat_test_client(struct krping_cb *cb)
1746 {
1747         struct ib_send_wr *bad_wr;
1748         struct ib_wc wc;
1749         int ret;
1750
1751         cb->state = RDMA_READ_ADV;
1752
1753         /* Send STAG/TO/Len to client */
1754         krping_format_send(cb, cb->start_dma_addr);
1755         if (cb->state == ERROR) {
1756                 PRINTF(cb, "krping_format_send failed\n");
1757                 return;
1758         }
1759         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1760         if (ret) {
1761                 PRINTF(cb, "post send error %d\n", ret);
1762                 return;
1763         }
1764
1765         /* Spin waiting for send completion */
1766         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1767         if (ret < 0) {
1768                 PRINTF(cb, "poll error %d\n", ret);
1769                 return;
1770         }
1771         if (wc.status) {
1772                 PRINTF(cb, "send completion error %d\n", wc.status);
1773                 return;
1774         }
1775
1776         /* Spin waiting for server's Start STAG/TO/Len */
1777         while (cb->state < RDMA_WRITE_ADV) {
1778                 krping_cq_event_handler(cb->cq, cb);
1779         }
1780
1781 #if 0
1782 {
1783         int i;
1784         struct timeval start, stop;
1785         time_t sec;
1786         suseconds_t usec;
1787         unsigned long long elapsed;
1788         struct ib_wc wc;
1789         struct ib_send_wr *bad_wr;
1790         int ne;
1791         
1792         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1793         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1794         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1795         cb->rdma_sq_wr.sg_list->length = 0;
1796         cb->rdma_sq_wr.num_sge = 0;
1797
1798         microtime(&start);
1799         for (i=0; i < 100000; i++) {
1800                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1801                         PRINTF(cb, "Couldn't post send\n");
1802                         return;
1803                 }
1804                 do {
1805                         ne = ib_poll_cq(cb->cq, 1, &wc);
1806                 } while (ne == 0);
1807                 if (ne < 0) {
1808                         PRINTF(cb, "poll CQ failed %d\n", ne);
1809                         return;
1810                 }
1811                 if (wc.status != IB_WC_SUCCESS) {
1812                         PRINTF(cb, "Completion wth error at %s:\n",
1813                                 cb->server ? "server" : "client");
1814                         PRINTF(cb, "Failed status %d: wr_id %d\n",
1815                                 wc.status, (int) wc.wr_id);
1816                         return;
1817                 }
1818         }
1819         microtime(&stop);
1820         
1821         if (stop.tv_usec < start.tv_usec) {
1822                 stop.tv_usec += 1000000;
1823                 stop.tv_sec  -= 1;
1824         }
1825         sec     = stop.tv_sec - start.tv_sec;
1826         usec    = stop.tv_usec - start.tv_usec;
1827         elapsed = sec * 1000000 + usec;
1828         PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1829 }
1830 #endif
1831
1832         rlat_test(cb);
1833 }
1834
1835 static void krping_wlat_test_client(struct krping_cb *cb)
1836 {
1837         struct ib_send_wr *bad_wr;
1838         struct ib_wc wc;
1839         int ret;
1840
1841         cb->state = RDMA_READ_ADV;
1842
1843         /* Send STAG/TO/Len to client */
1844         krping_format_send(cb, cb->start_dma_addr);
1845         if (cb->state == ERROR) {
1846                 PRINTF(cb, "krping_format_send failed\n");
1847                 return;
1848         }
1849         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1850         if (ret) {
1851                 PRINTF(cb, "post send error %d\n", ret);
1852                 return;
1853         }
1854
1855         /* Spin waiting for send completion */
1856         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1857         if (ret < 0) {
1858                 PRINTF(cb, "poll error %d\n", ret);
1859                 return;
1860         }
1861         if (wc.status) {
1862                 PRINTF(cb, "send completion error %d\n", wc.status);
1863                 return;
1864         }
1865
1866         /* Spin waiting for server's Start STAG/TO/Len */
1867         while (cb->state < RDMA_WRITE_ADV) {
1868                 krping_cq_event_handler(cb->cq, cb);
1869         }
1870
1871         wlat_test(cb);
1872 }
1873
1874 static void krping_bw_test_client(struct krping_cb *cb)
1875 {
1876         struct ib_send_wr *bad_wr;
1877         struct ib_wc wc;
1878         int ret;
1879
1880         cb->state = RDMA_READ_ADV;
1881
1882         /* Send STAG/TO/Len to client */
1883         krping_format_send(cb, cb->start_dma_addr);
1884         if (cb->state == ERROR) {
1885                 PRINTF(cb, "krping_format_send failed\n");
1886                 return;
1887         }
1888         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1889         if (ret) {
1890                 PRINTF(cb, "post send error %d\n", ret);
1891                 return;
1892         }
1893
1894         /* Spin waiting for send completion */
1895         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1896         if (ret < 0) {
1897                 PRINTF(cb, "poll error %d\n", ret);
1898                 return;
1899         }
1900         if (wc.status) {
1901                 PRINTF(cb, "send completion error %d\n", wc.status);
1902                 return;
1903         }
1904
1905         /* Spin waiting for server's Start STAG/TO/Len */
1906         while (cb->state < RDMA_WRITE_ADV) {
1907                 krping_cq_event_handler(cb->cq, cb);
1908         }
1909
1910         bw_test(cb);
1911 }
1912
1913 static void krping_fr_test(struct krping_cb *cb)
1914 {
1915         struct ib_fast_reg_page_list *pl;
1916         struct ib_send_wr fr, inv, *bad;
1917         struct ib_wc wc;
1918         u8 key = 0;
1919         struct ib_mr *mr;
1920         int i;
1921         int ret;
1922         int size = cb->size;
1923         int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1924         time_t start;
1925         int count = 0;
1926         int scnt = 0;
1927
1928         pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1929         if (IS_ERR(pl)) {
1930                 PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1931                 return;
1932         }
1933         
1934         mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1935         if (IS_ERR(mr)) {
1936                 PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
1937                 goto err1;
1938         }
1939
1940         for (i=0; i<plen; i++)
1941                 pl->page_list[i] = 0xcafebabe | i;
1942         
1943         memset(&fr, 0, sizeof fr);
1944         fr.opcode = IB_WR_FAST_REG_MR;
1945         fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1946         fr.wr.fast_reg.length = size;
1947         fr.wr.fast_reg.page_list = pl;
1948         fr.wr.fast_reg.page_list_len = plen;
1949         fr.wr.fast_reg.iova_start = 0;
1950         fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1951         fr.next = &inv;
1952         memset(&inv, 0, sizeof inv);
1953         inv.opcode = IB_WR_LOCAL_INV;
1954         inv.send_flags = IB_SEND_SIGNALED;
1955         
1956         DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1957         start = time_uptime;
1958         while (1) {
1959                 if ((time_uptime - start) >= 9) {
1960                         DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1961                         wait_event_interruptible(cb->sem, cb->state == ERROR);
1962                         if (cb->state == ERROR)
1963                                 break;
1964                         start = time_uptime;
1965                 }       
1966                 while (scnt < (cb->txdepth>>1)) {
1967                         ib_update_fast_reg_key(mr, ++key);
1968                         fr.wr.fast_reg.rkey = mr->rkey;
1969                         inv.ex.invalidate_rkey = mr->rkey;
1970                         size = arc4random() % cb->size;
1971                         if (size == 0)
1972                                 size = cb->size;
1973                         plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1974                         fr.wr.fast_reg.length = size;
1975                         fr.wr.fast_reg.page_list_len = plen;
1976                         ret = ib_post_send(cb->qp, &fr, &bad);
1977                         if (ret) {
1978                                 PRINTF(cb, "ib_post_send failed %d\n", ret);
1979                                 goto err2;      
1980                         }
1981                         scnt++;
1982                 }
1983
1984                 do {
1985                         ret = ib_poll_cq(cb->cq, 1, &wc);
1986                         if (ret < 0) {
1987                                 PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1988                                 goto err2;      
1989                         }
1990                         if (ret == 1) {
1991                                 if (wc.status) {
1992                                         PRINTF(cb, "completion error %u\n", wc.status);
1993                                         goto err2;
1994                                 }
1995                                 count++;
1996                                 scnt--;
1997                         }
1998                         else if (krping_sigpending()) {
1999                                 PRINTF(cb, "signal!\n");
2000                                 goto err2;
2001                         }
2002                 } while (ret == 1);
2003         }
2004 err2:
2005 #if 0
2006         DEBUG_LOG(cb, "sleeping 1 second\n");
2007         wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2008 #endif
2009         DEBUG_LOG(cb, "draining the cq...\n");
2010         do {
2011                 ret = ib_poll_cq(cb->cq, 1, &wc);
2012                 if (ret < 0) {
2013                         PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2014                         break;
2015                 }
2016                 if (ret == 1) {
2017                         if (wc.status) {
2018                                 PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
2019                         }
2020                 }
2021         } while (ret == 1);
2022         DEBUG_LOG(cb, "fr_test: done!\n");
2023         ib_dereg_mr(mr);
2024 err1:
2025         ib_free_fast_reg_page_list(pl);
2026 }
2027
2028 static int krping_connect_client(struct krping_cb *cb)
2029 {
2030         struct rdma_conn_param conn_param;
2031         int ret;
2032
2033         memset(&conn_param, 0, sizeof conn_param);
2034         conn_param.responder_resources = 1;
2035         conn_param.initiator_depth = 1;
2036         conn_param.retry_count = 10;
2037
2038         ret = rdma_connect(cb->cm_id, &conn_param);
2039         if (ret) {
2040                 PRINTF(cb, "rdma_connect error %d\n", ret);
2041                 return ret;
2042         }
2043
2044         wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2045         if (cb->state == ERROR) {
2046                 PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
2047                 return -1;
2048         }
2049
2050         DEBUG_LOG(cb, "rdma_connect successful\n");
2051         return 0;
2052 }
2053
2054 static int krping_bind_client(struct krping_cb *cb)
2055 {
2056         struct sockaddr_in sin;
2057         int ret;
2058
2059         memset(&sin, 0, sizeof(sin));
2060         sin.sin_len = sizeof sin;
2061         sin.sin_family = AF_INET;
2062         sin.sin_addr.s_addr = cb->addr.s_addr;
2063         sin.sin_port = cb->port;
2064
2065         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2066                                 2000);
2067         if (ret) {
2068                 PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2069                 return ret;
2070         }
2071
2072         wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2073         if (cb->state != ROUTE_RESOLVED) {
2074                 PRINTF(cb, 
2075                        "addr/route resolution did not resolve: state %d\n",
2076                        cb->state);
2077                 return -EINTR;
2078         }
2079
2080         if (cb->mem == FASTREG && !fastreg_supported(cb))
2081                 return -EINVAL;
2082
2083         DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2084         return 0;
2085 }
2086
2087 static void krping_run_client(struct krping_cb *cb)
2088 {
2089         struct ib_recv_wr *bad_wr;
2090         int ret;
2091
2092         ret = krping_bind_client(cb);
2093         if (ret)
2094                 return;
2095
2096         ret = krping_setup_qp(cb, cb->cm_id);
2097         if (ret) {
2098                 PRINTF(cb, "setup_qp failed: %d\n", ret);
2099                 return;
2100         }
2101
2102         ret = krping_setup_buffers(cb);
2103         if (ret) {
2104                 PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2105                 goto err1;
2106         }
2107
2108         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2109         if (ret) {
2110                 PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2111                 goto err2;
2112         }
2113
2114         ret = krping_connect_client(cb);
2115         if (ret) {
2116                 PRINTF(cb, "connect error %d\n", ret);
2117                 goto err2;
2118         }
2119
2120         if (cb->wlat)
2121                 krping_wlat_test_client(cb);
2122         else if (cb->rlat)
2123                 krping_rlat_test_client(cb);
2124         else if (cb->bw)
2125                 krping_bw_test_client(cb);
2126         else if (cb->frtest)
2127                 krping_fr_test(cb);
2128         else
2129                 krping_test_client(cb);
2130         rdma_disconnect(cb->cm_id);
2131 err2:
2132         krping_free_buffers(cb);
2133 err1:
2134         krping_free_qp(cb);
2135 }
2136
2137 int krping_doit(char *cmd, void *cookie)
2138 {
2139         struct krping_cb *cb;
2140         int op;
2141         int ret = 0;
2142         char *optarg;
2143         unsigned long optint;
2144
2145         cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2146         if (!cb)
2147                 return -ENOMEM;
2148
2149         mutex_lock(&krping_mutex);
2150         list_add_tail(&cb->list, &krping_cbs);
2151         mutex_unlock(&krping_mutex);
2152
2153         cb->cookie = cookie;
2154         cb->server = -1;
2155         cb->state = IDLE;
2156         cb->size = 64;
2157         cb->txdepth = RPING_SQ_DEPTH;
2158         cb->mem = DMA;
2159         init_waitqueue_head(&cb->sem);
2160
2161         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2162                               &optint)) != 0) {
2163                 switch (op) {
2164                 case 'a':
2165                         cb->addr_str = optarg;
2166                         DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
2167                         if (!inet_aton(optarg, &cb->addr)) {
2168                                 PRINTF(cb, "bad addr string %s\n",
2169                                     optarg);
2170                                 ret = EINVAL;
2171                         }
2172                         break;
2173                 case 'p':
2174                         cb->port = htons(optint);
2175                         DEBUG_LOG(cb, "port %d\n", (int)optint);
2176                         break;
2177                 case 'P':
2178                         cb->poll = 1;
2179                         DEBUG_LOG(cb, "server\n");
2180                         break;
2181                 case 's':
2182                         cb->server = 1;
2183                         DEBUG_LOG(cb, "server\n");
2184                         break;
2185                 case 'c':
2186                         cb->server = 0;
2187                         DEBUG_LOG(cb, "client\n");
2188                         break;
2189                 case 'S':
2190                         cb->size = optint;
2191                         if ((cb->size < 1) ||
2192                             (cb->size > RPING_BUFSIZE)) {
2193                                 PRINTF(cb, "Invalid size %d "
2194                                        "(valid range is 1 to %d)\n",
2195                                        cb->size, RPING_BUFSIZE);
2196                                 ret = EINVAL;
2197                         } else
2198                                 DEBUG_LOG(cb, "size %d\n", (int)optint);
2199                         break;
2200                 case 'C':
2201                         cb->count = optint;
2202                         if (cb->count < 0) {
2203                                 PRINTF(cb, "Invalid count %d\n",
2204                                         cb->count);
2205                                 ret = EINVAL;
2206                         } else
2207                                 DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2208                         break;
2209                 case 'v':
2210                         cb->verbose++;
2211                         DEBUG_LOG(cb, "verbose\n");
2212                         break;
2213                 case 'V':
2214                         cb->validate++;
2215                         DEBUG_LOG(cb, "validate data\n");
2216                         break;
2217                 case 'l':
2218                         cb->wlat++;
2219                         break;
2220                 case 'L':
2221                         cb->rlat++;
2222                         break;
2223                 case 'B':
2224                         cb->bw++;
2225                         break;
2226                 case 'd':
2227                         cb->duplex++;
2228                         break;
2229                 case 'm':
2230                         if (!strncmp(optarg, "dma", 3))
2231                                 cb->mem = DMA;
2232                         else if (!strncmp(optarg, "fastreg", 7))
2233                                 cb->mem = FASTREG;
2234                         else if (!strncmp(optarg, "mw", 2))
2235                                 cb->mem = MW;
2236                         else if (!strncmp(optarg, "mr", 2))
2237                                 cb->mem = MR;
2238                         else {
2239                                 PRINTF(cb, "unknown mem mode %s.  "
2240                                         "Must be dma, fastreg, mw, or mr\n",
2241                                         optarg);
2242                                 ret = -EINVAL;
2243                                 break;
2244                         }
2245                         break;
2246                 case 'I':
2247                         cb->server_invalidate = 1;
2248                         break;
2249                 case 'T':
2250                         cb->txdepth = optint;
2251                         DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2252                         break;
2253                 case 'Z':
2254                         cb->local_dma_lkey = 1;
2255                         DEBUG_LOG(cb, "using local dma lkey\n");
2256                         break;
2257                 case 'R':
2258                         cb->read_inv = 1;
2259                         DEBUG_LOG(cb, "using read-with-inv\n");
2260                         break;
2261                 case 'f':
2262                         cb->frtest = 1;
2263                         DEBUG_LOG(cb, "fast-reg test!\n");
2264                         break;
2265                 default:
2266                         PRINTF(cb, "unknown opt %s\n", optarg);
2267                         ret = -EINVAL;
2268                         break;
2269                 }
2270         }
2271         if (ret)
2272                 goto out;
2273
2274         if (cb->server == -1) {
2275                 PRINTF(cb, "must be either client or server\n");
2276                 ret = -EINVAL;
2277                 goto out;
2278         }
2279
2280         if (cb->server && cb->frtest) {
2281                 PRINTF(cb, "must be client to run frtest\n");
2282                 ret = -EINVAL;
2283                 goto out;
2284         }
2285
2286         if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2287                 PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2288                 ret = -EINVAL;
2289                 goto out;
2290         }
2291
2292         if (cb->server_invalidate && cb->mem != FASTREG) {
2293                 PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2294                 ret = -EINVAL;
2295                 goto out;
2296         }
2297
2298         if (cb->read_inv && cb->mem != FASTREG) {
2299                 PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2300                 ret = -EINVAL;
2301                 goto out;
2302         }
2303
2304         if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2305                 PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2306                 ret = -EINVAL;
2307                 goto out;
2308         }
2309
2310         cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
2311         if (IS_ERR(cb->cm_id)) {
2312                 ret = PTR_ERR(cb->cm_id);
2313                 PRINTF(cb, "rdma_create_id error %d\n", ret);
2314                 goto out;
2315         }
2316         DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2317
2318         if (cb->server)
2319                 krping_run_server(cb);
2320         else
2321                 krping_run_client(cb);
2322
2323         DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2324         rdma_destroy_id(cb->cm_id);
2325 out:
2326         mutex_lock(&krping_mutex);
2327         list_del(&cb->list);
2328         mutex_unlock(&krping_mutex);
2329         kfree(cb);
2330         return ret;
2331 }
2332
2333 void
2334 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2335 {
2336         struct krping_cb *cb;
2337
2338         mutex_lock(&krping_mutex);
2339         list_for_each_entry(cb, &krping_cbs, list)
2340             (*f)(cb->pd ? &cb->stats : NULL, arg);
2341         mutex_unlock(&krping_mutex);
2342 }
2343
2344 void krping_init(void)
2345 {
2346
2347         mutex_init(&krping_mutex);
2348 }