]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - sys/contrib/rdma/krping/krping.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/init.h>
40 #include <linux/slab.h>
41 #include <linux/err.h>
42 #include <linux/string.h>
43 #include <linux/inet.h>
44 #include <linux/list.h>
45 #include <linux/in.h>
46 #include <linux/device.h>
47 #include <linux/pci.h>
48 #include <linux/sched.h>
49 #include <asm/system.h>
50
51 #include <asm/atomic.h>
52
53 #include <rdma/ib_verbs.h>
54 #include <rdma/rdma_cm.h>
55
56 #include "krping.h"
57 #include "getopt.h"
58
extern int krping_debug;

/*
 * Logging helpers.  Both forward a printf-style format plus arguments to
 * krping_printf() together with the control block's cookie.
 *
 * DEBUG_LOG only produces output when krping_debug is non-zero.  It is
 * wrapped in do { } while (0) so that it always behaves as one statement
 * and cannot capture the "else" of an enclosing unbraced if.
 */
#define DEBUG_LOG(cb, x...) \
        do { if (krping_debug) krping_printf((cb)->cookie, x); } while (0)
#define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
62
/* Module metadata: author, one-line description and license string. */
MODULE_AUTHOR("Steve Wise");
MODULE_DESCRIPTION("RDMA ping client/server");
MODULE_LICENSE("Dual BSD/GPL");
66
/*
 * Read the CPU time-stamp counter with the x86 "rdtsc" instruction and
 * return it as one 64-bit value.  The "=d" output (EDX) supplies the upper
 * 32 bits and the "=a" output (EAX) the lower 32 bits.
 */
static __inline uint64_t
get_cycles(void)
{
        uint32_t low, high;

        __asm __volatile("rdtsc" : "=a" (low), "=d" (high));
        /* Widen before shifting so the upper half is not lost. */
        return (low | ((uint64_t)high << 32));
}

/* Type used for values returned by get_cycles(). */
typedef uint64_t cycles_t;
76
/*
 * Memory registration mode selected for a run (stored in krping_cb.mem).
 * Numbering starts at 1, so a zeroed field matches none of the modes.
 */
enum mem_type {
        DMA = 1,
        FASTREG,        /* 2 */
        MW,             /* 3 */
        MR              /* 4 */
};
83
/*
 * Table of recognised options.  Each entry holds the option name, an
 * argument kind (OPT_INT, OPT_STRING or OPT_NOPARAM) and a one-character
 * code; the list ends with an all-zero entry.
 * NOTE(review): struct krping_option and the OPT_* constants are not
 * defined in this file (presumably in getopt.h) -- confirm field meanings
 * there.
 */
static const struct krping_option krping_opts[] = {
        {"count", OPT_INT, 'C'},
        {"size", OPT_INT, 'S'},
        {"addr", OPT_STRING, 'a'},
        {"port", OPT_INT, 'p'},
        {"verbose", OPT_NOPARAM, 'v'},
        {"validate", OPT_NOPARAM, 'V'},
        {"server", OPT_NOPARAM, 's'},
        {"client", OPT_NOPARAM, 'c'},
        {"mem_mode", OPT_STRING, 'm'},
        {"server_inv", OPT_NOPARAM, 'I'},
        {"wlat", OPT_NOPARAM, 'l'},
        {"rlat", OPT_NOPARAM, 'L'},
        {"bw", OPT_NOPARAM, 'B'},
        {"duplex", OPT_NOPARAM, 'd'},
        {"txdepth", OPT_INT, 'T'},
        {"poll", OPT_NOPARAM, 'P'},
        {"local_dma_lkey", OPT_NOPARAM, 'Z'},
        {"read_inv", OPT_NOPARAM, 'R'},
        {"fr", OPT_NOPARAM, 'f'},
        {NULL, 0, 0}    /* terminator */
};
106
/*
 * 64-bit counterparts of htonl()/ntohl(): host -> big-endian (network)
 * order and big-endian -> host order respectively.  Each direction uses
 * the conversion helper that matches it.
 */
#define htonll(x) cpu_to_be64((x))
#define ntohll(x) be64_to_cpu((x))
109
/*
 * NOTE(review): presumably serializes access to the krping_cbs list
 * below -- its users are outside this section; confirm.
 */
static struct mutex krping_mutex;

/*
 * List of running krping threads.
 */
static LIST_HEAD(krping_cbs);
116
117 /*
118  * krping "ping/pong" loop:
119  *      client sends source rkey/addr/len
 *      server receives source rkey/addr/len
121  *      server rdma reads "ping" data from source
122  *      server sends "go ahead" on rdma read completion
123  *      client sends sink rkey/addr/len
124  *      server receives sink rkey/addr/len
125  *      server rdma writes "pong" data to sink
126  *      server sends "go ahead" on rdma write completion
127  *      <repeat loop>
128  */
129
/*
 * Test states, used to signal events between the CM/completion handlers
 * and the main client or server thread.
 *
 * Once CONNECTED, a ping cycles through RDMA_READ_ADV, RDMA_WRITE_ADV,
 * and RDMA_WRITE_COMPLETE.
 *
 * The numeric order is significant: code in this file compares states
 * with <= and >= against CONNECTED, and ERROR must remain the largest
 * value.
 */
enum test_state {
        IDLE                    = 1,
        CONNECT_REQUEST         = 2,
        ADDR_RESOLVED           = 3,
        ROUTE_RESOLVED          = 4,
        CONNECTED               = 5,
        RDMA_READ_ADV           = 6,
        RDMA_READ_COMPLETE      = 7,
        RDMA_WRITE_ADV          = 8,
        RDMA_WRITE_COMPLETE     = 9,
        ERROR                   = 10
};
149
/*
 * Buffer advertisement exchanged in send/recv messages.  server_recv()
 * decodes the fields with ntohll()/ntohl(), so they travel in network
 * byte order.
 */
struct krping_rdma_info {
        uint64_t buf;           /* buffer address (stored as remote_addr) */
        uint32_t rkey;          /* key for the buffer (stored as remote_rkey) */
        uint32_t size;          /* buffer length (stored as remote_len) */
};
155
/*
 * Default max buffer size for IO...
 *
 * The expression is parenthesized so the macro expands safely inside
 * larger expressions (e.g. "x / RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128*1024)
/* NOTE(review): presumably the default send-queue depth -- confirm at the use site. */
#define RPING_SQ_DEPTH 64
161
/*
 * Control block struct.
 *
 * Holds the verbs objects, buffers, DMA mappings, test options and
 * connection-manager ids used by the functions in this file.  The cleanup
 * paths test pointer members against NULL, so the structure is expected to
 * start out zeroed.
 */
struct krping_cb {
        void *cookie;                   /* handed to krping_printf() by the logging macros */
        int server;                     /* 0 iff client */
        struct ib_cq *cq;               /* used as both send and recv CQ of the QP */
        struct ib_pd *pd;
        struct ib_qp *qp;

        enum mem_type mem;              /* memory registration mode */
        struct ib_mr *dma_mr;           /* only set when mem == DMA */

        /* fast-register (mem == FASTREG) state */
        struct ib_fast_reg_page_list *page_list;
        int page_list_len;
        struct ib_send_wr fastreg_wr;
        struct ib_send_wr invalidate_wr;
        struct ib_mr *fastreg_mr;
        int server_invalidate;
        int read_inv;
        u8 key;

        /* memory-window (mem == MW) state */
        struct ib_mw *mw;
        struct ib_mw_bind bind_attr;

        struct ib_recv_wr rq_wr;        /* recv work request record */
        struct ib_sge recv_sgl;         /* recv single SGE */
        struct krping_rdma_info recv_buf;/* embedded recv buffer, DMA-mapped in krping_setup_buffers() */
        u64 recv_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(recv_mapping)
        struct ib_mr *recv_mr;

        struct ib_send_wr sq_wr;        /* send work request record */
        struct ib_sge send_sgl;
        struct krping_rdma_info send_buf;/* single send buf */
        u64 send_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(send_mapping)
        struct ib_mr *send_mr;

        struct ib_send_wr rdma_sq_wr;   /* rdma work request record */
        struct ib_sge rdma_sgl;         /* rdma single SGE */
        char *rdma_buf;                 /* used as rdma sink */
        u64  rdma_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
        struct ib_mr *rdma_mr;

        /* values decoded from the peer's advertisement in server_recv() */
        uint32_t remote_rkey;           /* remote guys RKEY */
        uint64_t remote_addr;           /* remote guys TO */
        uint32_t remote_len;            /* remote guys LEN */

        char *start_buf;                /* rdma read src */
        u64  start_dma_addr;
        DECLARE_PCI_UNMAP_ADDR(start_mapping)
        struct ib_mr *start_mr;

        enum test_state state;          /* used for cond/signalling */
        wait_queue_head_t sem;          /* woken by the CM and CQ handlers */
        struct krping_stats stats;

        uint16_t port;                  /* dst port in NBO */
        struct in_addr addr;            /* dst addr in NBO */
        char *addr_str;                 /* dst addr string */
        int verbose;                    /* verbose logging */
        int count;                      /* ping count */
        int size;                       /* ping data size */
        int validate;                   /* validate ping data */
        int wlat;                       /* run wlat test */
        int rlat;                       /* run rlat test */
        int bw;                         /* run bw test */
        int duplex;                     /* run bw full duplex test */
        int poll;                       /* poll or block for rlat test */
        int txdepth;                    /* SQ depth */
        int local_dma_lkey;             /* use 0 for lkey */
        int frtest;                     /* fastreg test */

        /* CM stuff */
        struct rdma_cm_id *cm_id;       /* connection on client side,*/
                                        /* listener on server side. */
        struct rdma_cm_id *child_cm_id; /* connection on server side */
        struct list_head list;
};
243
244 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
245                                    struct rdma_cm_event *event)
246 {
247         int ret;
248         struct krping_cb *cb = cma_id->context;
249
250         DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
251             cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
252
253         switch (event->event) {
254         case RDMA_CM_EVENT_ADDR_RESOLVED:
255                 cb->state = ADDR_RESOLVED;
256                 ret = rdma_resolve_route(cma_id, 2000);
257                 if (ret) {
258                         PRINTF(cb, "rdma_resolve_route error %d\n", ret);
259                         wake_up_interruptible(&cb->sem);
260                 }
261                 break;
262
263         case RDMA_CM_EVENT_ROUTE_RESOLVED:
264                 cb->state = ROUTE_RESOLVED;
265                 wake_up_interruptible(&cb->sem);
266                 break;
267
268         case RDMA_CM_EVENT_CONNECT_REQUEST:
269                 cb->state = CONNECT_REQUEST;
270                 cb->child_cm_id = cma_id;
271                 DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
272                 wake_up_interruptible(&cb->sem);
273                 break;
274
275         case RDMA_CM_EVENT_ESTABLISHED:
276                 DEBUG_LOG(cb, "ESTABLISHED\n");
277                 if (!cb->server) {
278                         cb->state = CONNECTED;
279                 }
280                 wake_up_interruptible(&cb->sem);
281                 break;
282
283         case RDMA_CM_EVENT_ADDR_ERROR:
284         case RDMA_CM_EVENT_ROUTE_ERROR:
285         case RDMA_CM_EVENT_CONNECT_ERROR:
286         case RDMA_CM_EVENT_UNREACHABLE:
287         case RDMA_CM_EVENT_REJECTED:
288                 PRINTF(cb, "cma event %d, error %d\n", event->event,
289                        event->status);
290                 cb->state = ERROR;
291                 wake_up_interruptible(&cb->sem);
292                 break;
293
294         case RDMA_CM_EVENT_DISCONNECTED:
295                 PRINTF(cb, "DISCONNECT EVENT...\n");
296                 cb->state = ERROR;
297                 wake_up_interruptible(&cb->sem);
298                 break;
299
300         case RDMA_CM_EVENT_DEVICE_REMOVAL:
301                 PRINTF(cb, "cma detected device removal!!!!\n");
302                 break;
303
304         default:
305                 PRINTF(cb, "oof bad type!\n");
306                 wake_up_interruptible(&cb->sem);
307                 break;
308         }
309         return 0;
310 }
311
312 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
313 {
314         if (wc->byte_len != sizeof(cb->recv_buf)) {
315                 PRINTF(cb, "Received bogus data, size %d\n", 
316                        wc->byte_len);
317                 return -1;
318         }
319
320         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
321         cb->remote_addr = ntohll(cb->recv_buf.buf);
322         cb->remote_len  = ntohl(cb->recv_buf.size);
323         DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
324                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
325                   cb->remote_len);
326
327         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
328                 cb->state = RDMA_READ_ADV;
329         else
330                 cb->state = RDMA_WRITE_ADV;
331
332         return 0;
333 }
334
335 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
336 {
337         if (wc->byte_len != sizeof(cb->recv_buf)) {
338                 PRINTF(cb, "Received bogus data, size %d\n", 
339                        wc->byte_len);
340                 return -1;
341         }
342
343         if (cb->state == RDMA_READ_ADV)
344                 cb->state = RDMA_WRITE_ADV;
345         else
346                 cb->state = RDMA_WRITE_COMPLETE;
347
348         return 0;
349 }
350
351 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
352 {
353         struct krping_cb *cb = ctx;
354         struct ib_wc wc;
355         struct ib_recv_wr *bad_wr;
356         int ret;
357
358         BUG_ON(cb->cq != cq);
359         if (cb->state == ERROR) {
360                 PRINTF(cb, "cq completion in ERROR state\n");
361                 return;
362         }
363         if (cb->frtest) {
364                 PRINTF(cb, "cq completion event in frtest!\n");
365                 return;
366         }
367         if (!cb->wlat && !cb->rlat && !cb->bw)
368                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
369         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
370                 if (wc.status) {
371                         if (wc.status == IB_WC_WR_FLUSH_ERR) {
372                                 DEBUG_LOG(cb, "cq flushed\n");
373                                 continue;
374                         } else {
375                                 PRINTF(cb, "cq completion failed with "
376                                        "wr_id %Lx status %d opcode %d vender_err %x\n",
377                                         wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
378                                 goto error;
379                         }
380                 }
381
382                 switch (wc.opcode) {
383                 case IB_WC_SEND:
384                         DEBUG_LOG(cb, "send completion\n");
385                         cb->stats.send_bytes += cb->send_sgl.length;
386                         cb->stats.send_msgs++;
387                         break;
388
389                 case IB_WC_RDMA_WRITE:
390                         DEBUG_LOG(cb, "rdma write completion\n");
391                         cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
392                         cb->stats.write_msgs++;
393                         cb->state = RDMA_WRITE_COMPLETE;
394                         wake_up_interruptible(&cb->sem);
395                         break;
396
397                 case IB_WC_RDMA_READ:
398                         DEBUG_LOG(cb, "rdma read completion\n");
399                         cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
400                         cb->stats.read_msgs++;
401                         cb->state = RDMA_READ_COMPLETE;
402                         wake_up_interruptible(&cb->sem);
403                         break;
404
405                 case IB_WC_RECV:
406                         DEBUG_LOG(cb, "recv completion\n");
407                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
408                         cb->stats.recv_msgs++;
409                         if (cb->wlat || cb->rlat || cb->bw)
410                                 ret = server_recv(cb, &wc);
411                         else
412                                 ret = cb->server ? server_recv(cb, &wc) :
413                                                    client_recv(cb, &wc);
414                         if (ret) {
415                                 PRINTF(cb, "recv wc error: %d\n", ret);
416                                 goto error;
417                         }
418
419                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
420                         if (ret) {
421                                 PRINTF(cb, "post recv error: %d\n", 
422                                        ret);
423                                 goto error;
424                         }
425                         wake_up_interruptible(&cb->sem);
426                         break;
427
428                 default:
429                         PRINTF(cb, 
430                                "%s:%d Unexpected opcode %d, Shutting down\n",
431                                __func__, __LINE__, wc.opcode);
432                         goto error;
433                 }
434         }
435         if (ret) {
436                 PRINTF(cb, "poll error %d\n", ret);
437                 goto error;
438         }
439         return;
440 error:
441         cb->state = ERROR;
442         wake_up_interruptible(&cb->sem);
443 }
444
445 static int krping_accept(struct krping_cb *cb)
446 {
447         struct rdma_conn_param conn_param;
448         int ret;
449
450         DEBUG_LOG(cb, "accepting client connection request\n");
451
452         memset(&conn_param, 0, sizeof conn_param);
453         conn_param.responder_resources = 1;
454         conn_param.initiator_depth = 1;
455
456         ret = rdma_accept(cb->child_cm_id, &conn_param);
457         if (ret) {
458                 PRINTF(cb, "rdma_accept error: %d\n", ret);
459                 return ret;
460         }
461
462         if (!cb->wlat && !cb->rlat && !cb->bw) {
463                 wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
464                 if (cb->state == ERROR) {
465                         PRINTF(cb, "wait for CONNECTED state %d\n", 
466                                 cb->state);
467                         return -1;
468                 }
469         }
470         return 0;
471 }
472
473 static void krping_setup_wr(struct krping_cb *cb)
474 {
475         cb->recv_sgl.addr = cb->recv_dma_addr;
476         cb->recv_sgl.length = sizeof cb->recv_buf;
477         if (cb->local_dma_lkey)
478                 cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
479         else if (cb->mem == DMA)
480                 cb->recv_sgl.lkey = cb->dma_mr->lkey;
481         else
482                 cb->recv_sgl.lkey = cb->recv_mr->lkey;
483         cb->rq_wr.sg_list = &cb->recv_sgl;
484         cb->rq_wr.num_sge = 1;
485
486         cb->send_sgl.addr = cb->send_dma_addr;
487         cb->send_sgl.length = sizeof cb->send_buf;
488         if (cb->local_dma_lkey)
489                 cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
490         else if (cb->mem == DMA)
491                 cb->send_sgl.lkey = cb->dma_mr->lkey;
492         else
493                 cb->send_sgl.lkey = cb->send_mr->lkey;
494
495         cb->sq_wr.opcode = IB_WR_SEND;
496         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
497         cb->sq_wr.sg_list = &cb->send_sgl;
498         cb->sq_wr.num_sge = 1;
499
500         if (cb->server || cb->wlat || cb->rlat || cb->bw) {
501                 cb->rdma_sgl.addr = cb->rdma_dma_addr;
502                 if (cb->mem == MR)
503                         cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
504                 cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
505                 cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
506                 cb->rdma_sq_wr.num_sge = 1;
507         }
508
509         switch(cb->mem) {
510         case FASTREG:
511
512                 /* 
513                  * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR.
514                  * both unsignaled.  The client uses them to reregister
515                  * the rdma buffers with a new key each iteration.
516                  */
517                 cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
518                 cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
519                 cb->fastreg_wr.wr.fast_reg.length = cb->size;
520                 cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
521                 cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
522
523                 cb->invalidate_wr.next = &cb->fastreg_wr;
524                 cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
525                 break;
526         case MW:
527                 cb->bind_attr.wr_id = 0xabbaabba;
528                 cb->bind_attr.send_flags = 0; /* unsignaled */
529                 cb->bind_attr.length = cb->size;
530                 break;
531         default:
532                 break;
533         }
534 }
535
536 static int krping_setup_buffers(struct krping_cb *cb)
537 {
538         int ret;
539         struct ib_phys_buf buf;
540         u64 iovbase;
541
542         DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
543
544         cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, 
545                                    &cb->recv_buf, 
546                                    sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
547         pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
548         cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, 
549                                            &cb->send_buf, sizeof(cb->send_buf),
550                                            DMA_BIDIRECTIONAL);
551         pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
552
553         if (cb->mem == DMA) {
554                 cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
555                                            IB_ACCESS_REMOTE_READ|
556                                            IB_ACCESS_REMOTE_WRITE);
557                 if (IS_ERR(cb->dma_mr)) {
558                         DEBUG_LOG(cb, "reg_dmamr failed\n");
559                         ret = PTR_ERR(cb->dma_mr);
560                         goto bail;
561                 }
562         } else {
563                 if (!cb->local_dma_lkey) {
564                         buf.addr = cb->recv_dma_addr;
565                         buf.size = sizeof cb->recv_buf;
566                         DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr, 
567                                 (int)buf.size);
568                         iovbase = cb->recv_dma_addr;
569                         cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
570                                                      IB_ACCESS_LOCAL_WRITE, 
571                                                      &iovbase);
572
573                         if (IS_ERR(cb->recv_mr)) {
574                                 DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
575                                 ret = PTR_ERR(cb->recv_mr);
576                                 goto bail;
577                         }
578
579                         buf.addr = cb->send_dma_addr;
580                         buf.size = sizeof cb->send_buf;
581                         DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr, 
582                                 (int)buf.size);
583                         iovbase = cb->send_dma_addr;
584                         cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
585                                                      0, &iovbase);
586
587                         if (IS_ERR(cb->send_mr)) {
588                                 DEBUG_LOG(cb, "send_buf reg_mr failed\n");
589                                 ret = PTR_ERR(cb->send_mr);
590                                 goto bail;
591                         }
592                 }
593         }
594
595         cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
596         if (!cb->rdma_buf) {
597                 DEBUG_LOG(cb, "rdma_buf malloc failed\n");
598                 ret = -ENOMEM;
599                 goto bail;
600         }
601
602         cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, 
603                                cb->rdma_buf, cb->size, 
604                                DMA_BIDIRECTIONAL);
605         pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
606         if (cb->mem != DMA) {
607                 switch (cb->mem) {
608                 case FASTREG:
609                         cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
610                                 PAGE_SIZE) >> PAGE_SHIFT;
611                         cb->page_list = ib_alloc_fast_reg_page_list(
612                                                 cb->pd->device, 
613                                                 cb->page_list_len);
614                         if (IS_ERR(cb->page_list)) {
615                                 DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
616                                 ret = PTR_ERR(cb->page_list);
617                                 goto bail;
618                         }
619                         cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, 
620                                         cb->page_list->max_page_list_len);
621                         if (IS_ERR(cb->fastreg_mr)) {
622                                 DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
623                                 ret = PTR_ERR(cb->fastreg_mr);
624                                 goto bail;
625                         }
626                         DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
627                                 " page_list_len %u\n", cb->fastreg_mr->rkey, 
628                                 cb->page_list, cb->page_list_len);
629                         break;
630                 case MW:
631                         cb->mw = ib_alloc_mw(cb->pd);
632                         if (IS_ERR(cb->mw)) {
633                                 DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
634                                 ret = PTR_ERR(cb->mw);
635                                 goto bail;
636                         }
637                         DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
638                         /*FALLTHROUGH*/
639                 case MR:
640                         buf.addr = cb->rdma_dma_addr;
641                         buf.size = cb->size;
642                         iovbase = cb->rdma_dma_addr;
643                         cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
644                                              IB_ACCESS_REMOTE_READ| 
645                                              IB_ACCESS_REMOTE_WRITE, 
646                                              &iovbase);
647                         if (IS_ERR(cb->rdma_mr)) {
648                                 DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
649                                 ret = PTR_ERR(cb->rdma_mr);
650                                 goto bail;
651                         }
652                         DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n", 
653                                 buf.addr, (int)buf.size, cb->rdma_mr->rkey);
654                         break;
655                 default:
656                         ret = -EINVAL;
657                         goto bail;
658                         break;
659                 }
660         }
661
662         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
663
664                 cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
665                 if (!cb->start_buf) {
666                         DEBUG_LOG(cb, "start_buf malloc failed\n");
667                         ret = -ENOMEM;
668                         goto bail;
669                 }
670
671                 cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, 
672                                                    cb->start_buf, cb->size, 
673                                                    DMA_BIDIRECTIONAL);
674                 pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
675
676                 if (cb->mem == MR || cb->mem == MW) {
677                         unsigned flags = IB_ACCESS_REMOTE_READ;
678
679                         if (cb->wlat || cb->rlat || cb->bw)
680                                 flags |= IB_ACCESS_REMOTE_WRITE;
681
682                         buf.addr = cb->start_dma_addr;
683                         buf.size = cb->size;
684                         DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n", 
685                                 buf.addr, (int)buf.size);
686                         iovbase = cb->start_dma_addr;
687                         cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
688                                              flags,
689                                              &iovbase);
690
691                         if (IS_ERR(cb->start_mr)) {
692                                 DEBUG_LOG(cb, "start_buf reg_mr failed\n");
693                                 ret = PTR_ERR(cb->start_mr);
694                                 goto bail;
695                         }
696                 }
697         }
698
699         krping_setup_wr(cb);
700         DEBUG_LOG(cb, "allocated & registered buffers...\n");
701         return 0;
702 bail:
703         if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
704                 ib_dereg_mr(cb->fastreg_mr);
705         if (cb->mw && !IS_ERR(cb->mw))
706                 ib_dealloc_mw(cb->mw);
707         if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
708                 ib_dereg_mr(cb->rdma_mr);
709         if (cb->page_list && !IS_ERR(cb->page_list))
710                 ib_free_fast_reg_page_list(cb->page_list);
711         if (cb->dma_mr && !IS_ERR(cb->dma_mr))
712                 ib_dereg_mr(cb->dma_mr);
713         if (cb->recv_mr && !IS_ERR(cb->recv_mr))
714                 ib_dereg_mr(cb->recv_mr);
715         if (cb->send_mr && !IS_ERR(cb->send_mr))
716                 ib_dereg_mr(cb->send_mr);
717         if (cb->rdma_buf)
718                 kfree(cb->rdma_buf);
719         if (cb->start_buf)
720                 kfree(cb->start_buf);
721         return ret;
722 }
723
724 static void krping_free_buffers(struct krping_cb *cb)
725 {
726         DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
727         
728         if (cb->dma_mr)
729                 ib_dereg_mr(cb->dma_mr);
730         if (cb->send_mr)
731                 ib_dereg_mr(cb->send_mr);
732         if (cb->recv_mr)
733                 ib_dereg_mr(cb->recv_mr);
734         if (cb->rdma_mr)
735                 ib_dereg_mr(cb->rdma_mr);
736         if (cb->start_mr)
737                 ib_dereg_mr(cb->start_mr);
738         if (cb->fastreg_mr)
739                 ib_dereg_mr(cb->fastreg_mr);
740         if (cb->mw)
741                 ib_dealloc_mw(cb->mw);
742
743         dma_unmap_single(cb->pd->device->dma_device,
744                          pci_unmap_addr(cb, recv_mapping),
745                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
746         dma_unmap_single(cb->pd->device->dma_device,
747                          pci_unmap_addr(cb, send_mapping),
748                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
749         dma_unmap_single(cb->pd->device->dma_device,
750                          pci_unmap_addr(cb, rdma_mapping),
751                          cb->size, DMA_BIDIRECTIONAL);
752         kfree(cb->rdma_buf);
753         if (cb->start_buf) {
754                 dma_unmap_single(cb->pd->device->dma_device,
755                          pci_unmap_addr(cb, start_mapping),
756                          cb->size, DMA_BIDIRECTIONAL);
757                 kfree(cb->start_buf);
758         }
759 }
760
761 static int krping_create_qp(struct krping_cb *cb)
762 {
763         struct ib_qp_init_attr init_attr;
764         int ret;
765
766         memset(&init_attr, 0, sizeof(init_attr));
767         init_attr.cap.max_send_wr = cb->txdepth;
768         init_attr.cap.max_recv_wr = 2;
769         init_attr.cap.max_recv_sge = 1;
770         init_attr.cap.max_send_sge = 1;
771         init_attr.qp_type = IB_QPT_RC;
772         init_attr.send_cq = cb->cq;
773         init_attr.recv_cq = cb->cq;
774         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
775
776         if (cb->server) {
777                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
778                 if (!ret)
779                         cb->qp = cb->child_cm_id->qp;
780         } else {
781                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
782                 if (!ret)
783                         cb->qp = cb->cm_id->qp;
784         }
785
786         return ret;
787 }
788
789 static void krping_free_qp(struct krping_cb *cb)
790 {
791         ib_destroy_qp(cb->qp);
792         ib_destroy_cq(cb->cq);
793         ib_dealloc_pd(cb->pd);
794 }
795
796 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
797 {
798         int ret;
799         cb->pd = ib_alloc_pd(cm_id->device);
800         if (IS_ERR(cb->pd)) {
801                 PRINTF(cb, "ib_alloc_pd failed\n");
802                 return PTR_ERR(cb->pd);
803         }
804         DEBUG_LOG(cb, "created pd %p\n", cb->pd);
805
806         strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
807
808         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
809                               cb, cb->txdepth * 2, 0);
810         if (IS_ERR(cb->cq)) {
811                 PRINTF(cb, "ib_create_cq failed\n");
812                 ret = PTR_ERR(cb->cq);
813                 goto err1;
814         }
815         DEBUG_LOG(cb, "created cq %p\n", cb->cq);
816
817         if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
818                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
819                 if (ret) {
820                         PRINTF(cb, "ib_create_cq failed\n");
821                         goto err2;
822                 }
823         }
824
825         ret = krping_create_qp(cb);
826         if (ret) {
827                 PRINTF(cb, "krping_create_qp failed: %d\n", ret);
828                 goto err2;
829         }
830         DEBUG_LOG(cb, "created qp %p\n", cb->qp);
831         return 0;
832 err2:
833         ib_destroy_cq(cb->cq);
834 err1:
835         ib_dealloc_pd(cb->pd);
836         return ret;
837 }
838
839 /*
840  * return the (possibly rebound) rkey for the rdma buffer.
841  * FASTREG mode: invalidate and rebind via fastreg wr.
842  * MW mode: rebind the MW.
843  * other modes: just return the mr rkey.
844  */
845 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
846 {
847         u32 rkey = 0xffffffff;
848         u64 p;
849         struct ib_send_wr *bad_wr;
850         int i;
851         int ret;
852
853         switch (cb->mem) {
854         case FASTREG:
855                 cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
856
857                 /*
858                  * Update the fastreg key.
859                  */
860                 ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
861                 cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
862
863                 /*
864                  * Update the fastreg WR with new buf info.
865                  */
866                 if (buf == (u64)cb->start_dma_addr)
867                         cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
868                 else
869                         cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
870                 cb->fastreg_wr.wr.fast_reg.iova_start = buf;
871                 p = (u64)(buf & PAGE_MASK);
872                 for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; 
873                      i++, p += PAGE_SIZE) {
874                         cb->page_list->page_list[i] = p;
875                         DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
876                 }
877
878                 DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
879                         " iova_start %llx page_list_len %u\n",
880                         post_inv,
881                         cb->fastreg_wr.wr.fast_reg.rkey,
882                         cb->fastreg_wr.wr.fast_reg.page_shift,
883                         cb->fastreg_wr.wr.fast_reg.length,
884                         cb->fastreg_wr.wr.fast_reg.iova_start,
885                         cb->fastreg_wr.wr.fast_reg.page_list_len);
886
887                 if (post_inv)
888                         ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
889                 else
890                         ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
891                 if (ret) {
892                         PRINTF(cb, "post send error %d\n", ret);
893                         cb->state = ERROR;
894                 }
895                 rkey = cb->fastreg_mr->rkey;
896                 break;
897         case MW:
898                 /*
899                  * Update the MW with new buf info.
900                  */
901                 if (buf == (u64)cb->start_dma_addr) {
902                         cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
903                         cb->bind_attr.mr = cb->start_mr;
904                 } else {
905                         cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
906                         cb->bind_attr.mr = cb->rdma_mr;
907                 }
908                 cb->bind_attr.addr = buf;
909                 DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
910                         cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
911                 ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
912                 if (ret) {
913                         PRINTF(cb, "bind mw error %d\n", ret);
914                         cb->state = ERROR;
915                 } else
916                         rkey = cb->mw->rkey;
917                 break;
918         case MR:
919                 if (buf == (u64)cb->start_dma_addr)
920                         rkey = cb->start_mr->rkey;
921                 else
922                         rkey = cb->rdma_mr->rkey;
923                 break;
924         case DMA:
925                 rkey = cb->dma_mr->rkey;
926                 break;
927         default:
928                 PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
929                 cb->state = ERROR;
930                 break;
931         }
932         return rkey;
933 }
934
935 static void krping_format_send(struct krping_cb *cb, u64 buf)
936 {
937         struct krping_rdma_info *info = &cb->send_buf;
938         u32 rkey;
939
940         /*
941          * Client side will do fastreg or mw bind before
942          * advertising the rdma buffer.  Server side
943          * sends have no data.
944          */
945         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
946                 rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
947                 info->buf = htonll(buf);
948                 info->rkey = htonl(rkey);
949                 info->size = htonl(cb->size);
950                 DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
951                           (unsigned long long)buf, rkey, cb->size);
952         }
953 }
954
955 static void krping_test_server(struct krping_cb *cb)
956 {
957         struct ib_send_wr *bad_wr, inv;
958         int ret;
959
960         while (1) {
961                 /* Wait for client's Start STAG/TO/Len */
962                 wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
963                 if (cb->state != RDMA_READ_ADV) {
964                         PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
965                                 cb->state);
966                         break;
967                 }
968
969                 DEBUG_LOG(cb, "server received sink adv\n");
970
971                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
972                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
973                 cb->rdma_sq_wr.sg_list->length = cb->remote_len;
974                 cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
975                 cb->rdma_sq_wr.next = NULL;
976
977                 /* Issue RDMA Read. */
978                 if (cb->read_inv)
979                         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
980                 else {
981
982                         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
983                         if (cb->mem == FASTREG) {
984                                 /* 
985                                  * Immediately follow the read with a 
986                                  * fenced LOCAL_INV.
987                                  */
988                                 cb->rdma_sq_wr.next = &inv;
989                                 memset(&inv, 0, sizeof inv);
990                                 inv.opcode = IB_WR_LOCAL_INV;
991                                 inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
992                                 inv.send_flags = IB_SEND_FENCE;
993                         }
994                 }
995
996                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
997                 if (ret) {
998                         PRINTF(cb, "post send error %d\n", ret);
999                         break;
1000                 }
1001                 cb->rdma_sq_wr.next = NULL;
1002
1003                 DEBUG_LOG(cb, "server posted rdma read req \n");
1004
1005                 /* Wait for read completion */
1006                 wait_event_interruptible(cb->sem, 
1007                                          cb->state >= RDMA_READ_COMPLETE);
1008                 if (cb->state != RDMA_READ_COMPLETE) {
1009                         PRINTF(cb, 
1010                                "wait for RDMA_READ_COMPLETE state %d\n",
1011                                cb->state);
1012                         break;
1013                 }
1014                 DEBUG_LOG(cb, "server received read complete\n");
1015
1016                 /* Display data in recv buf */
1017                 if (cb->verbose)
1018                         PRINTF(cb, "server ping data: %s\n", 
1019                                 cb->rdma_buf);
1020
1021                 /* Tell client to continue */
1022                 if (cb->server && cb->server_invalidate) {
1023                         cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1024                         cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1025                         DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1026                 } 
1027                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1028                 if (ret) {
1029                         PRINTF(cb, "post send error %d\n", ret);
1030                         break;
1031                 }
1032                 DEBUG_LOG(cb, "server posted go ahead\n");
1033
1034                 /* Wait for client's RDMA STAG/TO/Len */
1035                 wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1036                 if (cb->state != RDMA_WRITE_ADV) {
1037                         PRINTF(cb, 
1038                                "wait for RDMA_WRITE_ADV state %d\n",
1039                                cb->state);
1040                         break;
1041                 }
1042                 DEBUG_LOG(cb, "server received sink adv\n");
1043
1044                 /* RDMA Write echo data */
1045                 cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1046                 cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1047                 cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1048                 cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1049                 if (cb->local_dma_lkey)
1050                         cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1051                 else 
1052                         cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1053                         
1054                 DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1055                           cb->rdma_sq_wr.sg_list->lkey,
1056                           (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1057                           cb->rdma_sq_wr.sg_list->length);
1058
1059                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1060                 if (ret) {
1061                         PRINTF(cb, "post send error %d\n", ret);
1062                         break;
1063                 }
1064
1065                 /* Wait for completion */
1066                 ret = wait_event_interruptible(cb->sem, cb->state >= 
1067                                                          RDMA_WRITE_COMPLETE);
1068                 if (cb->state != RDMA_WRITE_COMPLETE) {
1069                         PRINTF(cb, 
1070                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1071                                cb->state);
1072                         break;
1073                 }
1074                 DEBUG_LOG(cb, "server rdma write complete \n");
1075
1076                 cb->state = CONNECTED;
1077
1078                 /* Tell client to begin again */
1079                 if (cb->server && cb->server_invalidate) {
1080                         cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1081                         cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1082                         DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1083                 } 
1084                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1085                 if (ret) {
1086                         PRINTF(cb, "post send error %d\n", ret);
1087                         break;
1088                 }
1089                 DEBUG_LOG(cb, "server posted go ahead\n");
1090         }
1091 }
1092
1093 static void rlat_test(struct krping_cb *cb)
1094 {
1095         int scnt;
1096         int iters = cb->count;
1097         struct timeval start_tv, stop_tv;
1098         int ret;
1099         struct ib_wc wc;
1100         struct ib_send_wr *bad_wr;
1101         int ne;
1102
1103         scnt = 0;
1104         cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1105         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1106         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1107         cb->rdma_sq_wr.sg_list->length = cb->size;
1108
1109         microtime(&start_tv);
1110         if (!cb->poll) {
1111                 cb->state = RDMA_READ_ADV;
1112                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1113         }
1114         while (scnt < iters) {
1115
1116                 cb->state = RDMA_READ_ADV;
1117                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1118                 if (ret) {
1119                         PRINTF(cb, 
1120                                 "Couldn't post send: ret=%d scnt %d\n",
1121                                 ret, scnt);
1122                         return;
1123                 }
1124
1125                 do {
1126                         if (!cb->poll) {
1127                                 wait_event_interruptible(cb->sem, 
1128                                         cb->state != RDMA_READ_ADV);
1129                                 if (cb->state == RDMA_READ_COMPLETE) {
1130                                         ne = 1;
1131                                         ib_req_notify_cq(cb->cq, 
1132                                                 IB_CQ_NEXT_COMP);
1133                                 } else {
1134                                         ne = -1;
1135                                 }
1136                         } else
1137                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1138                         if (cb->state == ERROR) {
1139                                 PRINTF(cb, 
1140                                         "state == ERROR...bailing scnt %d\n", 
1141                                         scnt);
1142                                 return;
1143                         }
1144                 } while (ne == 0);
1145
1146                 if (ne < 0) {
1147                         PRINTF(cb, "poll CQ failed %d\n", ne);
1148                         return;
1149                 }
1150                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
1151                         PRINTF(cb, "Completion wth error at %s:\n",
1152                                 cb->server ? "server" : "client");
1153                         PRINTF(cb, "Failed status %d: wr_id %d\n",
1154                                 wc.status, (int) wc.wr_id);
1155                         return;
1156                 }
1157                 ++scnt;
1158         }
1159         microtime(&stop_tv);
1160
1161         if (stop_tv.tv_usec < start_tv.tv_usec) {
1162                 stop_tv.tv_usec += 1000000;
1163                 stop_tv.tv_sec  -= 1;
1164         }
1165
1166         PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1167                 stop_tv.tv_sec - start_tv.tv_sec, 
1168                 stop_tv.tv_usec - start_tv.tv_usec,
1169                 scnt, cb->size);
1170 }
1171
1172 static void wlat_test(struct krping_cb *cb)
1173 {
1174         int ccnt, scnt, rcnt;
1175         int iters=cb->count;
1176         volatile char *poll_buf = (char *) cb->start_buf;
1177         char *buf = (char *)cb->rdma_buf;
1178         struct timeval start_tv, stop_tv;
1179         cycles_t *post_cycles_start, *post_cycles_stop;
1180         cycles_t *poll_cycles_start, *poll_cycles_stop;
1181         cycles_t *last_poll_cycles_start;
1182         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1183         int i;
1184         int cycle_iters = 1000;
1185
1186         ccnt = 0;
1187         scnt = 0;
1188         rcnt = 0;
1189
1190         post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1191         if (!post_cycles_start) {
1192                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1193                 return;
1194         }
1195         post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1196         if (!post_cycles_stop) {
1197                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1198                 return;
1199         }
1200         poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1201         if (!poll_cycles_start) {
1202                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1203                 return;
1204         }
1205         poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1206         if (!poll_cycles_stop) {
1207                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1208                 return;
1209         }
1210         last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
1211                 GFP_KERNEL);
1212         if (!last_poll_cycles_start) {
1213                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1214                 return;
1215         }
1216         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1217         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1218         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1219         cb->rdma_sq_wr.sg_list->length = cb->size;
1220
1221         if (cycle_iters > iters)
1222                 cycle_iters = iters;
1223         microtime(&start_tv);
1224         while (scnt < iters || ccnt < iters || rcnt < iters) {
1225
1226                 /* Wait till buffer changes. */
1227                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1228                         ++rcnt;
1229                         while (*poll_buf != (char)rcnt) {
1230                                 if (cb->state == ERROR) {
1231                                         PRINTF(cb, 
1232                                                 "state = ERROR, bailing\n");
1233                                         return;
1234                                 }
1235                         }
1236                 }
1237
1238                 if (scnt < iters) {
1239                         struct ib_send_wr *bad_wr;
1240
1241                         *buf = (char)scnt+1;
1242                         if (scnt < cycle_iters)
1243                                 post_cycles_start[scnt] = get_cycles();
1244                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1245                                 PRINTF(cb, 
1246                                         "Couldn't post send: scnt=%d\n",
1247                                         scnt);
1248                                 return;
1249                         }
1250                         if (scnt < cycle_iters)
1251                                 post_cycles_stop[scnt] = get_cycles();
1252                         scnt++;
1253                 }
1254
1255                 if (ccnt < iters) {
1256                         struct ib_wc wc;
1257                         int ne;
1258
1259                         if (ccnt < cycle_iters)
1260                                 poll_cycles_start[ccnt] = get_cycles();
1261                         do {
1262                                 if (ccnt < cycle_iters)
1263                                         last_poll_cycles_start[ccnt] = 
1264                                                 get_cycles();
1265                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1266                         } while (ne == 0);
1267                         if (ccnt < cycle_iters)
1268                                 poll_cycles_stop[ccnt] = get_cycles();
1269                         ++ccnt;
1270
1271                         if (ne < 0) {
1272                                 PRINTF(cb, "poll CQ failed %d\n", ne);
1273                                 return;
1274                         }
1275                         if (wc.status != IB_WC_SUCCESS) {
1276                                 PRINTF(cb, 
1277                                         "Completion wth error at %s:\n",
1278                                         cb->server ? "server" : "client");
1279                                 PRINTF(cb, 
1280                                         "Failed status %d: wr_id %d\n",
1281                                         wc.status, (int) wc.wr_id);
1282                                 PRINTF(cb, 
1283                                         "scnt=%d, rcnt=%d, ccnt=%d\n",
1284                                         scnt, rcnt, ccnt);
1285                                 return;
1286                         }
1287                 }
1288         }
1289         microtime(&stop_tv);
1290
1291         if (stop_tv.tv_usec < start_tv.tv_usec) {
1292                 stop_tv.tv_usec += 1000000;
1293                 stop_tv.tv_sec  -= 1;
1294         }
1295
1296         for (i=0; i < cycle_iters; i++) {
1297                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1298                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1299                 sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1300         }
1301         PRINTF(cb, 
1302                 "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1303                 " sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1304                 stop_tv.tv_sec - start_tv.tv_sec, 
1305                 stop_tv.tv_usec - start_tv.tv_usec,
1306                 scnt, cb->size, cycle_iters, 
1307                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1308                 (unsigned long long)sum_last_poll);
1309         kfree(post_cycles_start);
1310         kfree(post_cycles_stop);
1311         kfree(poll_cycles_start);
1312         kfree(poll_cycles_stop);
1313         kfree(last_poll_cycles_start);
1314 }
1315
1316 static void bw_test(struct krping_cb *cb)
1317 {
1318         int ccnt, scnt, rcnt;
1319         int iters=cb->count;
1320         struct timeval start_tv, stop_tv;
1321         cycles_t *post_cycles_start, *post_cycles_stop;
1322         cycles_t *poll_cycles_start, *poll_cycles_stop;
1323         cycles_t *last_poll_cycles_start;
1324         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1325         int i;
1326         int cycle_iters = 1000;
1327
1328         ccnt = 0;
1329         scnt = 0;
1330         rcnt = 0;
1331
1332         post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1333         if (!post_cycles_start) {
1334                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1335                 return;
1336         }
1337         post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1338         if (!post_cycles_stop) {
1339                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1340                 return;
1341         }
1342         poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1343         if (!poll_cycles_start) {
1344                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1345                 return;
1346         }
1347         poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1348         if (!poll_cycles_stop) {
1349                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1350                 return;
1351         }
1352         last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
1353                 GFP_KERNEL);
1354         if (!last_poll_cycles_start) {
1355                 PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1356                 return;
1357         }
1358         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1359         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1360         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1361         cb->rdma_sq_wr.sg_list->length = cb->size;
1362
1363         if (cycle_iters > iters)
1364                 cycle_iters = iters;
1365         microtime(&start_tv);
1366         while (scnt < iters || ccnt < iters) {
1367
1368                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1369                         struct ib_send_wr *bad_wr;
1370
1371                         if (scnt < cycle_iters)
1372                                 post_cycles_start[scnt] = get_cycles();
1373                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1374                                 PRINTF(cb, 
1375                                         "Couldn't post send: scnt=%d\n",
1376                                         scnt);
1377                                 return;
1378                         }
1379                         if (scnt < cycle_iters)
1380                                 post_cycles_stop[scnt] = get_cycles();
1381                         ++scnt;
1382                 }
1383
1384                 if (ccnt < iters) {
1385                         int ne;
1386                         struct ib_wc wc;
1387
1388                         if (ccnt < cycle_iters)
1389                                 poll_cycles_start[ccnt] = get_cycles();
1390                         do {
1391                                 if (ccnt < cycle_iters)
1392                                         last_poll_cycles_start[ccnt] = 
1393                                                 get_cycles();
1394                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1395                         } while (ne == 0);
1396                         if (ccnt < cycle_iters)
1397                                 poll_cycles_stop[ccnt] = get_cycles();
1398                         ccnt += 1;
1399
1400                         if (ne < 0) {
1401                                 PRINTF(cb, "poll CQ failed %d\n", ne);
1402                                 return;
1403                         }
1404                         if (wc.status != IB_WC_SUCCESS) {
1405                                 PRINTF(cb, 
1406                                         "Completion wth error at %s:\n",
1407                                         cb->server ? "server" : "client");
1408                                 PRINTF(cb, 
1409                                         "Failed status %d: wr_id %d\n",
1410                                         wc.status, (int) wc.wr_id);
1411                                 return;
1412                         }
1413                 }
1414         }
1415         microtime(&stop_tv);
1416
1417         if (stop_tv.tv_usec < start_tv.tv_usec) {
1418                 stop_tv.tv_usec += 1000000;
1419                 stop_tv.tv_sec  -= 1;
1420         }
1421
1422         for (i=0; i < cycle_iters; i++) {
1423                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1424                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1425                 sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1426         }
1427         PRINTF(cb, 
1428                 "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1429                 " sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1430                 stop_tv.tv_sec - start_tv.tv_sec, 
1431                 stop_tv.tv_usec - start_tv.tv_usec,
1432                 scnt, cb->size, cycle_iters, 
1433                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1434                 (unsigned long long)sum_last_poll);
1435         kfree(post_cycles_start);
1436         kfree(post_cycles_stop);
1437         kfree(poll_cycles_start);
1438         kfree(poll_cycles_stop);
1439         kfree(last_poll_cycles_start);
1440 }
1441
1442 static void krping_rlat_test_server(struct krping_cb *cb)
1443 {
1444         struct ib_send_wr *bad_wr;
1445         struct ib_wc wc;
1446         int ret;
1447
1448         /* Spin waiting for client's Start STAG/TO/Len */
1449         while (cb->state < RDMA_READ_ADV) {
1450                 krping_cq_event_handler(cb->cq, cb);
1451         }
1452
1453         /* Send STAG/TO/Len to client */
1454         krping_format_send(cb, cb->start_dma_addr);
1455         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1456         if (ret) {
1457                 PRINTF(cb, "post send error %d\n", ret);
1458                 return;
1459         }
1460
1461         /* Spin waiting for send completion */
1462         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1463         if (ret < 0) {
1464                 PRINTF(cb, "poll error %d\n", ret);
1465                 return;
1466         }
1467         if (wc.status) {
1468                 PRINTF(cb, "send completiong error %d\n", wc.status);
1469                 return;
1470         }
1471
1472         wait_event_interruptible(cb->sem, cb->state == ERROR);
1473 }
1474
1475 static void krping_wlat_test_server(struct krping_cb *cb)
1476 {
1477         struct ib_send_wr *bad_wr;
1478         struct ib_wc wc;
1479         int ret;
1480
1481         /* Spin waiting for client's Start STAG/TO/Len */
1482         while (cb->state < RDMA_READ_ADV) {
1483                 krping_cq_event_handler(cb->cq, cb);
1484         }
1485
1486         /* Send STAG/TO/Len to client */
1487         krping_format_send(cb, cb->start_dma_addr);
1488         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1489         if (ret) {
1490                 PRINTF(cb, "post send error %d\n", ret);
1491                 return;
1492         }
1493
1494         /* Spin waiting for send completion */
1495         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1496         if (ret < 0) {
1497                 PRINTF(cb, "poll error %d\n", ret);
1498                 return;
1499         }
1500         if (wc.status) {
1501                 PRINTF(cb, "send completiong error %d\n", wc.status);
1502                 return;
1503         }
1504
1505         wlat_test(cb);
1506         wait_event_interruptible(cb->sem, cb->state == ERROR);
1507 }
1508
1509 static void krping_bw_test_server(struct krping_cb *cb)
1510 {
1511         struct ib_send_wr *bad_wr;
1512         struct ib_wc wc;
1513         int ret;
1514
1515         /* Spin waiting for client's Start STAG/TO/Len */
1516         while (cb->state < RDMA_READ_ADV) {
1517                 krping_cq_event_handler(cb->cq, cb);
1518         }
1519
1520         /* Send STAG/TO/Len to client */
1521         krping_format_send(cb, cb->start_dma_addr);
1522         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1523         if (ret) {
1524                 PRINTF(cb, "post send error %d\n", ret);
1525                 return;
1526         }
1527
1528         /* Spin waiting for send completion */
1529         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1530         if (ret < 0) {
1531                 PRINTF(cb, "poll error %d\n", ret);
1532                 return;
1533         }
1534         if (wc.status) {
1535                 PRINTF(cb, "send completiong error %d\n", wc.status);
1536                 return;
1537         }
1538
1539         if (cb->duplex)
1540                 bw_test(cb);
1541         wait_event_interruptible(cb->sem, cb->state == ERROR);
1542 }
1543
1544 static int fastreg_supported(struct krping_cb *cb)
1545 {
1546         struct ib_device *dev = cb->child_cm_id->device;
1547         struct ib_device_attr attr;
1548         int ret;
1549
1550         ret = ib_query_device(dev, &attr);
1551         if (ret) {
1552                 PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1553                 return 0;
1554         }
1555         if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1556                 PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1557                     attr.device_cap_flags);
1558                 return 0;
1559         }
1560         DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1561                 attr.device_cap_flags);
1562         return 1;
1563 }
1564
1565 static int krping_bind_server(struct krping_cb *cb)
1566 {
1567         struct sockaddr_in sin;
1568         int ret;
1569
1570         memset(&sin, 0, sizeof(sin));
1571         sin.sin_len = sizeof sin;
1572         sin.sin_family = AF_INET;
1573         sin.sin_addr.s_addr = cb->addr.s_addr;
1574         sin.sin_port = cb->port;
1575
1576         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1577         if (ret) {
1578                 PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1579                 return ret;
1580         }
1581         DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1582
1583         DEBUG_LOG(cb, "rdma_listen\n");
1584         ret = rdma_listen(cb->cm_id, 3);
1585         if (ret) {
1586                 PRINTF(cb, "rdma_listen failed: %d\n", ret);
1587                 return ret;
1588         }
1589
1590         wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1591         if (cb->state != CONNECT_REQUEST) {
1592                 PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1593                         cb->state);
1594                 return -1;
1595         }
1596
1597         if (cb->mem == FASTREG && !fastreg_supported(cb))
1598                 return -EINVAL;
1599
1600         return 0;
1601 }
1602
1603 static void krping_run_server(struct krping_cb *cb)
1604 {
1605         struct ib_recv_wr *bad_wr;
1606         int ret;
1607
1608         ret = krping_bind_server(cb);
1609         if (ret)
1610                 return;
1611
1612         ret = krping_setup_qp(cb, cb->child_cm_id);
1613         if (ret) {
1614                 PRINTF(cb, "setup_qp failed: %d\n", ret);
1615                 goto err0;
1616         }
1617
1618         ret = krping_setup_buffers(cb);
1619         if (ret) {
1620                 PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1621                 goto err1;
1622         }
1623
1624         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1625         if (ret) {
1626                 PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1627                 goto err2;
1628         }
1629
1630         ret = krping_accept(cb);
1631         if (ret) {
1632                 PRINTF(cb, "connect error %d\n", ret);
1633                 goto err2;
1634         }
1635
1636         if (cb->wlat)
1637                 krping_wlat_test_server(cb);
1638         else if (cb->rlat)
1639                 krping_rlat_test_server(cb);
1640         else if (cb->bw)
1641                 krping_bw_test_server(cb);
1642         else
1643                 krping_test_server(cb);
1644         rdma_disconnect(cb->child_cm_id);
1645 err2:
1646         krping_free_buffers(cb);
1647 err1:
1648         krping_free_qp(cb);
1649 err0:
1650         rdma_destroy_id(cb->child_cm_id);
1651 }
1652
1653 static void krping_test_client(struct krping_cb *cb)
1654 {
1655         int ping, start, cc, i, ret;
1656         struct ib_send_wr *bad_wr;
1657         unsigned char c;
1658
1659         start = 65;
1660         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1661                 cb->state = RDMA_READ_ADV;
1662
1663                 /* Put some ascii text in the buffer. */
1664                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1665                 for (i = cc, c = start; i < cb->size; i++) {
1666                         cb->start_buf[i] = c;
1667                         c++;
1668                         if (c > 122)
1669                                 c = 65;
1670                 }
1671                 start++;
1672                 if (start > 122)
1673                         start = 65;
1674                 cb->start_buf[cb->size - 1] = 0;
1675
1676                 krping_format_send(cb, cb->start_dma_addr);
1677                 if (cb->state == ERROR) {
1678                         PRINTF(cb, "krping_format_send failed\n");
1679                         break;
1680                 }
1681                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1682                 if (ret) {
1683                         PRINTF(cb, "post send error %d\n", ret);
1684                         break;
1685                 }
1686
1687                 /* Wait for server to ACK */
1688                 wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1689                 if (cb->state != RDMA_WRITE_ADV) {
1690                         PRINTF(cb, 
1691                                "wait for RDMA_WRITE_ADV state %d\n",
1692                                cb->state);
1693                         break;
1694                 }
1695
1696                 krping_format_send(cb, cb->rdma_dma_addr);
1697                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1698                 if (ret) {
1699                         PRINTF(cb, "post send error %d\n", ret);
1700                         break;
1701                 }
1702
1703                 /* Wait for the server to say the RDMA Write is complete. */
1704                 wait_event_interruptible(cb->sem, 
1705                                          cb->state >= RDMA_WRITE_COMPLETE);
1706                 if (cb->state != RDMA_WRITE_COMPLETE) {
1707                         PRINTF(cb, 
1708                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1709                                cb->state);
1710                         break;
1711                 }
1712
1713                 if (cb->validate)
1714                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1715                                 PRINTF(cb, "data mismatch!\n");
1716                                 break;
1717                         }
1718
1719                 if (cb->verbose)
1720                         PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1721 #ifdef SLOW_KRPING
1722                 wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1723 #endif
1724         }
1725 }
1726
1727 static void krping_rlat_test_client(struct krping_cb *cb)
1728 {
1729         struct ib_send_wr *bad_wr;
1730         struct ib_wc wc;
1731         int ret;
1732
1733         cb->state = RDMA_READ_ADV;
1734
1735         /* Send STAG/TO/Len to client */
1736         krping_format_send(cb, cb->start_dma_addr);
1737         if (cb->state == ERROR) {
1738                 PRINTF(cb, "krping_format_send failed\n");
1739                 return;
1740         }
1741         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1742         if (ret) {
1743                 PRINTF(cb, "post send error %d\n", ret);
1744                 return;
1745         }
1746
1747         /* Spin waiting for send completion */
1748         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1749         if (ret < 0) {
1750                 PRINTF(cb, "poll error %d\n", ret);
1751                 return;
1752         }
1753         if (wc.status) {
1754                 PRINTF(cb, "send completion error %d\n", wc.status);
1755                 return;
1756         }
1757
1758         /* Spin waiting for server's Start STAG/TO/Len */
1759         while (cb->state < RDMA_WRITE_ADV) {
1760                 krping_cq_event_handler(cb->cq, cb);
1761         }
1762
1763 #if 0
1764 {
1765         int i;
1766         struct timeval start, stop;
1767         time_t sec;
1768         suseconds_t usec;
1769         unsigned long long elapsed;
1770         struct ib_wc wc;
1771         struct ib_send_wr *bad_wr;
1772         int ne;
1773         
1774         cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1775         cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1776         cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1777         cb->rdma_sq_wr.sg_list->length = 0;
1778         cb->rdma_sq_wr.num_sge = 0;
1779
1780         microtime(&start);
1781         for (i=0; i < 100000; i++) {
1782                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1783                         PRINTF(cb, "Couldn't post send\n");
1784                         return;
1785                 }
1786                 do {
1787                         ne = ib_poll_cq(cb->cq, 1, &wc);
1788                 } while (ne == 0);
1789                 if (ne < 0) {
1790                         PRINTF(cb, "poll CQ failed %d\n", ne);
1791                         return;
1792                 }
1793                 if (wc.status != IB_WC_SUCCESS) {
1794                         PRINTF(cb, "Completion wth error at %s:\n",
1795                                 cb->server ? "server" : "client");
1796                         PRINTF(cb, "Failed status %d: wr_id %d\n",
1797                                 wc.status, (int) wc.wr_id);
1798                         return;
1799                 }
1800         }
1801         microtime(&stop);
1802         
1803         if (stop.tv_usec < start.tv_usec) {
1804                 stop.tv_usec += 1000000;
1805                 stop.tv_sec  -= 1;
1806         }
1807         sec     = stop.tv_sec - start.tv_sec;
1808         usec    = stop.tv_usec - start.tv_usec;
1809         elapsed = sec * 1000000 + usec;
1810         PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1811 }
1812 #endif
1813
1814         rlat_test(cb);
1815 }
1816
1817 static void krping_wlat_test_client(struct krping_cb *cb)
1818 {
1819         struct ib_send_wr *bad_wr;
1820         struct ib_wc wc;
1821         int ret;
1822
1823         cb->state = RDMA_READ_ADV;
1824
1825         /* Send STAG/TO/Len to client */
1826         krping_format_send(cb, cb->start_dma_addr);
1827         if (cb->state == ERROR) {
1828                 PRINTF(cb, "krping_format_send failed\n");
1829                 return;
1830         }
1831         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1832         if (ret) {
1833                 PRINTF(cb, "post send error %d\n", ret);
1834                 return;
1835         }
1836
1837         /* Spin waiting for send completion */
1838         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1839         if (ret < 0) {
1840                 PRINTF(cb, "poll error %d\n", ret);
1841                 return;
1842         }
1843         if (wc.status) {
1844                 PRINTF(cb, "send completion error %d\n", wc.status);
1845                 return;
1846         }
1847
1848         /* Spin waiting for server's Start STAG/TO/Len */
1849         while (cb->state < RDMA_WRITE_ADV) {
1850                 krping_cq_event_handler(cb->cq, cb);
1851         }
1852
1853         wlat_test(cb);
1854 }
1855
1856 static void krping_bw_test_client(struct krping_cb *cb)
1857 {
1858         struct ib_send_wr *bad_wr;
1859         struct ib_wc wc;
1860         int ret;
1861
1862         cb->state = RDMA_READ_ADV;
1863
1864         /* Send STAG/TO/Len to client */
1865         krping_format_send(cb, cb->start_dma_addr);
1866         if (cb->state == ERROR) {
1867                 PRINTF(cb, "krping_format_send failed\n");
1868                 return;
1869         }
1870         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1871         if (ret) {
1872                 PRINTF(cb, "post send error %d\n", ret);
1873                 return;
1874         }
1875
1876         /* Spin waiting for send completion */
1877         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1878         if (ret < 0) {
1879                 PRINTF(cb, "poll error %d\n", ret);
1880                 return;
1881         }
1882         if (wc.status) {
1883                 PRINTF(cb, "send completion error %d\n", wc.status);
1884                 return;
1885         }
1886
1887         /* Spin waiting for server's Start STAG/TO/Len */
1888         while (cb->state < RDMA_WRITE_ADV) {
1889                 krping_cq_event_handler(cb->cq, cb);
1890         }
1891
1892         bw_test(cb);
1893 }
1894
1895 static void krping_fr_test(struct krping_cb *cb)
1896 {
1897         struct ib_fast_reg_page_list *pl;
1898         struct ib_send_wr fr, inv, *bad;
1899         struct ib_wc wc;
1900         u8 key = 0;
1901         struct ib_mr *mr;
1902         int i;
1903         int ret;
1904         int size = cb->size;
1905         int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1906         time_t start;
1907         int count = 0;
1908         int scnt = 0;
1909
1910         pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1911         if (IS_ERR(pl)) {
1912                 PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1913                 return;
1914         }
1915         
1916         mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1917         if (IS_ERR(mr)) {
1918                 PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
1919                 goto err1;
1920         }
1921
1922         for (i=0; i<plen; i++)
1923                 pl->page_list[i] = 0xcafebabe | i;
1924         
1925         memset(&fr, 0, sizeof fr);
1926         fr.opcode = IB_WR_FAST_REG_MR;
1927         fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1928         fr.wr.fast_reg.length = size;
1929         fr.wr.fast_reg.page_list = pl;
1930         fr.wr.fast_reg.page_list_len = plen;
1931         fr.wr.fast_reg.iova_start = 0;
1932         fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1933         fr.next = &inv;
1934         memset(&inv, 0, sizeof inv);
1935         inv.opcode = IB_WR_LOCAL_INV;
1936         inv.send_flags = IB_SEND_SIGNALED;
1937         
1938         DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1939         start = time_uptime;
1940         while (1) {
1941                 if ((time_uptime - start) >= 9) {
1942                         DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1943                         wait_event_interruptible(cb->sem, cb->state == ERROR);
1944                         if (cb->state == ERROR)
1945                                 break;
1946                         start = time_uptime;
1947                 }       
1948                 while (scnt < (cb->txdepth>>1)) {
1949                         ib_update_fast_reg_key(mr, ++key);
1950                         fr.wr.fast_reg.rkey = mr->rkey;
1951                         inv.ex.invalidate_rkey = mr->rkey;
1952                         size = arc4random() % cb->size;
1953                         if (size == 0)
1954                                 size = cb->size;
1955                         plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1956                         fr.wr.fast_reg.length = size;
1957                         fr.wr.fast_reg.page_list_len = plen;
1958                         ret = ib_post_send(cb->qp, &fr, &bad);
1959                         if (ret) {
1960                                 PRINTF(cb, "ib_post_send failed %d\n", ret);
1961                                 goto err2;      
1962                         }
1963                         scnt++;
1964                 }
1965
1966                 do {
1967                         ret = ib_poll_cq(cb->cq, 1, &wc);
1968                         if (ret < 0) {
1969                                 PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1970                                 goto err2;      
1971                         }
1972                         if (ret == 1) {
1973                                 if (wc.status) {
1974                                         PRINTF(cb, "completion error %u\n", wc.status);
1975                                         goto err2;
1976                                 }
1977                                 count++;
1978                                 scnt--;
1979                         }
1980                         else if (krping_sigpending()) {
1981                                 PRINTF(cb, "signal!\n");
1982                                 goto err2;
1983                         }
1984                 } while (ret == 1);
1985         }
1986 err2:
1987 #if 0
1988         DEBUG_LOG(cb, "sleeping 1 second\n");
1989         wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1990 #endif
1991         DEBUG_LOG(cb, "draining the cq...\n");
1992         do {
1993                 ret = ib_poll_cq(cb->cq, 1, &wc);
1994                 if (ret < 0) {
1995                         PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1996                         break;
1997                 }
1998                 if (ret == 1) {
1999                         if (wc.status) {
2000                                 PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
2001                         }
2002                 }
2003         } while (ret == 1);
2004         DEBUG_LOG(cb, "fr_test: done!\n");
2005         ib_dereg_mr(mr);
2006 err1:
2007         ib_free_fast_reg_page_list(pl);
2008 }
2009
2010 static int krping_connect_client(struct krping_cb *cb)
2011 {
2012         struct rdma_conn_param conn_param;
2013         int ret;
2014
2015         memset(&conn_param, 0, sizeof conn_param);
2016         conn_param.responder_resources = 1;
2017         conn_param.initiator_depth = 1;
2018         conn_param.retry_count = 10;
2019
2020         ret = rdma_connect(cb->cm_id, &conn_param);
2021         if (ret) {
2022                 PRINTF(cb, "rdma_connect error %d\n", ret);
2023                 return ret;
2024         }
2025
2026         wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2027         if (cb->state == ERROR) {
2028                 PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
2029                 return -1;
2030         }
2031
2032         DEBUG_LOG(cb, "rdma_connect successful\n");
2033         return 0;
2034 }
2035
2036 static int krping_bind_client(struct krping_cb *cb)
2037 {
2038         struct sockaddr_in sin;
2039         int ret;
2040
2041         memset(&sin, 0, sizeof(sin));
2042         sin.sin_len = sizeof sin;
2043         sin.sin_family = AF_INET;
2044         sin.sin_addr.s_addr = cb->addr.s_addr;
2045         sin.sin_port = cb->port;
2046
2047         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2048                                 2000);
2049         if (ret) {
2050                 PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2051                 return ret;
2052         }
2053
2054         wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2055         if (cb->state != ROUTE_RESOLVED) {
2056                 PRINTF(cb, 
2057                        "addr/route resolution did not resolve: state %d\n",
2058                        cb->state);
2059                 return -EINTR;
2060         }
2061
2062         if (cb->mem == FASTREG && !fastreg_supported(cb))
2063                 return -EINVAL;
2064
2065         DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2066         return 0;
2067 }
2068
2069 static void krping_run_client(struct krping_cb *cb)
2070 {
2071         struct ib_recv_wr *bad_wr;
2072         int ret;
2073
2074         ret = krping_bind_client(cb);
2075         if (ret)
2076                 return;
2077
2078         ret = krping_setup_qp(cb, cb->cm_id);
2079         if (ret) {
2080                 PRINTF(cb, "setup_qp failed: %d\n", ret);
2081                 return;
2082         }
2083
2084         ret = krping_setup_buffers(cb);
2085         if (ret) {
2086                 PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2087                 goto err1;
2088         }
2089
2090         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2091         if (ret) {
2092                 PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2093                 goto err2;
2094         }
2095
2096         ret = krping_connect_client(cb);
2097         if (ret) {
2098                 PRINTF(cb, "connect error %d\n", ret);
2099                 goto err2;
2100         }
2101
2102         if (cb->wlat)
2103                 krping_wlat_test_client(cb);
2104         else if (cb->rlat)
2105                 krping_rlat_test_client(cb);
2106         else if (cb->bw)
2107                 krping_bw_test_client(cb);
2108         else if (cb->frtest)
2109                 krping_fr_test(cb);
2110         else
2111                 krping_test_client(cb);
2112         rdma_disconnect(cb->cm_id);
2113 err2:
2114         krping_free_buffers(cb);
2115 err1:
2116         krping_free_qp(cb);
2117 }
2118
2119 int krping_doit(char *cmd, void *cookie)
2120 {
2121         struct krping_cb *cb;
2122         int op;
2123         int ret = 0;
2124         char *optarg;
2125         unsigned long optint;
2126
2127         cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2128         if (!cb)
2129                 return -ENOMEM;
2130
2131         mutex_lock(&krping_mutex);
2132         list_add_tail(&cb->list, &krping_cbs);
2133         mutex_unlock(&krping_mutex);
2134
2135         cb->cookie = cookie;
2136         cb->server = -1;
2137         cb->state = IDLE;
2138         cb->size = 64;
2139         cb->txdepth = RPING_SQ_DEPTH;
2140         cb->mem = DMA;
2141         init_waitqueue_head(&cb->sem);
2142
2143         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2144                               &optint)) != 0) {
2145                 switch (op) {
2146                 case 'a':
2147                         cb->addr_str = optarg;
2148                         DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
2149                         if (!inet_aton(optarg, &cb->addr)) {
2150                                 PRINTF(cb, "bad addr string %s\n",
2151                                     optarg);
2152                                 ret = EINVAL;
2153                         }
2154                         break;
2155                 case 'p':
2156                         cb->port = htons(optint);
2157                         DEBUG_LOG(cb, "port %d\n", (int)optint);
2158                         break;
2159                 case 'P':
2160                         cb->poll = 1;
2161                         DEBUG_LOG(cb, "server\n");
2162                         break;
2163                 case 's':
2164                         cb->server = 1;
2165                         DEBUG_LOG(cb, "server\n");
2166                         break;
2167                 case 'c':
2168                         cb->server = 0;
2169                         DEBUG_LOG(cb, "client\n");
2170                         break;
2171                 case 'S':
2172                         cb->size = optint;
2173                         if ((cb->size < 1) ||
2174                             (cb->size > RPING_BUFSIZE)) {
2175                                 PRINTF(cb, "Invalid size %d "
2176                                        "(valid range is 1 to %d)\n",
2177                                        cb->size, RPING_BUFSIZE);
2178                                 ret = EINVAL;
2179                         } else
2180                                 DEBUG_LOG(cb, "size %d\n", (int)optint);
2181                         break;
2182                 case 'C':
2183                         cb->count = optint;
2184                         if (cb->count < 0) {
2185                                 PRINTF(cb, "Invalid count %d\n",
2186                                         cb->count);
2187                                 ret = EINVAL;
2188                         } else
2189                                 DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2190                         break;
2191                 case 'v':
2192                         cb->verbose++;
2193                         DEBUG_LOG(cb, "verbose\n");
2194                         break;
2195                 case 'V':
2196                         cb->validate++;
2197                         DEBUG_LOG(cb, "validate data\n");
2198                         break;
2199                 case 'l':
2200                         cb->wlat++;
2201                         break;
2202                 case 'L':
2203                         cb->rlat++;
2204                         break;
2205                 case 'B':
2206                         cb->bw++;
2207                         break;
2208                 case 'd':
2209                         cb->duplex++;
2210                         break;
2211                 case 'm':
2212                         if (!strncmp(optarg, "dma", 3))
2213                                 cb->mem = DMA;
2214                         else if (!strncmp(optarg, "fastreg", 7))
2215                                 cb->mem = FASTREG;
2216                         else if (!strncmp(optarg, "mw", 2))
2217                                 cb->mem = MW;
2218                         else if (!strncmp(optarg, "mr", 2))
2219                                 cb->mem = MR;
2220                         else {
2221                                 PRINTF(cb, "unknown mem mode %s.  "
2222                                         "Must be dma, fastreg, mw, or mr\n",
2223                                         optarg);
2224                                 ret = -EINVAL;
2225                                 break;
2226                         }
2227                         break;
2228                 case 'I':
2229                         cb->server_invalidate = 1;
2230                         break;
2231                 case 'T':
2232                         cb->txdepth = optint;
2233                         DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2234                         break;
2235                 case 'Z':
2236                         cb->local_dma_lkey = 1;
2237                         DEBUG_LOG(cb, "using local dma lkey\n");
2238                         break;
2239                 case 'R':
2240                         cb->read_inv = 1;
2241                         DEBUG_LOG(cb, "using read-with-inv\n");
2242                         break;
2243                 case 'f':
2244                         cb->frtest = 1;
2245                         DEBUG_LOG(cb, "fast-reg test!\n");
2246                         break;
2247                 default:
2248                         PRINTF(cb, "unknown opt %s\n", optarg);
2249                         ret = -EINVAL;
2250                         break;
2251                 }
2252         }
2253         if (ret)
2254                 goto out;
2255
2256         if (cb->server == -1) {
2257                 PRINTF(cb, "must be either client or server\n");
2258                 ret = -EINVAL;
2259                 goto out;
2260         }
2261
2262         if (cb->server && cb->frtest) {
2263                 PRINTF(cb, "must be client to run frtest\n");
2264                 ret = -EINVAL;
2265                 goto out;
2266         }
2267
2268         if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2269                 PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2270                 ret = -EINVAL;
2271                 goto out;
2272         }
2273
2274         if (cb->server_invalidate && cb->mem != FASTREG) {
2275                 PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2276                 ret = -EINVAL;
2277                 goto out;
2278         }
2279
2280         if (cb->read_inv && cb->mem != FASTREG) {
2281                 PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2282                 ret = -EINVAL;
2283                 goto out;
2284         }
2285
2286         if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2287                 PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2288                 ret = -EINVAL;
2289                 goto out;
2290         }
2291
2292         cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
2293         if (IS_ERR(cb->cm_id)) {
2294                 ret = PTR_ERR(cb->cm_id);
2295                 PRINTF(cb, "rdma_create_id error %d\n", ret);
2296                 goto out;
2297         }
2298         DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2299
2300         if (cb->server)
2301                 krping_run_server(cb);
2302         else
2303                 krping_run_client(cb);
2304
2305         DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2306         rdma_destroy_id(cb->cm_id);
2307 out:
2308         mutex_lock(&krping_mutex);
2309         list_del(&cb->list);
2310         mutex_unlock(&krping_mutex);
2311         kfree(cb);
2312         return ret;
2313 }
2314
2315 void
2316 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2317 {
2318         struct krping_cb *cb;
2319
2320         mutex_lock(&krping_mutex);
2321         list_for_each_entry(cb, &krping_cbs, list)
2322             (*f)(cb->pd ? &cb->stats : NULL, arg);
2323         mutex_unlock(&krping_mutex);
2324 }
2325
2326 void krping_init(void)
2327 {
2328
2329         mutex_init(&krping_mutex);
2330 }