]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/contrib/rdma/krping/krping.c
Upgrade Unbound to 1.6.7. More to follow.
[FreeBSD/FreeBSD.git] / sys / contrib / rdma / krping / krping.c
1 /*
2  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <linux/module.h>
38 #include <linux/moduleparam.h>
39 #include <linux/slab.h>
40 #include <linux/err.h>
41 #include <linux/string.h>
42 #include <linux/list.h>
43 #include <linux/in.h>
44 #include <linux/device.h>
45 #include <linux/pci.h>
46 #include <linux/sched.h>
47 #include <linux/wait.h>
48
49 #include <asm/atomic.h>
50
51 #include <rdma/ib_verbs.h>
52 #include <rdma/rdma_cm.h>
53
54 #include "krping.h"
55 #include "getopt.h"
56
/* Prefix placed in front of the messages this module prints. */
#define PFX "krping: "

/* Debug switch; only declared here, the definition lives elsewhere. */
extern int krping_debug;

/*
 * Pass the arguments on to log(LOG_INFO, ...) only while krping_debug
 * is non-zero.  Wrapped in do/while(0) so it behaves as one statement.
 */
#define DEBUG_LOG(...)                                          \
        do {                                                    \
                if (krping_debug)                               \
                        log(LOG_INFO, __VA_ARGS__);             \
        } while (0)

#define BIND_INFO 1
62
63 MODULE_AUTHOR("Steve Wise");
64 MODULE_DESCRIPTION("RDMA ping server");
65 MODULE_LICENSE("Dual BSD/GPL");
66 MODULE_VERSION(krping, 1);
67 MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
68
/*
 * Read the CPU timestamp counter with the x86 "rdtsc" instruction.
 * The instruction delivers the low 32 bits in EAX and the high 32 bits
 * in EDX; both halves are merged into one 64-bit value.
 */
static __inline uint64_t
get_cycles(void)
{
        uint32_t lo_word, hi_word;

        __asm __volatile("rdtsc" : "=a" (lo_word), "=d" (hi_word));
        return (((uint64_t)hi_word << 32) | lo_word);
}

/* 64-bit cycle-count type. */
typedef uint64_t cycles_t;
78
/*
 * Memory-mode identifiers.
 * NOTE(review): no user of this enum is visible in this part of the
 * file; the values are kept exactly as they were.
 */
enum mem_type {
        DMA = 1,
        REG = 2,
};
83
84 static const struct krping_option krping_opts[] = {
85         {"count", OPT_INT, 'C'},
86         {"size", OPT_INT, 'S'},
87         {"addr", OPT_STRING, 'a'},
88         {"addr6", OPT_STRING, 'A'},
89         {"port", OPT_INT, 'p'},
90         {"verbose", OPT_NOPARAM, 'v'},
91         {"validate", OPT_NOPARAM, 'V'},
92         {"server", OPT_NOPARAM, 's'},
93         {"client", OPT_NOPARAM, 'c'},
94         {"server_inv", OPT_NOPARAM, 'I'},
95         {"wlat", OPT_NOPARAM, 'l'},
96         {"rlat", OPT_NOPARAM, 'L'},
97         {"bw", OPT_NOPARAM, 'B'},
98         {"duplex", OPT_NOPARAM, 'd'},
99         {"txdepth", OPT_INT, 'T'},
100         {"poll", OPT_NOPARAM, 'P'},
101         {"local_dma_lkey", OPT_NOPARAM, 'Z'},
102         {"read_inv", OPT_NOPARAM, 'R'},
103         {"fr", OPT_NOPARAM, 'f'},
104         {NULL, 0, 0}
105 };
106
/*
 * 64-bit counterparts of htonl()/ntohl().
 *
 * htonll: host byte order -> network (big-endian) byte order.
 * ntohll: network (big-endian) byte order -> host byte order.
 *
 * ntohll used to expand to cpu_to_be64() as well; the network-to-host
 * direction is expressed with be64_to_cpu() so each macro names the
 * conversion it actually performs.
 */
#define htonll(x) cpu_to_be64((x))
#define ntohll(x) be64_to_cpu((x))
109
/*
 * Module-wide mutex.
 * NOTE(review): presumably serializes access to krping_cbs below --
 * confirm against the code that takes it.
 */
static DEFINE_MUTEX(krping_mutex);

/*
 * List of running krping threads.
 * NOTE(review): entries are presumably linked through the "list"
 * member of struct krping_cb -- confirm against the insert/remove code.
 */
static LIST_HEAD(krping_cbs);
116
117 /*
118  * Invoke like this, one on each side, using the server's address on
119  * the RDMA device (iw%d):
120  *
121  * /bin/echo server,port=9999,addr=192.168.69.142,validate > /proc/krping  
122  * /bin/echo client,port=9999,addr=192.168.69.142,validate > /proc/krping  
123  * /bin/echo client,port=9999,addr6=2001:db8:0:f101::1,validate > /proc/krping
124  *
125  * krping "ping/pong" loop:
126  *      client sends source rkey/addr/len
127  *      server receives source rkey/addr/len
128  *      server rdma reads "ping" data from source
129  *      server sends "go ahead" on rdma read completion
130  *      client sends sink rkey/addr/len
131  *      server receives sink rkey/addr/len
132  *      server rdma writes "pong" data to sink
133  *      server sends "go ahead" on rdma write completion
134  *      <repeat loop>
135  */
136
/*
 * States used to signal events between the connection-manager and
 * completion handlers and the main client or server thread.
 *
 * The numeric order is significant: the waiting code in this file
 * tests "state >= X" and then checks for the exact value it expected.
 * ERROR is the largest value, so it satisfies every such wait.
 *
 * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
 * and RDMA_WRITE_COMPLETE for each ping.
 */
enum test_state {
        IDLE                = 1,
        CONNECT_REQUEST     = 2,
        ADDR_RESOLVED       = 3,
        ROUTE_RESOLVED      = 4,
        CONNECTED           = 5,
        RDMA_READ_ADV       = 6,
        RDMA_READ_COMPLETE  = 7,
        RDMA_WRITE_ADV      = 8,
        RDMA_WRITE_COMPLETE = 9,
        ERROR               = 10
};
156
/*
 * Buffer advertisement carried in the send/recv messages.
 * krping_format_send() fills the fields with htonll()/htonl() and
 * server_recv() decodes them with ntohll()/ntohl(), i.e. the fields
 * hold network byte order.
 */
struct krping_rdma_info {
        uint64_t buf;           /* address of the advertised buffer */
        uint32_t rkey;          /* rkey of the advertised buffer */
        uint32_t size;          /* length of the advertised buffer */
};
162
/*
 * Default max buffer size for IO...
 *
 * The product is parenthesized so the macro expands to a single
 * operand; without the parentheses "x % RPING_BUFSIZE" would be
 * evaluated as "(x % 128) * 1024".
 */
#define RPING_BUFSIZE (128 * 1024)

/* Default send-queue depth. */
#define RPING_SQ_DEPTH 64
168
169 /*
170  * Control block struct.
171  */
172 struct krping_cb {
173         int server;                     /* 0 iff client */
174         struct ib_cq *cq;
175         struct ib_pd *pd;
176         struct ib_qp *qp;
177
178         struct ib_mr *dma_mr;
179
180         struct ib_fast_reg_page_list *page_list;
181         int page_list_len;
182         struct ib_reg_wr reg_mr_wr;
183         struct ib_send_wr invalidate_wr;
184         struct ib_mr *reg_mr;
185         int server_invalidate;
186         int read_inv;
187         u8 key;
188
189         struct ib_recv_wr rq_wr;        /* recv work request record */
190         struct ib_sge recv_sgl;         /* recv single SGE */
191         struct krping_rdma_info recv_buf __aligned(16); /* malloc'd buffer */
192         u64 recv_dma_addr;
193         DECLARE_PCI_UNMAP_ADDR(recv_mapping)
194
195         struct ib_send_wr sq_wr;        /* send work requrest record */
196         struct ib_sge send_sgl;
197         struct krping_rdma_info send_buf __aligned(16); /* single send buf */
198         u64 send_dma_addr;
199         DECLARE_PCI_UNMAP_ADDR(send_mapping)
200
201         struct ib_rdma_wr rdma_sq_wr;   /* rdma work request record */
202         struct ib_sge rdma_sgl;         /* rdma single SGE */
203         char *rdma_buf;                 /* used as rdma sink */
204         u64  rdma_dma_addr;
205         DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
206         struct ib_mr *rdma_mr;
207
208         uint32_t remote_rkey;           /* remote guys RKEY */
209         uint64_t remote_addr;           /* remote guys TO */
210         uint32_t remote_len;            /* remote guys LEN */
211
212         char *start_buf;                /* rdma read src */
213         u64  start_dma_addr;
214         DECLARE_PCI_UNMAP_ADDR(start_mapping)
215         struct ib_mr *start_mr;
216
217         enum test_state state;          /* used for cond/signalling */
218         wait_queue_head_t sem;
219         struct krping_stats stats;
220
221         uint16_t port;                  /* dst port in NBO */
222         u8 addr[16] __aligned(8);       /* dst addr in NBO */
223         char *addr_str;                 /* dst addr string */
224         uint8_t addr_type;              /* ADDR_FAMILY - IPv4/V6 */
225         int verbose;                    /* verbose logging */
226         int count;                      /* ping count */
227         int size;                       /* ping data size */
228         int validate;                   /* validate ping data */
229         int wlat;                       /* run wlat test */
230         int rlat;                       /* run rlat test */
231         int bw;                         /* run bw test */
232         int duplex;                     /* run bw full duplex test */
233         int poll;                       /* poll or block for rlat test */
234         int txdepth;                    /* SQ depth */
235         int local_dma_lkey;             /* use 0 for lkey */
236         int frtest;                     /* reg test */
237
238         /* CM stuff */
239         struct rdma_cm_id *cm_id;       /* connection on client side,*/
240                                         /* listener on server side. */
241         struct rdma_cm_id *child_cm_id; /* connection on server side */
242         struct list_head list;
243 };
244
245 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
246                                    struct rdma_cm_event *event)
247 {
248         int ret;
249         struct krping_cb *cb = cma_id->context;
250
251         DEBUG_LOG("cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
252                   (cma_id == cb->cm_id) ? "parent" : "child");
253
254         switch (event->event) {
255         case RDMA_CM_EVENT_ADDR_RESOLVED:
256                 cb->state = ADDR_RESOLVED;
257                 ret = rdma_resolve_route(cma_id, 2000);
258                 if (ret) {
259                         printk(KERN_ERR PFX "rdma_resolve_route error %d\n", 
260                                ret);
261                         wake_up_interruptible(&cb->sem);
262                 }
263                 break;
264
265         case RDMA_CM_EVENT_ROUTE_RESOLVED:
266                 cb->state = ROUTE_RESOLVED;
267                 wake_up_interruptible(&cb->sem);
268                 break;
269
270         case RDMA_CM_EVENT_CONNECT_REQUEST:
271                 cb->state = CONNECT_REQUEST;
272                 cb->child_cm_id = cma_id;
273                 DEBUG_LOG("child cma %p\n", cb->child_cm_id);
274                 wake_up_interruptible(&cb->sem);
275                 break;
276
277         case RDMA_CM_EVENT_ESTABLISHED:
278                 DEBUG_LOG("ESTABLISHED\n");
279                 if (!cb->server) {
280                         cb->state = CONNECTED;
281                 }
282                 wake_up_interruptible(&cb->sem);
283                 break;
284
285         case RDMA_CM_EVENT_ADDR_ERROR:
286         case RDMA_CM_EVENT_ROUTE_ERROR:
287         case RDMA_CM_EVENT_CONNECT_ERROR:
288         case RDMA_CM_EVENT_UNREACHABLE:
289         case RDMA_CM_EVENT_REJECTED:
290                 printk(KERN_ERR PFX "cma event %d, error %d\n", event->event,
291                        event->status);
292                 cb->state = ERROR;
293                 wake_up_interruptible(&cb->sem);
294                 break;
295
296         case RDMA_CM_EVENT_DISCONNECTED:
297                 printk(KERN_ERR PFX "DISCONNECT EVENT...\n");
298                 cb->state = ERROR;
299                 wake_up_interruptible(&cb->sem);
300                 break;
301
302         case RDMA_CM_EVENT_DEVICE_REMOVAL:
303                 printk(KERN_ERR PFX "cma detected device removal!!!!\n");
304                 cb->state = ERROR;
305                 wake_up_interruptible(&cb->sem);
306                 break;
307
308         default:
309                 printk(KERN_ERR PFX "oof bad type!\n");
310                 wake_up_interruptible(&cb->sem);
311                 break;
312         }
313         return 0;
314 }
315
316 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
317 {
318         if (wc->byte_len != sizeof(cb->recv_buf)) {
319                 printk(KERN_ERR PFX "Received bogus data, size %d\n", 
320                        wc->byte_len);
321                 return -1;
322         }
323
324         cb->remote_rkey = ntohl(cb->recv_buf.rkey);
325         cb->remote_addr = ntohll(cb->recv_buf.buf);
326         cb->remote_len  = ntohl(cb->recv_buf.size);
327         DEBUG_LOG("Received rkey %x addr %llx len %d from peer\n",
328                   cb->remote_rkey, (unsigned long long)cb->remote_addr, 
329                   cb->remote_len);
330
331         if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
332                 cb->state = RDMA_READ_ADV;
333         else
334                 cb->state = RDMA_WRITE_ADV;
335
336         return 0;
337 }
338
339 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
340 {
341         if (wc->byte_len != sizeof(cb->recv_buf)) {
342                 printk(KERN_ERR PFX "Received bogus data, size %d\n", 
343                        wc->byte_len);
344                 return -1;
345         }
346
347         if (cb->state == RDMA_READ_ADV)
348                 cb->state = RDMA_WRITE_ADV;
349         else
350                 cb->state = RDMA_WRITE_COMPLETE;
351
352         return 0;
353 }
354
355 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
356 {
357         struct krping_cb *cb = ctx;
358         struct ib_wc wc;
359         struct ib_recv_wr *bad_wr;
360         int ret;
361
362         BUG_ON(cb->cq != cq);
363         if (cb->state == ERROR) {
364                 printk(KERN_ERR PFX "cq completion in ERROR state\n");
365                 return;
366         }
367         if (cb->frtest) {
368                 printk(KERN_ERR PFX "cq completion event in frtest!\n");
369                 return;
370         }
371         if (!cb->wlat && !cb->rlat && !cb->bw)
372                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
373         while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
374                 if (wc.status) {
375                         if (wc.status == IB_WC_WR_FLUSH_ERR) {
376                                 DEBUG_LOG("cq flushed\n");
377                                 continue;
378                         } else {
379                                 printk(KERN_ERR PFX "cq completion failed with "
380                                        "wr_id %jx status %d opcode %d vender_err %x\n",
381                                         (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
382                                 goto error;
383                         }
384                 }
385
386                 switch (wc.opcode) {
387                 case IB_WC_SEND:
388                         DEBUG_LOG("send completion\n");
389                         cb->stats.send_bytes += cb->send_sgl.length;
390                         cb->stats.send_msgs++;
391                         break;
392
393                 case IB_WC_RDMA_WRITE:
394                         DEBUG_LOG("rdma write completion\n");
395                         cb->stats.write_bytes += cb->rdma_sq_wr.wr.sg_list->length;
396                         cb->stats.write_msgs++;
397                         cb->state = RDMA_WRITE_COMPLETE;
398                         wake_up_interruptible(&cb->sem);
399                         break;
400
401                 case IB_WC_RDMA_READ:
402                         DEBUG_LOG("rdma read completion\n");
403                         cb->stats.read_bytes += cb->rdma_sq_wr.wr.sg_list->length;
404                         cb->stats.read_msgs++;
405                         cb->state = RDMA_READ_COMPLETE;
406                         wake_up_interruptible(&cb->sem);
407                         break;
408
409                 case IB_WC_RECV:
410                         DEBUG_LOG("recv completion\n");
411                         cb->stats.recv_bytes += sizeof(cb->recv_buf);
412                         cb->stats.recv_msgs++;
413                         if (cb->wlat || cb->rlat || cb->bw)
414                                 ret = server_recv(cb, &wc);
415                         else
416                                 ret = cb->server ? server_recv(cb, &wc) :
417                                                    client_recv(cb, &wc);
418                         if (ret) {
419                                 printk(KERN_ERR PFX "recv wc error: %d\n", ret);
420                                 goto error;
421                         }
422
423                         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
424                         if (ret) {
425                                 printk(KERN_ERR PFX "post recv error: %d\n", 
426                                        ret);
427                                 goto error;
428                         }
429                         wake_up_interruptible(&cb->sem);
430                         break;
431
432                 default:
433                         printk(KERN_ERR PFX
434                                "%s:%d Unexpected opcode %d, Shutting down\n",
435                                __func__, __LINE__, wc.opcode);
436                         goto error;
437                 }
438         }
439         if (ret) {
440                 printk(KERN_ERR PFX "poll error %d\n", ret);
441                 goto error;
442         }
443         return;
444 error:
445         cb->state = ERROR;
446         wake_up_interruptible(&cb->sem);
447 }
448
449 static int krping_accept(struct krping_cb *cb)
450 {
451         struct rdma_conn_param conn_param;
452         int ret;
453
454         DEBUG_LOG("accepting client connection request\n");
455
456         memset(&conn_param, 0, sizeof conn_param);
457         conn_param.responder_resources = 1;
458         conn_param.initiator_depth = 1;
459
460         ret = rdma_accept(cb->child_cm_id, &conn_param);
461         if (ret) {
462                 printk(KERN_ERR PFX "rdma_accept error: %d\n", ret);
463                 return ret;
464         }
465
466         if (!cb->wlat && !cb->rlat && !cb->bw) {
467                 wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
468                 if (cb->state == ERROR) {
469                         printk(KERN_ERR PFX "wait for CONNECTED state %d\n", 
470                                 cb->state);
471                         return -1;
472                 }
473         }
474         return 0;
475 }
476
477 static void krping_setup_wr(struct krping_cb *cb)
478 {
479         cb->recv_sgl.addr = cb->recv_dma_addr;
480         cb->recv_sgl.length = sizeof cb->recv_buf;
481         cb->recv_sgl.lkey = cb->pd->local_dma_lkey;
482         cb->rq_wr.sg_list = &cb->recv_sgl;
483         cb->rq_wr.num_sge = 1;
484
485         cb->send_sgl.addr = cb->send_dma_addr;
486         cb->send_sgl.length = sizeof cb->send_buf;
487         cb->send_sgl.lkey = cb->pd->local_dma_lkey;
488
489         cb->sq_wr.opcode = IB_WR_SEND;
490         cb->sq_wr.send_flags = IB_SEND_SIGNALED;
491         cb->sq_wr.sg_list = &cb->send_sgl;
492         cb->sq_wr.num_sge = 1;
493
494         if (cb->server || cb->wlat || cb->rlat || cb->bw) {
495                 cb->rdma_sgl.addr = cb->rdma_dma_addr;
496                 cb->rdma_sq_wr.wr.send_flags = IB_SEND_SIGNALED;
497                 cb->rdma_sq_wr.wr.sg_list = &cb->rdma_sgl;
498                 cb->rdma_sq_wr.wr.num_sge = 1;
499         }
500
501         /* 
502          * A chain of 2 WRs, INVALDATE_MR + REG_MR.
503          * both unsignaled.  The client uses them to reregister
504          * the rdma buffers with a new key each iteration.
505          */
506         cb->reg_mr_wr.wr.opcode = IB_WR_REG_MR;
507         cb->reg_mr_wr.mr = cb->reg_mr;
508
509         cb->invalidate_wr.next = &cb->reg_mr_wr.wr;
510         cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
511 }
512
513 static int krping_setup_buffers(struct krping_cb *cb)
514 {
515         int ret;
516
517         DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
518
519         cb->recv_dma_addr = ib_dma_map_single(cb->pd->device,
520                                    &cb->recv_buf, 
521                                    sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
522         pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
523         cb->send_dma_addr = ib_dma_map_single(cb->pd->device,
524                                            &cb->send_buf, sizeof(cb->send_buf),
525                                            DMA_BIDIRECTIONAL);
526         pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
527
528         cb->rdma_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size,
529                                              &cb->rdma_dma_addr,
530                                              GFP_KERNEL);
531         if (!cb->rdma_buf) {
532                 DEBUG_LOG(PFX "rdma_buf allocation failed\n");
533                 ret = -ENOMEM;
534                 goto bail;
535         }
536         pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
537         cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE)
538                                 >> PAGE_SHIFT;
539         cb->reg_mr = ib_alloc_mr(cb->pd,  IB_MR_TYPE_MEM_REG,
540                                  cb->page_list_len);
541         if (IS_ERR(cb->reg_mr)) {
542                 ret = PTR_ERR(cb->reg_mr);
543                 DEBUG_LOG(PFX "recv_buf reg_mr failed %d\n", ret);
544                 goto bail;
545         }
546         DEBUG_LOG(PFX "reg rkey 0x%x page_list_len %u\n",
547                 cb->reg_mr->rkey, cb->page_list_len);
548
549         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
550
551                 cb->start_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size,
552                                                       &cb->start_dma_addr,
553                                                       GFP_KERNEL);
554                 if (!cb->start_buf) {
555                         DEBUG_LOG(PFX "start_buf malloc failed\n");
556                         ret = -ENOMEM;
557                         goto bail;
558                 }
559                 pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
560         }
561
562         krping_setup_wr(cb);
563         DEBUG_LOG(PFX "allocated & registered buffers...\n");
564         return 0;
565 bail:
566         if (cb->reg_mr && !IS_ERR(cb->reg_mr))
567                 ib_dereg_mr(cb->reg_mr);
568         if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
569                 ib_dereg_mr(cb->rdma_mr);
570         if (cb->dma_mr && !IS_ERR(cb->dma_mr))
571                 ib_dereg_mr(cb->dma_mr);
572         if (cb->rdma_buf) {
573                 ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf,
574                                      cb->rdma_dma_addr);
575         }
576         if (cb->start_buf) {
577                 ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf,
578                                      cb->start_dma_addr);
579         }
580         return ret;
581 }
582
583 static void krping_free_buffers(struct krping_cb *cb)
584 {
585         DEBUG_LOG("krping_free_buffers called on cb %p\n", cb);
586         
587         if (cb->dma_mr)
588                 ib_dereg_mr(cb->dma_mr);
589         if (cb->rdma_mr)
590                 ib_dereg_mr(cb->rdma_mr);
591         if (cb->start_mr)
592                 ib_dereg_mr(cb->start_mr);
593         if (cb->reg_mr)
594                 ib_dereg_mr(cb->reg_mr);
595
596         dma_unmap_single(cb->pd->device->dma_device,
597                          pci_unmap_addr(cb, recv_mapping),
598                          sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
599         dma_unmap_single(cb->pd->device->dma_device,
600                          pci_unmap_addr(cb, send_mapping),
601                          sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
602
603         ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf,
604                              cb->rdma_dma_addr);
605
606         if (cb->start_buf) {
607                 ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf,
608                                      cb->start_dma_addr);
609         }
610 }
611
612 static int krping_create_qp(struct krping_cb *cb)
613 {
614         struct ib_qp_init_attr init_attr;
615         int ret;
616
617         memset(&init_attr, 0, sizeof(init_attr));
618         init_attr.cap.max_send_wr = cb->txdepth;
619         init_attr.cap.max_recv_wr = 2;
620         
621         /* For flush_qp() */
622         init_attr.cap.max_send_wr++;
623         init_attr.cap.max_recv_wr++;
624
625         init_attr.cap.max_recv_sge = 1;
626         init_attr.cap.max_send_sge = 1;
627         init_attr.qp_type = IB_QPT_RC;
628         init_attr.send_cq = cb->cq;
629         init_attr.recv_cq = cb->cq;
630         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
631
632         if (cb->server) {
633                 ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
634                 if (!ret)
635                         cb->qp = cb->child_cm_id->qp;
636         } else {
637                 ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
638                 if (!ret)
639                         cb->qp = cb->cm_id->qp;
640         }
641
642         return ret;
643 }
644
645 static void krping_free_qp(struct krping_cb *cb)
646 {
647         ib_destroy_qp(cb->qp);
648         ib_destroy_cq(cb->cq);
649         ib_dealloc_pd(cb->pd);
650 }
651
652 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
653 {
654         int ret;
655         struct ib_cq_init_attr attr = {0};
656
657         cb->pd = ib_alloc_pd(cm_id->device, 0);
658         if (IS_ERR(cb->pd)) {
659                 printk(KERN_ERR PFX "ib_alloc_pd failed\n");
660                 return PTR_ERR(cb->pd);
661         }
662         DEBUG_LOG("created pd %p\n", cb->pd);
663
664         strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
665
666         attr.cqe = cb->txdepth * 2;
667         attr.comp_vector = 0;
668         cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
669                               cb, &attr);
670         if (IS_ERR(cb->cq)) {
671                 printk(KERN_ERR PFX "ib_create_cq failed\n");
672                 ret = PTR_ERR(cb->cq);
673                 goto err1;
674         }
675         DEBUG_LOG("created cq %p\n", cb->cq);
676
677         if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
678                 ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
679                 if (ret) {
680                         printk(KERN_ERR PFX "ib_create_cq failed\n");
681                         goto err2;
682                 }
683         }
684
685         ret = krping_create_qp(cb);
686         if (ret) {
687                 printk(KERN_ERR PFX "krping_create_qp failed: %d\n", ret);
688                 goto err2;
689         }
690         DEBUG_LOG("created qp %p\n", cb->qp);
691         return 0;
692 err2:
693         ib_destroy_cq(cb->cq);
694 err1:
695         ib_dealloc_pd(cb->pd);
696         return ret;
697 }
698
699 /*
700  * return the (possibly rebound) rkey for the rdma buffer.
701  * REG mode: invalidate and rebind via reg wr.
702  * other modes: just return the mr rkey.
703  */
704 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
705 {
706         u32 rkey;
707         struct ib_send_wr *bad_wr;
708         int ret;
709         struct scatterlist sg = {0};
710
711         cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey;
712
713         /*
714          * Update the reg key.
715          */
716         ib_update_fast_reg_key(cb->reg_mr, ++cb->key);
717         cb->reg_mr_wr.key = cb->reg_mr->rkey;
718
719         /*
720          * Update the reg WR with new buf info.
721          */
722         if (buf == (u64)cb->start_dma_addr)
723                 cb->reg_mr_wr.access = IB_ACCESS_REMOTE_READ;
724         else
725                 cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
726         sg_dma_address(&sg) = buf;
727         sg_dma_len(&sg) = cb->size;
728
729         ret = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE);
730         BUG_ON(ret <= 0 || ret > cb->page_list_len);
731
732         DEBUG_LOG(PFX "post_inv = %d, reg_mr new rkey 0x%x pgsz %u len %u"
733                 " iova_start %llx\n",
734                 post_inv,
735                 cb->reg_mr_wr.key,
736                 cb->reg_mr->page_size,
737                 cb->reg_mr->length,
738                 (unsigned long long)cb->reg_mr->iova);
739
740         if (post_inv)
741                 ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
742         else
743                 ret = ib_post_send(cb->qp, &cb->reg_mr_wr.wr, &bad_wr);
744         if (ret) {
745                 printk(KERN_ERR PFX "post send error %d\n", ret);
746                 cb->state = ERROR;
747         }
748         rkey = cb->reg_mr->rkey;
749         return rkey;
750 }
751
752 static void krping_format_send(struct krping_cb *cb, u64 buf)
753 {
754         struct krping_rdma_info *info = &cb->send_buf;
755         u32 rkey;
756
757         /*
758          * Client side will do reg or mw bind before
759          * advertising the rdma buffer.  Server side
760          * sends have no data.
761          */
762         if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
763                 rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
764                 info->buf = htonll(buf);
765                 info->rkey = htonl(rkey);
766                 info->size = htonl(cb->size);
767                 DEBUG_LOG("RDMA addr %llx rkey %x len %d\n",
768                           (unsigned long long)buf, rkey, cb->size);
769         }
770 }
771
772 static void krping_test_server(struct krping_cb *cb)
773 {
774         struct ib_send_wr *bad_wr, inv;
775         int ret;
776
777         while (1) {
778                 /* Wait for client's Start STAG/TO/Len */
779                 wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
780                 if (cb->state != RDMA_READ_ADV) {
781                         printk(KERN_ERR PFX "wait for RDMA_READ_ADV state %d\n",
782                                 cb->state);
783                         break;
784                 }
785
786                 DEBUG_LOG("server received sink adv\n");
787
788                 cb->rdma_sq_wr.rkey = cb->remote_rkey;
789                 cb->rdma_sq_wr.remote_addr = cb->remote_addr;
790                 cb->rdma_sq_wr.wr.sg_list->length = cb->remote_len;
791                 cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, !cb->read_inv);
792                 cb->rdma_sq_wr.wr.next = NULL;
793
794                 /* Issue RDMA Read. */
795                 if (cb->read_inv)
796                         cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
797                 else {
798
799                         cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ;
800                         /* 
801                          * Immediately follow the read with a 
802                          * fenced LOCAL_INV.
803                          */
804                         cb->rdma_sq_wr.wr.next = &inv;
805                         memset(&inv, 0, sizeof inv);
806                         inv.opcode = IB_WR_LOCAL_INV;
807                         inv.ex.invalidate_rkey = cb->reg_mr->rkey;
808                         inv.send_flags = IB_SEND_FENCE;
809                 }
810
811                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
812                 if (ret) {
813                         printk(KERN_ERR PFX "post send error %d\n", ret);
814                         break;
815                 }
816                 cb->rdma_sq_wr.wr.next = NULL;
817
818                 DEBUG_LOG("server posted rdma read req \n");
819
820                 /* Wait for read completion */
821                 wait_event_interruptible(cb->sem, 
822                                          cb->state >= RDMA_READ_COMPLETE);
823                 if (cb->state != RDMA_READ_COMPLETE) {
824                         printk(KERN_ERR PFX 
825                                "wait for RDMA_READ_COMPLETE state %d\n",
826                                cb->state);
827                         break;
828                 }
829                 DEBUG_LOG("server received read complete\n");
830
831                 /* Display data in recv buf */
832                 if (cb->verbose)
833                         printk(KERN_INFO PFX "server ping data: %s\n",
834                                 cb->rdma_buf);
835
836                 /* Tell client to continue */
837                 if (cb->server && cb->server_invalidate) {
838                         cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
839                         cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
840                         DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey);
841                 } 
842                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
843                 if (ret) {
844                         printk(KERN_ERR PFX "post send error %d\n", ret);
845                         break;
846                 }
847                 DEBUG_LOG("server posted go ahead\n");
848
849                 /* Wait for client's RDMA STAG/TO/Len */
850                 wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
851                 if (cb->state != RDMA_WRITE_ADV) {
852                         printk(KERN_ERR PFX 
853                                "wait for RDMA_WRITE_ADV state %d\n",
854                                cb->state);
855                         break;
856                 }
857                 DEBUG_LOG("server received sink adv\n");
858
859                 /* RDMA Write echo data */
860                 cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
861                 cb->rdma_sq_wr.rkey = cb->remote_rkey;
862                 cb->rdma_sq_wr.remote_addr = cb->remote_addr;
863                 cb->rdma_sq_wr.wr.sg_list->length = strlen(cb->rdma_buf) + 1;
864                 if (cb->local_dma_lkey)
865                         cb->rdma_sgl.lkey = cb->pd->local_dma_lkey;
866                 else 
867                         cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
868                         
869                 DEBUG_LOG("rdma write from lkey %x laddr %llx len %d\n",
870                           cb->rdma_sq_wr.wr.sg_list->lkey,
871                           (unsigned long long)cb->rdma_sq_wr.wr.sg_list->addr,
872                           cb->rdma_sq_wr.wr.sg_list->length);
873
874                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
875                 if (ret) {
876                         printk(KERN_ERR PFX "post send error %d\n", ret);
877                         break;
878                 }
879
880                 /* Wait for completion */
881                 ret = wait_event_interruptible(cb->sem, cb->state >= 
882                                                          RDMA_WRITE_COMPLETE);
883                 if (cb->state != RDMA_WRITE_COMPLETE) {
884                         printk(KERN_ERR PFX 
885                                "wait for RDMA_WRITE_COMPLETE state %d\n",
886                                cb->state);
887                         break;
888                 }
889                 DEBUG_LOG("server rdma write complete \n");
890
891                 cb->state = CONNECTED;
892
893                 /* Tell client to begin again */
894                 if (cb->server && cb->server_invalidate) {
895                         cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
896                         cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
897                         DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey);
898                 } 
899                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
900                 if (ret) {
901                         printk(KERN_ERR PFX "post send error %d\n", ret);
902                         break;
903                 }
904                 DEBUG_LOG("server posted go ahead\n");
905         }
906 }
907
908 static void rlat_test(struct krping_cb *cb)
909 {
910         int scnt;
911         int iters = cb->count;
912         struct timeval start_tv, stop_tv;
913         int ret;
914         struct ib_wc wc;
915         struct ib_send_wr *bad_wr;
916         int ne;
917
918         scnt = 0;
919         cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ;
920         cb->rdma_sq_wr.rkey = cb->remote_rkey;
921         cb->rdma_sq_wr.remote_addr = cb->remote_addr;
922         cb->rdma_sq_wr.wr.sg_list->length = cb->size;
923
924         microtime(&start_tv);
925         if (!cb->poll) {
926                 cb->state = RDMA_READ_ADV;
927                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
928         }
929         while (scnt < iters) {
930
931                 cb->state = RDMA_READ_ADV;
932                 ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
933                 if (ret) {
934                         printk(KERN_ERR PFX  
935                                 "Couldn't post send: ret=%d scnt %d\n",
936                                 ret, scnt);
937                         return;
938                 }
939
940                 do {
941                         if (!cb->poll) {
942                                 wait_event_interruptible(cb->sem, 
943                                         cb->state != RDMA_READ_ADV);
944                                 if (cb->state == RDMA_READ_COMPLETE) {
945                                         ne = 1;
946                                         ib_req_notify_cq(cb->cq, 
947                                                 IB_CQ_NEXT_COMP);
948                                 } else {
949                                         ne = -1;
950                                 }
951                         } else
952                                 ne = ib_poll_cq(cb->cq, 1, &wc);
953                         if (cb->state == ERROR) {
954                                 printk(KERN_ERR PFX 
955                                         "state == ERROR...bailing scnt %d\n", 
956                                         scnt);
957                                 return;
958                         }
959                 } while (ne == 0);
960
961                 if (ne < 0) {
962                         printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
963                         return;
964                 }
965                 if (cb->poll && wc.status != IB_WC_SUCCESS) {
966                         printk(KERN_ERR PFX "Completion wth error at %s:\n",
967                                 cb->server ? "server" : "client");
968                         printk(KERN_ERR PFX "Failed status %d: wr_id %d\n",
969                                 wc.status, (int) wc.wr_id);
970                         return;
971                 }
972                 ++scnt;
973         }
974         microtime(&stop_tv);
975
976         if (stop_tv.tv_usec < start_tv.tv_usec) {
977                 stop_tv.tv_usec += 1000000;
978                 stop_tv.tv_sec  -= 1;
979         }
980
981         printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d\n",
982                 (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
983                 (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
984                 scnt, cb->size);
985 }
986
987 static void wlat_test(struct krping_cb *cb)
988 {
989         int ccnt, scnt, rcnt;
990         int iters=cb->count;
991         volatile char *poll_buf = (char *) cb->start_buf;
992         char *buf = (char *)cb->rdma_buf;
993         struct timeval start_tv, stop_tv;
994         cycles_t *post_cycles_start, *post_cycles_stop;
995         cycles_t *poll_cycles_start, *poll_cycles_stop;
996         cycles_t *last_poll_cycles_start;
997         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
998         int i;
999         int cycle_iters = 1000;
1000
1001         ccnt = 0;
1002         scnt = 0;
1003         rcnt = 0;
1004
1005         post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1006         if (!post_cycles_start) {
1007                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1008                 return;
1009         }
1010         post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1011         if (!post_cycles_stop) {
1012                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1013                 return;
1014         }
1015         poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1016         if (!poll_cycles_start) {
1017                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1018                 return;
1019         }
1020         poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1021         if (!poll_cycles_stop) {
1022                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1023                 return;
1024         }
1025         last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
1026                 GFP_KERNEL);
1027         if (!last_poll_cycles_start) {
1028                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1029                 return;
1030         }
1031         cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
1032         cb->rdma_sq_wr.rkey = cb->remote_rkey;
1033         cb->rdma_sq_wr.remote_addr = cb->remote_addr;
1034         cb->rdma_sq_wr.wr.sg_list->length = cb->size;
1035
1036         if (cycle_iters > iters)
1037                 cycle_iters = iters;
1038         microtime(&start_tv);
1039         while (scnt < iters || ccnt < iters || rcnt < iters) {
1040
1041                 /* Wait till buffer changes. */
1042                 if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1043                         ++rcnt;
1044                         while (*poll_buf != (char)rcnt) {
1045                                 if (cb->state == ERROR) {
1046                                         printk(KERN_ERR PFX 
1047                                                 "state = ERROR, bailing\n");
1048                                         return;
1049                                 }
1050                         }
1051                 }
1052
1053                 if (scnt < iters) {
1054                         struct ib_send_wr *bad_wr;
1055
1056                         *buf = (char)scnt+1;
1057                         if (scnt < cycle_iters)
1058                                 post_cycles_start[scnt] = get_cycles();
1059                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
1060                                 printk(KERN_ERR PFX  
1061                                         "Couldn't post send: scnt=%d\n",
1062                                         scnt);
1063                                 return;
1064                         }
1065                         if (scnt < cycle_iters)
1066                                 post_cycles_stop[scnt] = get_cycles();
1067                         scnt++;
1068                 }
1069
1070                 if (ccnt < iters) {
1071                         struct ib_wc wc;
1072                         int ne;
1073
1074                         if (ccnt < cycle_iters)
1075                                 poll_cycles_start[ccnt] = get_cycles();
1076                         do {
1077                                 if (ccnt < cycle_iters)
1078                                         last_poll_cycles_start[ccnt] = 
1079                                                 get_cycles();
1080                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1081                         } while (ne == 0);
1082                         if (ccnt < cycle_iters)
1083                                 poll_cycles_stop[ccnt] = get_cycles();
1084                         ++ccnt;
1085
1086                         if (ne < 0) {
1087                                 printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
1088                                 return;
1089                         }
1090                         if (wc.status != IB_WC_SUCCESS) {
1091                                 printk(KERN_ERR PFX 
1092                                         "Completion wth error at %s:\n",
1093                                         cb->server ? "server" : "client");
1094                                 printk(KERN_ERR PFX 
1095                                         "Failed status %d: wr_id %d\n",
1096                                         wc.status, (int) wc.wr_id);
1097                                 printk(KERN_ERR PFX 
1098                                         "scnt=%d, rcnt=%d, ccnt=%d\n",
1099                                         scnt, rcnt, ccnt);
1100                                 return;
1101                         }
1102                 }
1103         }
1104         microtime(&stop_tv);
1105
1106         if (stop_tv.tv_usec < start_tv.tv_usec) {
1107                 stop_tv.tv_usec += 1000000;
1108                 stop_tv.tv_sec  -= 1;
1109         }
1110
1111         for (i=0; i < cycle_iters; i++) {
1112                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1113                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1114                 sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1115         }
1116         printk(KERN_ERR PFX 
1117                 "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1118                 " sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1119                 (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1120                 (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1121                 scnt, cb->size, cycle_iters,
1122                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1123                 (unsigned long long)sum_last_poll);
1124         kfree(post_cycles_start);
1125         kfree(post_cycles_stop);
1126         kfree(poll_cycles_start);
1127         kfree(poll_cycles_stop);
1128         kfree(last_poll_cycles_start);
1129 }
1130
1131 static void bw_test(struct krping_cb *cb)
1132 {
1133         int ccnt, scnt, rcnt;
1134         int iters=cb->count;
1135         struct timeval start_tv, stop_tv;
1136         cycles_t *post_cycles_start, *post_cycles_stop;
1137         cycles_t *poll_cycles_start, *poll_cycles_stop;
1138         cycles_t *last_poll_cycles_start;
1139         cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1140         int i;
1141         int cycle_iters = 1000;
1142
1143         ccnt = 0;
1144         scnt = 0;
1145         rcnt = 0;
1146
1147         post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1148         if (!post_cycles_start) {
1149                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1150                 return;
1151         }
1152         post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1153         if (!post_cycles_stop) {
1154                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1155                 return;
1156         }
1157         poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1158         if (!poll_cycles_start) {
1159                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1160                 return;
1161         }
1162         poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1163         if (!poll_cycles_stop) {
1164                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1165                 return;
1166         }
1167         last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), 
1168                 GFP_KERNEL);
1169         if (!last_poll_cycles_start) {
1170                 printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
1171                 return;
1172         }
1173         cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
1174         cb->rdma_sq_wr.rkey = cb->remote_rkey;
1175         cb->rdma_sq_wr.remote_addr = cb->remote_addr;
1176         cb->rdma_sq_wr.wr.sg_list->length = cb->size;
1177
1178         if (cycle_iters > iters)
1179                 cycle_iters = iters;
1180         microtime(&start_tv);
1181         while (scnt < iters || ccnt < iters) {
1182
1183                 while (scnt < iters && scnt - ccnt < cb->txdepth) {
1184                         struct ib_send_wr *bad_wr;
1185
1186                         if (scnt < cycle_iters)
1187                                 post_cycles_start[scnt] = get_cycles();
1188                         if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
1189                                 printk(KERN_ERR PFX  
1190                                         "Couldn't post send: scnt=%d\n",
1191                                         scnt);
1192                                 return;
1193                         }
1194                         if (scnt < cycle_iters)
1195                                 post_cycles_stop[scnt] = get_cycles();
1196                         ++scnt;
1197                 }
1198
1199                 if (ccnt < iters) {
1200                         int ne;
1201                         struct ib_wc wc;
1202
1203                         if (ccnt < cycle_iters)
1204                                 poll_cycles_start[ccnt] = get_cycles();
1205                         do {
1206                                 if (ccnt < cycle_iters)
1207                                         last_poll_cycles_start[ccnt] = 
1208                                                 get_cycles();
1209                                 ne = ib_poll_cq(cb->cq, 1, &wc);
1210                         } while (ne == 0);
1211                         if (ccnt < cycle_iters)
1212                                 poll_cycles_stop[ccnt] = get_cycles();
1213                         ccnt += 1;
1214
1215                         if (ne < 0) {
1216                                 printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
1217                                 return;
1218                         }
1219                         if (wc.status != IB_WC_SUCCESS) {
1220                                 printk(KERN_ERR PFX 
1221                                         "Completion wth error at %s:\n",
1222                                         cb->server ? "server" : "client");
1223                                 printk(KERN_ERR PFX 
1224                                         "Failed status %d: wr_id %d\n",
1225                                         wc.status, (int) wc.wr_id);
1226                                 return;
1227                         }
1228                 }
1229         }
1230         microtime(&stop_tv);
1231
1232         if (stop_tv.tv_usec < start_tv.tv_usec) {
1233                 stop_tv.tv_usec += 1000000;
1234                 stop_tv.tv_sec  -= 1;
1235         }
1236
1237         for (i=0; i < cycle_iters; i++) {
1238                 sum_post += post_cycles_stop[i] - post_cycles_start[i];
1239                 sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1240                 sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1241         }
1242         printk(KERN_ERR PFX 
1243                 "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1244                 " sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1245                 (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
1246                 (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
1247                 scnt, cb->size, cycle_iters, 
1248                 (unsigned long long)sum_post, (unsigned long long)sum_poll, 
1249                 (unsigned long long)sum_last_poll);
1250         kfree(post_cycles_start);
1251         kfree(post_cycles_stop);
1252         kfree(poll_cycles_start);
1253         kfree(poll_cycles_stop);
1254         kfree(last_poll_cycles_start);
1255 }
1256
1257 static void krping_rlat_test_server(struct krping_cb *cb)
1258 {
1259         struct ib_send_wr *bad_wr;
1260         struct ib_wc wc;
1261         int ret;
1262
1263         /* Spin waiting for client's Start STAG/TO/Len */
1264         while (cb->state < RDMA_READ_ADV) {
1265                 krping_cq_event_handler(cb->cq, cb);
1266         }
1267
1268         /* Send STAG/TO/Len to client */
1269         krping_format_send(cb, cb->start_dma_addr);
1270         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1271         if (ret) {
1272                 printk(KERN_ERR PFX "post send error %d\n", ret);
1273                 return;
1274         }
1275
1276         /* Spin waiting for send completion */
1277         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1278         if (ret < 0) {
1279                 printk(KERN_ERR PFX "poll error %d\n", ret);
1280                 return;
1281         }
1282         if (wc.status) {
1283                 printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
1284                 return;
1285         }
1286
1287         wait_event_interruptible(cb->sem, cb->state == ERROR);
1288 }
1289
1290 static void krping_wlat_test_server(struct krping_cb *cb)
1291 {
1292         struct ib_send_wr *bad_wr;
1293         struct ib_wc wc;
1294         int ret;
1295
1296         /* Spin waiting for client's Start STAG/TO/Len */
1297         while (cb->state < RDMA_READ_ADV) {
1298                 krping_cq_event_handler(cb->cq, cb);
1299         }
1300
1301         /* Send STAG/TO/Len to client */
1302         krping_format_send(cb, cb->start_dma_addr);
1303         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1304         if (ret) {
1305                 printk(KERN_ERR PFX "post send error %d\n", ret);
1306                 return;
1307         }
1308
1309         /* Spin waiting for send completion */
1310         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1311         if (ret < 0) {
1312                 printk(KERN_ERR PFX "poll error %d\n", ret);
1313                 return;
1314         }
1315         if (wc.status) {
1316                 printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
1317                 return;
1318         }
1319
1320         wlat_test(cb);
1321         wait_event_interruptible(cb->sem, cb->state == ERROR);
1322 }
1323
1324 static void krping_bw_test_server(struct krping_cb *cb)
1325 {
1326         struct ib_send_wr *bad_wr;
1327         struct ib_wc wc;
1328         int ret;
1329
1330         /* Spin waiting for client's Start STAG/TO/Len */
1331         while (cb->state < RDMA_READ_ADV) {
1332                 krping_cq_event_handler(cb->cq, cb);
1333         }
1334
1335         /* Send STAG/TO/Len to client */
1336         krping_format_send(cb, cb->start_dma_addr);
1337         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1338         if (ret) {
1339                 printk(KERN_ERR PFX "post send error %d\n", ret);
1340                 return;
1341         }
1342
1343         /* Spin waiting for send completion */
1344         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1345         if (ret < 0) {
1346                 printk(KERN_ERR PFX "poll error %d\n", ret);
1347                 return;
1348         }
1349         if (wc.status) {
1350                 printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
1351                 return;
1352         }
1353
1354         if (cb->duplex)
1355                 bw_test(cb);
1356         wait_event_interruptible(cb->sem, cb->state == ERROR);
1357 }
1358
1359 static int reg_supported(struct ib_device *dev)
1360 {
1361         u64 needed_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
1362
1363         if ((dev->attrs.device_cap_flags & needed_flags) != needed_flags) {
1364                 printk(KERN_ERR PFX 
1365                         "Fastreg not supported - device_cap_flags 0x%llx\n",
1366                         (unsigned long long)dev->attrs.device_cap_flags);
1367                 return 0;
1368         }
1369         DEBUG_LOG("Fastreg supported - device_cap_flags 0x%llx\n",
1370                 (unsigned long long)dev->attrs.device_cap_flags);
1371         return 1;
1372 }
1373
1374 static void fill_sockaddr(struct sockaddr_storage *sin, struct krping_cb *cb)
1375 {
1376         memset(sin, 0, sizeof(*sin));
1377
1378         if (cb->addr_type == AF_INET) {
1379                 struct sockaddr_in *sin4 = (struct sockaddr_in *)sin;
1380                 sin4->sin_len = sizeof(*sin4);
1381                 sin4->sin_family = AF_INET;
1382                 memcpy((void *)&sin4->sin_addr.s_addr, cb->addr, 4);
1383                 sin4->sin_port = cb->port;
1384         } else if (cb->addr_type == AF_INET6) {
1385                 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
1386                 sin6->sin6_len = sizeof(*sin6);
1387                 sin6->sin6_family = AF_INET6;
1388                 memcpy((void *)&sin6->sin6_addr, cb->addr, 16);
1389                 sin6->sin6_port = cb->port;
1390         }
1391 }
1392
1393 static int krping_bind_server(struct krping_cb *cb)
1394 {
1395         struct sockaddr_storage sin;
1396         int ret;
1397
1398
1399         fill_sockaddr(&sin, cb);
1400
1401         ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *)&sin);
1402         if (ret) {
1403                 printk(KERN_ERR PFX "rdma_bind_addr error %d\n", ret);
1404                 return ret;
1405         }
1406         DEBUG_LOG("rdma_bind_addr successful\n");
1407
1408         DEBUG_LOG("rdma_listen\n");
1409         ret = rdma_listen(cb->cm_id, 3);
1410         if (ret) {
1411                 printk(KERN_ERR PFX "rdma_listen failed: %d\n", ret);
1412                 return ret;
1413         }
1414
1415         wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1416         if (cb->state != CONNECT_REQUEST) {
1417                 printk(KERN_ERR PFX "wait for CONNECT_REQUEST state %d\n",
1418                         cb->state);
1419                 return -1;
1420         }
1421
1422         if (!reg_supported(cb->child_cm_id->device))
1423                 return -EINVAL;
1424
1425         return 0;
1426 }
1427
1428 static void krping_run_server(struct krping_cb *cb)
1429 {
1430         struct ib_recv_wr *bad_wr;
1431         int ret;
1432
1433         ret = krping_bind_server(cb);
1434         if (ret)
1435                 return;
1436
1437         ret = krping_setup_qp(cb, cb->child_cm_id);
1438         if (ret) {
1439                 printk(KERN_ERR PFX "setup_qp failed: %d\n", ret);
1440                 goto err0;
1441         }
1442
1443         ret = krping_setup_buffers(cb);
1444         if (ret) {
1445                 printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret);
1446                 goto err1;
1447         }
1448
1449         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1450         if (ret) {
1451                 printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
1452                 goto err2;
1453         }
1454
1455         ret = krping_accept(cb);
1456         if (ret) {
1457                 printk(KERN_ERR PFX "connect error %d\n", ret);
1458                 goto err2;
1459         }
1460
1461         if (cb->wlat)
1462                 krping_wlat_test_server(cb);
1463         else if (cb->rlat)
1464                 krping_rlat_test_server(cb);
1465         else if (cb->bw)
1466                 krping_bw_test_server(cb);
1467         else
1468                 krping_test_server(cb);
1469         rdma_disconnect(cb->child_cm_id);
1470 err2:
1471         krping_free_buffers(cb);
1472 err1:
1473         krping_free_qp(cb);
1474 err0:
1475         rdma_destroy_id(cb->child_cm_id);
1476 }
1477
1478 static void krping_test_client(struct krping_cb *cb)
1479 {
1480         int ping, start, cc, i, ret;
1481         struct ib_send_wr *bad_wr;
1482         unsigned char c;
1483
1484         start = 65;
1485         for (ping = 0; !cb->count || ping < cb->count; ping++) {
1486                 cb->state = RDMA_READ_ADV;
1487
1488                 /* Put some ascii text in the buffer. */
1489                 cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1490                 for (i = cc, c = start; i < cb->size; i++) {
1491                         cb->start_buf[i] = c;
1492                         c++;
1493                         if (c > 122)
1494                                 c = 65;
1495                 }
1496                 start++;
1497                 if (start > 122)
1498                         start = 65;
1499                 cb->start_buf[cb->size - 1] = 0;
1500
1501                 krping_format_send(cb, cb->start_dma_addr);
1502                 if (cb->state == ERROR) {
1503                         printk(KERN_ERR PFX "krping_format_send failed\n");
1504                         break;
1505                 }
1506                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1507                 if (ret) {
1508                         printk(KERN_ERR PFX "post send error %d\n", ret);
1509                         break;
1510                 }
1511
1512                 /* Wait for server to ACK */
1513                 wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1514                 if (cb->state != RDMA_WRITE_ADV) {
1515                         printk(KERN_ERR PFX 
1516                                "wait for RDMA_WRITE_ADV state %d\n",
1517                                cb->state);
1518                         break;
1519                 }
1520
1521                 krping_format_send(cb, cb->rdma_dma_addr);
1522                 ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1523                 if (ret) {
1524                         printk(KERN_ERR PFX "post send error %d\n", ret);
1525                         break;
1526                 }
1527
1528                 /* Wait for the server to say the RDMA Write is complete. */
1529                 wait_event_interruptible(cb->sem, 
1530                                          cb->state >= RDMA_WRITE_COMPLETE);
1531                 if (cb->state != RDMA_WRITE_COMPLETE) {
1532                         printk(KERN_ERR PFX 
1533                                "wait for RDMA_WRITE_COMPLETE state %d\n",
1534                                cb->state);
1535                         break;
1536                 }
1537
1538                 if (cb->validate)
1539                         if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1540                                 printk(KERN_ERR PFX "data mismatch!\n");
1541                                 break;
1542                         }
1543
1544                 if (cb->verbose)
1545                         printk(KERN_INFO PFX "ping data: %s\n", cb->rdma_buf);
1546 #ifdef SLOW_KRPING
1547                 wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1548 #endif
1549         }
1550 }
1551
1552 static void krping_rlat_test_client(struct krping_cb *cb)
1553 {
1554         struct ib_send_wr *bad_wr;
1555         struct ib_wc wc;
1556         int ret;
1557
1558         cb->state = RDMA_READ_ADV;
1559
1560         /* Send STAG/TO/Len to client */
1561         krping_format_send(cb, cb->start_dma_addr);
1562         if (cb->state == ERROR) {
1563                 printk(KERN_ERR PFX "krping_format_send failed\n");
1564                 return;
1565         }
1566         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1567         if (ret) {
1568                 printk(KERN_ERR PFX "post send error %d\n", ret);
1569                 return;
1570         }
1571
1572         /* Spin waiting for send completion */
1573         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1574         if (ret < 0) {
1575                 printk(KERN_ERR PFX "poll error %d\n", ret);
1576                 return;
1577         }
1578         if (wc.status) {
1579                 printk(KERN_ERR PFX "send completion error %d\n", wc.status);
1580                 return;
1581         }
1582
1583         /* Spin waiting for server's Start STAG/TO/Len */
1584         while (cb->state < RDMA_WRITE_ADV) {
1585                 krping_cq_event_handler(cb->cq, cb);
1586         }
1587
1588 #if 0
1589 {
1590         int i;
1591         struct timeval start, stop;
1592         time_t sec;
1593         suseconds_t usec;
1594         unsigned long long elapsed;
1595         struct ib_wc wc;
1596         struct ib_send_wr *bad_wr;
1597         int ne;
1598         
1599         cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
1600         cb->rdma_sq_wr.rkey = cb->remote_rkey;
1601         cb->rdma_sq_wr.remote_addr = cb->remote_addr;
1602         cb->rdma_sq_wr.wr.sg_list->length = 0;
1603         cb->rdma_sq_wr.wr.num_sge = 0;
1604
1605         microtime(&start);
1606         for (i=0; i < 100000; i++) {
1607                 if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
1608                         printk(KERN_ERR PFX  "Couldn't post send\n");
1609                         return;
1610                 }
1611                 do {
1612                         ne = ib_poll_cq(cb->cq, 1, &wc);
1613                 } while (ne == 0);
1614                 if (ne < 0) {
1615                         printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
1616                         return;
1617                 }
1618                 if (wc.status != IB_WC_SUCCESS) {
1619                         printk(KERN_ERR PFX "Completion wth error at %s:\n",
1620                                 cb->server ? "server" : "client");
1621                         printk(KERN_ERR PFX "Failed status %d: wr_id %d\n",
1622                                 wc.status, (int) wc.wr_id);
1623                         return;
1624                 }
1625         }
1626         microtime(&stop);
1627         
1628         if (stop.tv_usec < start.tv_usec) {
1629                 stop.tv_usec += 1000000;
1630                 stop.tv_sec  -= 1;
1631         }
1632         sec     = stop.tv_sec - start.tv_sec;
1633         usec    = stop.tv_usec - start.tv_usec;
1634         elapsed = sec * 1000000 + usec;
1635         printk(KERN_ERR PFX "0B-write-lat iters 100000 usec %llu\n", elapsed);
1636 }
1637 #endif
1638
1639         rlat_test(cb);
1640 }
1641
1642 static void krping_wlat_test_client(struct krping_cb *cb)
1643 {
1644         struct ib_send_wr *bad_wr;
1645         struct ib_wc wc;
1646         int ret;
1647
1648         cb->state = RDMA_READ_ADV;
1649
1650         /* Send STAG/TO/Len to client */
1651         krping_format_send(cb, cb->start_dma_addr);
1652         if (cb->state == ERROR) {
1653                 printk(KERN_ERR PFX "krping_format_send failed\n");
1654                 return;
1655         }
1656         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1657         if (ret) {
1658                 printk(KERN_ERR PFX "post send error %d\n", ret);
1659                 return;
1660         }
1661
1662         /* Spin waiting for send completion */
1663         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1664         if (ret < 0) {
1665                 printk(KERN_ERR PFX "poll error %d\n", ret);
1666                 return;
1667         }
1668         if (wc.status) {
1669                 printk(KERN_ERR PFX "send completion error %d\n", wc.status);
1670                 return;
1671         }
1672
1673         /* Spin waiting for server's Start STAG/TO/Len */
1674         while (cb->state < RDMA_WRITE_ADV) {
1675                 krping_cq_event_handler(cb->cq, cb);
1676         }
1677
1678         wlat_test(cb);
1679 }
1680
1681 static void krping_bw_test_client(struct krping_cb *cb)
1682 {
1683         struct ib_send_wr *bad_wr;
1684         struct ib_wc wc;
1685         int ret;
1686
1687         cb->state = RDMA_READ_ADV;
1688
1689         /* Send STAG/TO/Len to client */
1690         krping_format_send(cb, cb->start_dma_addr);
1691         if (cb->state == ERROR) {
1692                 printk(KERN_ERR PFX "krping_format_send failed\n");
1693                 return;
1694         }
1695         ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1696         if (ret) {
1697                 printk(KERN_ERR PFX "post send error %d\n", ret);
1698                 return;
1699         }
1700
1701         /* Spin waiting for send completion */
1702         while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1703         if (ret < 0) {
1704                 printk(KERN_ERR PFX "poll error %d\n", ret);
1705                 return;
1706         }
1707         if (wc.status) {
1708                 printk(KERN_ERR PFX "send completion error %d\n", wc.status);
1709                 return;
1710         }
1711
1712         /* Spin waiting for server's Start STAG/TO/Len */
1713         while (cb->state < RDMA_WRITE_ADV) {
1714                 krping_cq_event_handler(cb->cq, cb);
1715         }
1716
1717         bw_test(cb);
1718 }
1719
1720 /*
1721  * Manual qp flush test
1722  */
1723 static void flush_qp(struct krping_cb *cb)
1724 {
1725         struct ib_send_wr wr = { 0 }, *bad;
1726         struct ib_recv_wr recv_wr = { 0 }, *recv_bad;
1727         struct ib_wc wc;
1728         int ret;
1729         int flushed = 0;
1730         int ccnt = 0;
1731
1732         rdma_disconnect(cb->cm_id);
1733         DEBUG_LOG("disconnected!\n");
1734
1735         wr.opcode = IB_WR_SEND;
1736         wr.wr_id = 0xdeadbeefcafebabe;
1737         ret = ib_post_send(cb->qp, &wr, &bad);
1738         if (ret) {
1739                 printk(KERN_ERR PFX "%s post_send failed ret %d\n", __func__, ret);
1740                 return;
1741         }
1742
1743         recv_wr.wr_id = 0xcafebabedeadbeef;
1744         ret = ib_post_recv(cb->qp, &recv_wr, &recv_bad);
1745         if (ret) {
1746                 printk(KERN_ERR PFX "%s post_recv failed ret %d\n", __func__, ret);
1747                 return;
1748         }
1749
1750         /* poll until the flush WRs complete */
1751         do {
1752                 ret = ib_poll_cq(cb->cq, 1, &wc);
1753                 if (ret < 0) {
1754                         printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret);
1755                         return;
1756                 }
1757                 if (ret == 0)
1758                         continue;
1759                 ccnt++;
1760                 if (wc.wr_id == 0xdeadbeefcafebabe ||
1761                     wc.wr_id == 0xcafebabedeadbeef)
1762                         flushed++;
1763         } while (flushed != 2);
1764         DEBUG_LOG("qp_flushed! ccnt %u\n", ccnt);
1765 }
1766
1767 static void krping_fr_test(struct krping_cb *cb)
1768 {
1769         struct ib_send_wr inv, *bad;
1770         struct ib_reg_wr fr;
1771         struct ib_wc wc;
1772         u8 key = 0;
1773         struct ib_mr *mr;
1774         int ret;
1775         int size = cb->size;
1776         int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1777         unsigned long start;
1778         int count = 0;
1779         int scnt = 0;
1780         struct scatterlist sg = {0};
1781
1782         mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, plen);
1783         if (IS_ERR(mr)) {
1784                 printk(KERN_ERR PFX "ib_alloc_mr failed %ld\n", PTR_ERR(mr));
1785                 return;
1786         }
1787
1788         sg_dma_address(&sg) = (dma_addr_t)0xcafebabe0000ULL;
1789         sg_dma_len(&sg) = size;
1790         ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE);
1791         if (ret <= 0) {
1792                 printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret);
1793                 goto err2;
1794         }
1795
1796         memset(&fr, 0, sizeof fr);
1797         fr.wr.opcode = IB_WR_REG_MR;
1798         fr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1799         fr.mr = mr;
1800         fr.wr.next = &inv;
1801
1802         memset(&inv, 0, sizeof inv);
1803         inv.opcode = IB_WR_LOCAL_INV;
1804         inv.send_flags = IB_SEND_SIGNALED;
1805         
1806         DEBUG_LOG("fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1807         start = time_uptime;
1808         while (!cb->count || count <= cb->count) {
1809                 if (SIGPENDING(curthread)) {
1810                         printk(KERN_ERR PFX "signal!\n");
1811                         break;
1812                 }
1813                 if ((time_uptime - start) >= 9) {
1814                         DEBUG_LOG("fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
1815                         wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1816                         if (cb->state == ERROR)
1817                                 break;
1818                         start = time_uptime;
1819                 }       
1820                 while (scnt < (cb->txdepth>>1)) {
1821                         ib_update_fast_reg_key(mr, ++key);
1822                         fr.key = mr->rkey;
1823                         inv.ex.invalidate_rkey = mr->rkey;
1824
1825                         size = arc4random() % cb->size;
1826                         if (size == 0)
1827                                 size = cb->size;
1828                         sg_dma_len(&sg) = size;
1829                         ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE);
1830                         if (ret <= 0) {
1831                                 printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret);
1832                                 goto err2;
1833                         }
1834                         ret = ib_post_send(cb->qp, &fr.wr, &bad);
1835                         if (ret) {
1836                                 printk(KERN_ERR PFX "ib_post_send failed %d\n", ret);
1837                                 goto err2;      
1838                         }
1839                         scnt++;
1840                 }
1841
1842                 ret = ib_poll_cq(cb->cq, 1, &wc);
1843                 if (ret < 0) {
1844                         printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret);
1845                         goto err2;      
1846                 }
1847                 if (ret == 1) {
1848                         if (wc.status) {
1849                                 printk(KERN_ERR PFX "completion error %u\n", wc.status);
1850                                 goto err2;
1851                         }
1852                         count++;
1853                         scnt--;
1854                 }
1855         }
1856 err2:
1857         flush_qp(cb);
1858         DEBUG_LOG("fr_test: done!\n");
1859         ib_dereg_mr(mr);
1860 }
1861
1862 static int krping_connect_client(struct krping_cb *cb)
1863 {
1864         struct rdma_conn_param conn_param;
1865         int ret;
1866
1867         memset(&conn_param, 0, sizeof conn_param);
1868         conn_param.responder_resources = 1;
1869         conn_param.initiator_depth = 1;
1870         conn_param.retry_count = 10;
1871
1872         ret = rdma_connect(cb->cm_id, &conn_param);
1873         if (ret) {
1874                 printk(KERN_ERR PFX "rdma_connect error %d\n", ret);
1875                 return ret;
1876         }
1877
1878         wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
1879         if (cb->state == ERROR) {
1880                 printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state);
1881                 return -1;
1882         }
1883
1884         DEBUG_LOG("rdma_connect successful\n");
1885         return 0;
1886 }
1887
1888 static int krping_bind_client(struct krping_cb *cb)
1889 {
1890         struct sockaddr_storage sin;
1891         int ret;
1892
1893         fill_sockaddr(&sin, cb);
1894
1895         ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *)&sin, 2000);
1896         if (ret) {
1897                 printk(KERN_ERR PFX "rdma_resolve_addr error %d\n", ret);
1898                 return ret;
1899         }
1900
1901         wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
1902         if (cb->state != ROUTE_RESOLVED) {
1903                 printk(KERN_ERR PFX 
1904                        "addr/route resolution did not resolve: state %d\n",
1905                        cb->state);
1906                 return -EINTR;
1907         }
1908
1909         if (!reg_supported(cb->cm_id->device))
1910                 return -EINVAL;
1911
1912         DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n");
1913         return 0;
1914 }
1915
1916 static void krping_run_client(struct krping_cb *cb)
1917 {
1918         struct ib_recv_wr *bad_wr;
1919         int ret;
1920
1921         ret = krping_bind_client(cb);
1922         if (ret)
1923                 return;
1924
1925         ret = krping_setup_qp(cb, cb->cm_id);
1926         if (ret) {
1927                 printk(KERN_ERR PFX "setup_qp failed: %d\n", ret);
1928                 return;
1929         }
1930
1931         ret = krping_setup_buffers(cb);
1932         if (ret) {
1933                 printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret);
1934                 goto err1;
1935         }
1936
1937         ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1938         if (ret) {
1939                 printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
1940                 goto err2;
1941         }
1942
1943         ret = krping_connect_client(cb);
1944         if (ret) {
1945                 printk(KERN_ERR PFX "connect error %d\n", ret);
1946                 goto err2;
1947         }
1948
1949         if (cb->wlat)
1950                 krping_wlat_test_client(cb);
1951         else if (cb->rlat)
1952                 krping_rlat_test_client(cb);
1953         else if (cb->bw)
1954                 krping_bw_test_client(cb);
1955         else if (cb->frtest)
1956                 krping_fr_test(cb);
1957         else
1958                 krping_test_client(cb);
1959         rdma_disconnect(cb->cm_id);
1960 err2:
1961         krping_free_buffers(cb);
1962 err1:
1963         krping_free_qp(cb);
1964 }
1965
1966 static uint16_t
1967 krping_get_ipv6_scope_id(char *name)
1968 {
1969         struct ifnet *ifp;
1970         uint16_t retval;
1971
1972         if (name == NULL)
1973                 return (0);
1974         CURVNET_SET_QUIET(TD_TO_VNET(curthread));
1975         ifp = ifunit_ref(name);
1976         CURVNET_RESTORE();
1977         if (ifp == NULL)
1978                 return (0);
1979         retval = ifp->if_index;
1980         if_rele(ifp);
1981         return (retval);
1982 }
1983
1984 int krping_doit(char *cmd)
1985 {
1986         struct krping_cb *cb;
1987         int op;
1988         int ret = 0;
1989         char *optarg;
1990         char *scope;
1991         unsigned long optint;
1992
1993         cb = kzalloc(sizeof(*cb), GFP_KERNEL);
1994         if (!cb)
1995                 return -ENOMEM;
1996
1997         mutex_lock(&krping_mutex);
1998         list_add_tail(&cb->list, &krping_cbs);
1999         mutex_unlock(&krping_mutex);
2000
2001         cb->server = -1;
2002         cb->state = IDLE;
2003         cb->size = 64;
2004         cb->txdepth = RPING_SQ_DEPTH;
2005         init_waitqueue_head(&cb->sem);
2006
2007         while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2008                               &optint)) != 0) {
2009                 switch (op) {
2010                 case 'a':
2011                         cb->addr_str = optarg;
2012                         cb->addr_type = AF_INET;
2013                         DEBUG_LOG("ipaddr (%s)\n", optarg);
2014                         if (inet_pton(AF_INET, optarg, cb->addr) != 1) {
2015                                 printk(KERN_ERR PFX "bad addr string %s\n",
2016                                     optarg);
2017                                 ret = EINVAL;
2018                         }
2019                         break;
2020                 case 'A':
2021                         cb->addr_str = optarg;
2022                         cb->addr_type = AF_INET6;
2023                         DEBUG_LOG("ipv6addr (%s)\n", optarg);
2024                         scope = strstr(optarg, "%");
2025                         /* extract scope ID, if any */
2026                         if (scope != NULL)
2027                                 *scope++ = 0;
2028                         /* extract IPv6 network address */
2029                         if (inet_pton(AF_INET6, optarg, cb->addr) != 1) {
2030                                 printk(KERN_ERR PFX "bad addr string %s\n",
2031                                     optarg);
2032                                 ret = EINVAL;
2033                         } else if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)cb->addr) ||
2034                             IN6_IS_ADDR_MC_INTFACELOCAL((struct in6_addr *)cb->addr)) {
2035                                 uint16_t scope_id = krping_get_ipv6_scope_id(scope);
2036                                 DEBUG_LOG("ipv6 scope ID = %d\n", scope_id);
2037                                 cb->addr[2] = scope_id >> 8;
2038                                 cb->addr[3] = scope_id & 0xFF;
2039                         }
2040                         break;
2041                 case 'p':
2042                         cb->port = htons(optint);
2043                         DEBUG_LOG("port %d\n", (int)optint);
2044                         break;
2045                 case 'P':
2046                         cb->poll = 1;
2047                         DEBUG_LOG("server\n");
2048                         break;
2049                 case 's':
2050                         cb->server = 1;
2051                         DEBUG_LOG("server\n");
2052                         break;
2053                 case 'c':
2054                         cb->server = 0;
2055                         DEBUG_LOG("client\n");
2056                         break;
2057                 case 'S':
2058                         cb->size = optint;
2059                         if ((cb->size < 1) ||
2060                             (cb->size > RPING_BUFSIZE)) {
2061                                 printk(KERN_ERR PFX "Invalid size %d "
2062                                        "(valid range is 1 to %d)\n",
2063                                        cb->size, RPING_BUFSIZE);
2064                                 ret = EINVAL;
2065                         } else
2066                                 DEBUG_LOG("size %d\n", (int)optint);
2067                         break;
2068                 case 'C':
2069                         cb->count = optint;
2070                         if (cb->count < 0) {
2071                                 printk(KERN_ERR PFX "Invalid count %d\n",
2072                                         cb->count);
2073                                 ret = EINVAL;
2074                         } else
2075                                 DEBUG_LOG("count %d\n", (int) cb->count);
2076                         break;
2077                 case 'v':
2078                         cb->verbose++;
2079                         DEBUG_LOG("verbose\n");
2080                         break;
2081                 case 'V':
2082                         cb->validate++;
2083                         DEBUG_LOG("validate data\n");
2084                         break;
2085                 case 'l':
2086                         cb->wlat++;
2087                         break;
2088                 case 'L':
2089                         cb->rlat++;
2090                         break;
2091                 case 'B':
2092                         cb->bw++;
2093                         break;
2094                 case 'd':
2095                         cb->duplex++;
2096                         break;
2097                 case 'I':
2098                         cb->server_invalidate = 1;
2099                         break;
2100                 case 'T':
2101                         cb->txdepth = optint;
2102                         DEBUG_LOG("txdepth %d\n", (int) cb->txdepth);
2103                         break;
2104                 case 'Z':
2105                         cb->local_dma_lkey = 1;
2106                         DEBUG_LOG("using local dma lkey\n");
2107                         break;
2108                 case 'R':
2109                         cb->read_inv = 1;
2110                         DEBUG_LOG("using read-with-inv\n");
2111                         break;
2112                 case 'f':
2113                         cb->frtest = 1;
2114                         DEBUG_LOG("fast-reg test!\n");
2115                         break;
2116                 default:
2117                         printk(KERN_ERR PFX "unknown opt %s\n", optarg);
2118                         ret = -EINVAL;
2119                         break;
2120                 }
2121         }
2122         if (ret)
2123                 goto out;
2124
2125         if (cb->server == -1) {
2126                 printk(KERN_ERR PFX "must be either client or server\n");
2127                 ret = -EINVAL;
2128                 goto out;
2129         }
2130
2131         if (cb->server && cb->frtest) {
2132                 printk(KERN_ERR PFX "must be client to run frtest\n");
2133                 ret = -EINVAL;
2134                 goto out;
2135         }
2136
2137         if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2138                 printk(KERN_ERR PFX "Pick only one test: fr, bw, rlat, wlat\n");
2139                 ret = -EINVAL;
2140                 goto out;
2141         }
2142
2143         if (cb->wlat || cb->rlat || cb->bw) {
2144                 printk(KERN_ERR PFX "wlat, rlat, and bw tests only support mem_mode MR - which is no longer supported\n");
2145                 ret = -EINVAL;
2146                 goto out;
2147         }
2148
2149         cb->cm_id = rdma_create_id(&init_net, krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
2150         if (IS_ERR(cb->cm_id)) {
2151                 ret = PTR_ERR(cb->cm_id);
2152                 printk(KERN_ERR PFX "rdma_create_id error %d\n", ret);
2153                 goto out;
2154         }
2155         DEBUG_LOG("created cm_id %p\n", cb->cm_id);
2156
2157         if (cb->server)
2158                 krping_run_server(cb);
2159         else
2160                 krping_run_client(cb);
2161
2162         DEBUG_LOG("destroy cm_id %p\n", cb->cm_id);
2163         rdma_destroy_id(cb->cm_id);
2164 out:
2165         mutex_lock(&krping_mutex);
2166         list_del(&cb->list);
2167         mutex_unlock(&krping_mutex);
2168         kfree(cb);
2169         return ret;
2170 }
2171
2172 void
2173 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2174 {
2175         struct krping_cb *cb;
2176
2177         mutex_lock(&krping_mutex);
2178         list_for_each_entry(cb, &krping_cbs, list)
2179             (*f)(cb->pd ? &cb->stats : NULL, arg);
2180         mutex_unlock(&krping_mutex);
2181 }