]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/iser/iser_verbs.c
Merge ^/head r319801 through r320041.
[FreeBSD/FreeBSD.git] / sys / dev / iser / iser_verbs.c
1 /* $FreeBSD$ */
2 /*-
3  * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include "icl_iser.h"
28
29 static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
30 static int iser_cq_poll_limit = 512;
31
32 static void
33 iser_cq_event_callback(struct ib_event *cause, void *context)
34 {
35         ISER_ERR("got cq event %d", cause->event);
36 }
37
38 static void
39 iser_qp_event_callback(struct ib_event *cause, void *context)
40 {
41         ISER_ERR("got qp event %d", cause->event);
42 }
43
44 static void
45 iser_event_handler(struct ib_event_handler *handler,
46                                 struct ib_event *event)
47 {
48         ISER_ERR("async event %d on device %s port %d",
49                  event->event, event->device->name,
50                  event->element.port_num);
51 }
52
53 /**
54  * is_iser_tx_desc - Indicate if the completion wr_id
55  *     is a TX descriptor or not.
56  * @iser_conn: iser connection
57  * @wr_id: completion WR identifier
58  *
59  * Since we cannot rely on wc opcode in FLUSH errors
60  * we must work around it by checking if the wr_id address
61  * falls in the iser connection rx_descs buffer. If so
62  * it is an RX descriptor, otherwize it is a TX.
63  */
64 static inline bool
65 is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
66 {
67         void *start = iser_conn->rx_descs;
68         u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
69         void *end = (void *)((uintptr_t)start + (uintptr_t)len);
70
71         if (start) {
72                 if (wr_id >= start && wr_id < end)
73                         return false;
74         } else {
75                 return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
76         }
77
78         return true;
79 }
80
81 /**
82  * iser_handle_comp_error() - Handle error completion
83  * @ib_conn:   connection RDMA resources
84  * @wc:        work completion
85  *
86  * Notes: Update post_recv_buf_count in case of recv error completion.
87  *        For non-FLUSH error completion we should also notify iscsi layer that
88  *        connection is failed (in case we passed bind stage).
89  */
90 static void
91 iser_handle_comp_error(struct ib_conn *ib_conn,
92                        struct ib_wc *wc)
93 {
94         void *wr_id = (void *)(uintptr_t)wc->wr_id;
95         struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
96                                                    ib_conn);
97
98         if (is_iser_tx_desc(iser_conn, wr_id)) {
99                 ISER_DBG("conn %p got send comp error", iser_conn);
100         } else {
101                 ISER_DBG("conn %p got recv comp error", iser_conn);
102                 ib_conn->post_recv_buf_count--;
103         }
104         if (wc->status != IB_WC_WR_FLUSH_ERR)
105                 iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
106 }
107
108 /**
109  * iser_handle_wc - handle a single work completion
110  * @wc: work completion
111  *
112  * Soft-IRQ context, work completion can be either
113  * SEND or RECV, and can turn out successful or
114  * with error (or flush error).
115  */
116 static void iser_handle_wc(struct ib_wc *wc)
117 {
118         struct ib_conn *ib_conn;
119         struct iser_tx_desc *tx_desc;
120         struct iser_rx_desc *rx_desc;
121
122         ib_conn = wc->qp->qp_context;
123         if (likely(wc->status == IB_WC_SUCCESS)) {
124                 if (wc->opcode == IB_WC_RECV) {
125                         rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
126                         iser_rcv_completion(rx_desc, wc->byte_len,
127                                             ib_conn);
128                 } else
129                 if (wc->opcode == IB_WC_SEND) {
130                         tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
131                         iser_snd_completion(tx_desc, ib_conn);
132                 } else {
133                         ISER_ERR("Unknown wc opcode %d", wc->opcode);
134                 }
135         } else {
136                 struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
137                                         ib_conn);
138                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
139                         ISER_ERR("conn %p wr id %llx status %d vend_err %x",
140                                  iser_conn, (unsigned long long)wc->wr_id,
141                                  wc->status, wc->vendor_err);
142                 } else {
143                         ISER_DBG("flush error: conn %p wr id %llx",
144                                  iser_conn, (unsigned long long)wc->wr_id);
145                 }
146
147                 if (wc->wr_id == ISER_BEACON_WRID) {
148                         /* all flush errors were consumed */
149                         mtx_lock(&ib_conn->beacon.flush_lock);
150                         ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
151                         cv_signal(&ib_conn->beacon.flush_cv);
152                         mtx_unlock(&ib_conn->beacon.flush_lock);
153                 } else {
154                         iser_handle_comp_error(ib_conn, wc);
155                 }
156         }
157 }
158
159 static void
160 iser_cq_tasklet_fn(void *data, int pending)
161 {
162         struct iser_comp *comp = (struct iser_comp *)data;
163         struct ib_cq *cq = comp->cq;
164         struct ib_wc *const wcs = comp->wcs;
165         int completed = 0;
166         int i;
167         int n;
168
169         while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
170                 for (i = 0; i < n; i++)
171                         iser_handle_wc(&wcs[i]);
172
173                 completed += n;
174                 if (completed >= iser_cq_poll_limit)
175                         break;
176         }
177
178         /*
179          * It is assumed here that arming CQ only once its empty
180          * would not cause interrupts to be missed.
181          */
182         ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
183 }
184
185 static void
186 iser_cq_callback(struct ib_cq *cq, void *cq_context)
187 {
188         struct iser_comp *comp = cq_context;
189
190         taskqueue_enqueue(comp->tq, &comp->task);
191 }
192
193 /**
194  * iser_create_device_ib_res - creates Protection Domain (PD), Completion
195  * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
196  * the adapator.
197  *
198  * returns 0 on success, -1 on failure
199  */
200 static int
201 iser_create_device_ib_res(struct iser_device *device)
202 {
203         struct ib_device_attr *dev_attr = &device->dev_attr;
204         int ret, i, max_cqe;
205
206         ret = ib_query_device(device->ib_device, dev_attr);
207         if (ret) {
208                 ISER_ERR("Query device failed for %s", device->ib_device->name);
209                 return (ret);
210         }
211
212         if (!(dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
213                 ISER_ERR("device %s doesn't support Fastreg, "
214                          "can't register memory", device->ib_device->name);
215                 return (1);
216         }
217
218         device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors);
219
220         device->comps = malloc(device->comps_used * sizeof(*device->comps),
221                 M_ISER_VERBS, M_WAITOK | M_ZERO);
222         if (!device->comps)
223                 goto comps_err;
224
225         max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
226
227         ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d",
228                  device->comps_used, device->ib_device->name,
229                  device->ib_device->num_comp_vectors, max_cqe);
230
231         device->pd = ib_alloc_pd(device->ib_device);
232         if (IS_ERR(device->pd))
233                 goto pd_err;
234
235         for (i = 0; i < device->comps_used; i++) {
236                 struct iser_comp *comp = &device->comps[i];
237
238                 comp->device = device;
239                 comp->cq = ib_create_cq(device->ib_device,
240                                         iser_cq_callback,
241                                         iser_cq_event_callback,
242                                         (void *)comp,
243                                         max_cqe, i);
244                 if (IS_ERR(comp->cq)) {
245                         comp->cq = NULL;
246                         goto cq_err;
247                 }
248
249                 if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
250                         goto cq_err;
251
252                 TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp);
253                 comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT,
254                                 taskqueue_thread_enqueue, &comp->tq);
255                 if (!comp->tq)
256                         goto tq_err;
257                 taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq");
258         }
259
260         device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
261                                    IB_ACCESS_REMOTE_WRITE |
262                                    IB_ACCESS_REMOTE_READ);
263         if (IS_ERR(device->mr))
264                 goto tq_err;
265
266         INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
267                                 iser_event_handler);
268         if (ib_register_event_handler(&device->event_handler))
269                 goto handler_err;
270
271         return (0);
272
273 handler_err:
274         ib_dereg_mr(device->mr);
275 tq_err:
276         for (i = 0; i < device->comps_used; i++) {
277                 struct iser_comp *comp = &device->comps[i];
278                 if (comp->tq)
279                         taskqueue_free(comp->tq);
280         }
281 cq_err:
282         for (i = 0; i < device->comps_used; i++) {
283                 struct iser_comp *comp = &device->comps[i];
284                 if (comp->cq)
285                         ib_destroy_cq(comp->cq);
286         }
287         ib_dealloc_pd(device->pd);
288 pd_err:
289         free(device->comps, M_ISER_VERBS);
290 comps_err:
291         ISER_ERR("failed to allocate an IB resource");
292         return (1);
293 }
294
295 /**
296  * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
297  * CQ and PD created with the device associated with the adapator.
298  */
299 static void
300 iser_free_device_ib_res(struct iser_device *device)
301 {
302         int i;
303
304         for (i = 0; i < device->comps_used; i++) {
305                 struct iser_comp *comp = &device->comps[i];
306
307                 taskqueue_free(comp->tq);
308                 ib_destroy_cq(comp->cq);
309                 comp->cq = NULL;
310         }
311
312         (void)ib_unregister_event_handler(&device->event_handler);
313         (void)ib_dereg_mr(device->mr);
314         (void)ib_dealloc_pd(device->pd);
315
316         free(device->comps, M_ISER_VERBS);
317         device->comps = NULL;
318
319         device->mr = NULL;
320         device->pd = NULL;
321 }
322
323 static int
324 iser_alloc_reg_res(struct ib_device *ib_device,
325                    struct ib_pd *pd,
326                    struct iser_reg_resources *res)
327 {
328         int ret;
329
330         res->frpl = ib_alloc_fast_reg_page_list(ib_device,
331                                                 ISCSI_ISER_SG_TABLESIZE + 1);
332         if (IS_ERR(res->frpl)) {
333                 ret = -PTR_ERR(res->frpl);
334                 ISER_ERR("Failed to allocate fast reg page list err=%d", ret);
335                 return (ret);
336         }
337
338         res->mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
339         if (IS_ERR(res->mr)) {
340                 ret = -PTR_ERR(res->mr);
341                 ISER_ERR("Failed to allocate  fast reg mr err=%d", ret);
342                 goto fast_reg_mr_failure;
343         }
344         res->mr_valid = 1;
345
346         return (0);
347
348 fast_reg_mr_failure:
349         ib_free_fast_reg_page_list(res->frpl);
350
351         return (ret);
352 }
353
354 static void
355 iser_free_reg_res(struct iser_reg_resources *rsc)
356 {
357         ib_dereg_mr(rsc->mr);
358         ib_free_fast_reg_page_list(rsc->frpl);
359 }
360
361 static struct fast_reg_descriptor *
362 iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd)
363 {
364         struct fast_reg_descriptor *desc;
365         int ret;
366
367         desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO);
368         if (!desc) {
369                 ISER_ERR("Failed to allocate a new fastreg descriptor");
370                 return (NULL);
371         }
372
373         ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
374         if (ret) {
375                 ISER_ERR("failed to allocate reg_resources");
376                 goto err;
377         }
378
379         return (desc);
380 err:
381         free(desc, M_ISER_VERBS);
382         return (NULL);
383 }
384
385 /**
386  * iser_create_fmr_pool - Creates FMR pool and page_vector
387  *
388  * returns 0 on success, or errno code on failure
389  */
390 int
391 iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
392 {
393         struct iser_device *device = ib_conn->device;
394         struct fast_reg_descriptor *desc;
395         int i;
396
397         INIT_LIST_HEAD(&ib_conn->fastreg.pool);
398         ib_conn->fastreg.pool_size = 0;
399         for (i = 0; i < cmds_max; i++) {
400                 desc = iser_create_fastreg_desc(device->ib_device, device->pd);
401                 if (!desc) {
402                         ISER_ERR("Failed to create fastreg descriptor");
403                         goto err;
404                 }
405
406                 list_add_tail(&desc->list, &ib_conn->fastreg.pool);
407                 ib_conn->fastreg.pool_size++;
408         }
409
410         return (0);
411
412 err:
413         iser_free_fastreg_pool(ib_conn);
414         return (ENOMEM);
415 }
416
417 /**
418  * iser_free_fmr_pool - releases the FMR pool and page vec
419  */
420 void
421 iser_free_fastreg_pool(struct ib_conn *ib_conn)
422 {
423         struct fast_reg_descriptor *desc, *tmp;
424         int i = 0;
425
426         if (list_empty(&ib_conn->fastreg.pool))
427                 return;
428
429         ISER_DBG("freeing conn %p fr pool", ib_conn);
430
431         list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
432                 list_del(&desc->list);
433                 iser_free_reg_res(&desc->rsc);
434                 free(desc, M_ISER_VERBS);
435                 ++i;
436         }
437
438         if (i < ib_conn->fastreg.pool_size)
439                 ISER_WARN("pool still has %d regions registered",
440                           ib_conn->fastreg.pool_size - i);
441 }
442
443 /**
444  * iser_create_ib_conn_res - Queue-Pair (QP)
445  *
446  * returns 0 on success, 1 on failure
447  */
448 static int
449 iser_create_ib_conn_res(struct ib_conn *ib_conn)
450 {
451         struct iser_conn *iser_conn;
452         struct iser_device *device;
453         struct ib_device_attr *dev_attr;
454         struct ib_qp_init_attr init_attr;
455         int index, min_index = 0;
456         int ret = -ENOMEM;
457
458         iser_conn = container_of(ib_conn, struct iser_conn, ib_conn);
459         device = ib_conn->device;
460         dev_attr = &device->dev_attr;
461
462         mtx_lock(&ig.connlist_mutex);
463         /* select the CQ with the minimal number of usages */
464         for (index = 0; index < device->comps_used; index++) {
465                 if (device->comps[index].active_qps <
466                     device->comps[min_index].active_qps)
467                         min_index = index;
468         }
469         ib_conn->comp = &device->comps[min_index];
470         ib_conn->comp->active_qps++;
471         mtx_unlock(&ig.connlist_mutex);
472         ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn);
473
474         memset(&init_attr, 0, sizeof init_attr);
475         init_attr.event_handler = iser_qp_event_callback;
476         init_attr.qp_context    = (void *)ib_conn;
477         init_attr.send_cq       = ib_conn->comp->cq;
478         init_attr.recv_cq       = ib_conn->comp->cq;
479         init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
480         init_attr.cap.max_send_sge = 2;
481         init_attr.cap.max_recv_sge = 1;
482         init_attr.sq_sig_type   = IB_SIGNAL_REQ_WR;
483         init_attr.qp_type       = IB_QPT_RC;
484
485         if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
486                 init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
487                 iser_conn->max_cmds =
488                         ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
489         } else {
490                 init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
491                 iser_conn->max_cmds =
492                         ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
493         }
494         ISER_DBG("device %s supports max_send_wr %d",
495                  device->ib_device->name, dev_attr->max_qp_wr);
496
497         ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
498         if (ret)
499                 goto out_err;
500
501         ib_conn->qp = ib_conn->cma_id->qp;
502         ISER_DBG("setting conn %p cma_id %p qp %p",
503                  ib_conn, ib_conn->cma_id,
504                  ib_conn->cma_id->qp);
505
506         return (ret);
507
508 out_err:
509         mtx_lock(&ig.connlist_mutex);
510         ib_conn->comp->active_qps--;
511         mtx_unlock(&ig.connlist_mutex);
512         ISER_ERR("unable to alloc mem or create resource, err %d", ret);
513
514         return (ret);
515 }
516
517 /**
518  * based on the resolved device node GUID see if there already allocated
519  * device for this device. If there's no such, create one.
520  */
521 static struct iser_device *
522 iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
523 {
524         struct iser_device *device;
525
526         sx_xlock(&ig.device_list_mutex);
527
528         list_for_each_entry(device, &ig.device_list, ig_list)
529                 /* find if there's a match using the node GUID */
530                 if (device->ib_device->node_guid == cma_id->device->node_guid)
531                         goto inc_refcnt;
532
533         device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
534         if (device == NULL)
535                 goto out;
536
537         /* assign this device to the device */
538         device->ib_device = cma_id->device;
539         /* init the device and link it into ig device list */
540         if (iser_create_device_ib_res(device)) {
541                 free(device, M_ISER_VERBS);
542                 device = NULL;
543                 goto out;
544         }
545         list_add(&device->ig_list, &ig.device_list);
546
547 inc_refcnt:
548         device->refcount++;
549         ISER_INFO("device %p refcount %d", device, device->refcount);
550 out:
551         sx_xunlock(&ig.device_list_mutex);
552         return (device);
553 }
554
555 /* if there's no demand for this device, release it */
556 static void
557 iser_device_try_release(struct iser_device *device)
558 {
559         sx_xlock(&ig.device_list_mutex);
560         device->refcount--;
561         ISER_INFO("device %p refcount %d", device, device->refcount);
562         if (!device->refcount) {
563                 iser_free_device_ib_res(device);
564                 list_del(&device->ig_list);
565                 free(device, M_ISER_VERBS);
566                 device = NULL;
567         }
568         sx_xunlock(&ig.device_list_mutex);
569 }
570
571 /**
572  * Called with state mutex held
573  **/
574 static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
575                                      enum iser_conn_state comp,
576                                      enum iser_conn_state exch)
577 {
578         int ret;
579
580         ret = (iser_conn->state == comp);
581         if (ret)
582                 iser_conn->state = exch;
583
584         return ret;
585 }
586
587 /**
588  * iser_free_ib_conn_res - release IB related resources
589  * @iser_conn: iser connection struct
590  * @destroy: indicator if we need to try to release the
591  *     iser device and memory regoins pool (only iscsi
592  *     shutdown and DEVICE_REMOVAL will use this).
593  *
594  * This routine is called with the iser state mutex held
595  * so the cm_id removal is out of here. It is Safe to
596  * be invoked multiple times.
597  */
598 void
599 iser_free_ib_conn_res(struct iser_conn *iser_conn,
600                                   bool destroy)
601 {
602         struct ib_conn *ib_conn = &iser_conn->ib_conn;
603         struct iser_device *device = ib_conn->device;
604
605         ISER_INFO("freeing conn %p cma_id %p qp %p",
606                   iser_conn, ib_conn->cma_id, ib_conn->qp);
607
608         if (ib_conn->qp != NULL) {
609                 mtx_lock(&ig.connlist_mutex);
610                 ib_conn->comp->active_qps--;
611                 mtx_unlock(&ig.connlist_mutex);
612                 rdma_destroy_qp(ib_conn->cma_id);
613                 ib_conn->qp = NULL;
614         }
615
616         if (destroy) {
617                 if (iser_conn->login_buf)
618                         iser_free_login_buf(iser_conn);
619
620                 if (iser_conn->rx_descs)
621                         iser_free_rx_descriptors(iser_conn);
622
623                 if (device != NULL) {
624                         iser_device_try_release(device);
625                         ib_conn->device = NULL;
626                 }
627         }
628 }
629
630 /**
631  * triggers start of the disconnect procedures and wait for them to be done
632  * Called with state mutex held
633  */
634 int
635 iser_conn_terminate(struct iser_conn *iser_conn)
636 {
637         struct ib_conn *ib_conn = &iser_conn->ib_conn;
638         struct ib_send_wr *bad_send_wr;
639         struct ib_recv_wr *bad_recv_wr;
640         int err = 0;
641
642         /* terminate the iser conn only if the conn state is UP */
643         if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
644                                            ISER_CONN_TERMINATING))
645                 return (0);
646
647         ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);
648
649         if (ib_conn->qp == NULL) {
650                 /* HOW can this be??? */
651                 ISER_WARN("qp wasn't created");
652                 return (1);
653         }
654
655         /*
656          * Todo: This is a temporary workaround.
657          * We serialize the connection closure using global lock in order to
658          * receive all posted beacons completions.
659          * Without Serialization, in case we open many connections (QPs) on
660          * the same CQ, we might miss beacons because of missing interrupts.
661          */
662         sx_xlock(&ig.close_conns_mutex);
663
664         /*
665          * In case we didn't already clean up the cma_id (peer initiated
666          * a disconnection), we need to Cause the CMA to change the QP
667          * state to ERROR.
668          */
669         if (ib_conn->cma_id) {
670                 err = rdma_disconnect(ib_conn->cma_id);
671                 if (err)
672                         ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
673                                 iser_conn, err);
674
675                 mtx_lock(&ib_conn->beacon.flush_lock);
676                 memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
677                 ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
678                 ib_conn->beacon.send.opcode = IB_WR_SEND;
679                 /* post an indication that all send flush errors were consumed */
680                 err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
681                 if (err) {
682                         ISER_ERR("conn %p failed to post send_beacon", ib_conn);
683                         mtx_unlock(&ib_conn->beacon.flush_lock);
684                         goto out;
685                 }
686
687                 ISER_DBG("before send cv_wait: %p", iser_conn);
688                 cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
689                 ISER_DBG("after send cv_wait: %p", iser_conn);
690
691                 memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
692                 ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
693                 /* post an indication that all recv flush errors were consumed */
694                 err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
695                 if (err) {
696                         ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
697                         mtx_unlock(&ib_conn->beacon.flush_lock);
698                         goto out;
699                 }
700
701                 ISER_DBG("before recv cv_wait: %p", iser_conn);
702                 cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
703                 mtx_unlock(&ib_conn->beacon.flush_lock);
704                 ISER_DBG("after recv cv_wait: %p", iser_conn);
705         }
706 out:
707         sx_xunlock(&ig.close_conns_mutex);
708         return (1);
709 }
710
711 /**
712  * Called with state mutex held
713  **/
714 static void
715 iser_connect_error(struct rdma_cm_id *cma_id)
716 {
717         struct iser_conn *iser_conn;
718
719         iser_conn = cma_id->context;
720
721         ISER_ERR("conn %p", iser_conn);
722
723         iser_conn->state = ISER_CONN_TERMINATING;
724
725         cv_signal(&iser_conn->up_cv);
726 }
727
728 /**
729  * Called with state mutex held
730  **/
731 static void
732 iser_addr_handler(struct rdma_cm_id *cma_id)
733 {
734         struct iser_device *device;
735         struct iser_conn   *iser_conn;
736         struct ib_conn   *ib_conn;
737         int    ret;
738
739         iser_conn = cma_id->context;
740
741         ib_conn = &iser_conn->ib_conn;
742         device = iser_device_find_by_ib_device(cma_id);
743         if (!device) {
744                 ISER_ERR("conn %p device lookup/creation failed",
745                          iser_conn);
746                 iser_connect_error(cma_id);
747                 return;
748         }
749
750         ib_conn->device = device;
751
752         ret = rdma_resolve_route(cma_id, 1000);
753         if (ret) {
754                 ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
755                 iser_connect_error(cma_id);
756                 return;
757         }
758 }
759
760 /**
761  * Called with state mutex held
762  **/
763 static void
764 iser_route_handler(struct rdma_cm_id *cma_id)
765 {
766         struct rdma_conn_param conn_param;
767         int    ret;
768         struct iser_cm_hdr req_hdr;
769         struct iser_conn *iser_conn = cma_id->context;
770         struct ib_conn *ib_conn = &iser_conn->ib_conn;
771         struct iser_device *device = ib_conn->device;
772
773         ret = iser_create_ib_conn_res(ib_conn);
774         if (ret)
775                 goto failure;
776
777         memset(&conn_param, 0, sizeof conn_param);
778         conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
779         conn_param.retry_count         = 7;
780         conn_param.rnr_retry_count     = 6;
781         /*
782          * Initiaotr depth should not be set, but in order to compat
783          * with old targets, we keep this value set.
784          */
785         conn_param.initiator_depth     = 1;
786
787         memset(&req_hdr, 0, sizeof(req_hdr));
788         req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
789                         ISER_SEND_W_INV_NOT_SUPPORTED);
790         conn_param.private_data         = (void *)&req_hdr;
791         conn_param.private_data_len     = sizeof(struct iser_cm_hdr);
792
793         ret = rdma_connect(cma_id, &conn_param);
794         if (ret) {
795                 ISER_ERR("conn %p failure connecting: %d", iser_conn, ret);
796                 goto failure;
797         }
798
799         return;
800 failure:
801         iser_connect_error(cma_id);
802 }
803
804 /**
805  * Called with state mutex held
806  **/
807 static void
808 iser_connected_handler(struct rdma_cm_id *cma_id)
809 {
810         struct iser_conn *iser_conn;
811         struct ib_qp_attr attr;
812         struct ib_qp_init_attr init_attr;
813
814         iser_conn = cma_id->context;
815
816         (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
817
818         ISER_INFO("remote qpn:%x my qpn:%x",
819                   attr.dest_qp_num, cma_id->qp->qp_num);
820
821         iser_conn->state = ISER_CONN_UP;
822
823         cv_signal(&iser_conn->up_cv);
824 }
825
826 /**
827  * Called with state mutex held
828  **/
829 static void
830 iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
831 {
832         struct iser_conn *iser_conn = cma_id->context;
833
834         if (iser_conn_terminate(iser_conn))
835                 iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
836
837 }
838
839 int
840 iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
841 {
842         struct iser_conn *iser_conn;
843         int ret = 0;
844
845         iser_conn = cma_id->context;
846         ISER_INFO("event %d status %d conn %p id %p",
847                   event->event, event->status, cma_id->context, cma_id);
848
849         sx_xlock(&iser_conn->state_mutex);
850         switch (event->event) {
851         case RDMA_CM_EVENT_ADDR_RESOLVED:
852                 iser_addr_handler(cma_id);
853                 break;
854         case RDMA_CM_EVENT_ROUTE_RESOLVED:
855                 iser_route_handler(cma_id);
856                 break;
857         case RDMA_CM_EVENT_ESTABLISHED:
858                 iser_connected_handler(cma_id);
859                 break;
860         case RDMA_CM_EVENT_ADDR_ERROR:
861         case RDMA_CM_EVENT_ROUTE_ERROR:
862         case RDMA_CM_EVENT_CONNECT_ERROR:
863         case RDMA_CM_EVENT_UNREACHABLE:
864         case RDMA_CM_EVENT_REJECTED:
865                 iser_connect_error(cma_id);
866                 break;
867         case RDMA_CM_EVENT_DISCONNECTED:
868         case RDMA_CM_EVENT_ADDR_CHANGE:
869         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
870                 iser_cleanup_handler(cma_id, false);
871                 break;
872         default:
873                 ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
874                 break;
875         }
876         sx_xunlock(&iser_conn->state_mutex);
877
878         return (ret);
879 }
880
881 int
882 iser_post_recvl(struct iser_conn *iser_conn)
883 {
884         struct ib_recv_wr rx_wr, *rx_wr_failed;
885         struct ib_conn *ib_conn = &iser_conn->ib_conn;
886         struct ib_sge     sge;
887         int ib_ret;
888
889         sge.addr   = iser_conn->login_resp_dma;
890         sge.length = ISER_RX_LOGIN_SIZE;
891         sge.lkey   = ib_conn->device->mr->lkey;
892
893         rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
894         rx_wr.sg_list = &sge;
895         rx_wr.num_sge = 1;
896         rx_wr.next    = NULL;
897
898         ib_conn->post_recv_buf_count++;
899         ib_ret  = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
900         if (ib_ret) {
901                 ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
902                 ib_conn->post_recv_buf_count--;
903         }
904
905         return (ib_ret);
906 }
907
908 int
909 iser_post_recvm(struct iser_conn *iser_conn, int count)
910 {
911         struct ib_recv_wr *rx_wr, *rx_wr_failed;
912         int i, ib_ret;
913         struct ib_conn *ib_conn = &iser_conn->ib_conn;
914         unsigned int my_rx_head = iser_conn->rx_desc_head;
915         struct iser_rx_desc *rx_desc;
916
917         for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
918                 rx_desc         = &iser_conn->rx_descs[my_rx_head];
919                 rx_wr->wr_id    = (uintptr_t)rx_desc;
920                 rx_wr->sg_list  = &rx_desc->rx_sg;
921                 rx_wr->num_sge  = 1;
922                 rx_wr->next     = rx_wr + 1;
923                 my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
924         }
925
926         rx_wr--;
927         rx_wr->next = NULL; /* mark end of work requests list */
928
929         ib_conn->post_recv_buf_count += count;
930         ib_ret  = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
931         if (ib_ret) {
932                 ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
933                 ib_conn->post_recv_buf_count -= count;
934         } else
935                 iser_conn->rx_desc_head = my_rx_head;
936
937         return (ib_ret);
938 }
939
940 /**
941  * iser_start_send - Initiate a Send DTO operation
942  *
943  * returns 0 on success, -1 on failure
944  */
945 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
946                    bool signal)
947 {
948         int               ib_ret;
949         struct ib_send_wr send_wr, *send_wr_failed;
950
951         ib_dma_sync_single_for_device(ib_conn->device->ib_device,
952                                       tx_desc->dma_addr, ISER_HEADERS_LEN,
953                                       DMA_TO_DEVICE);
954
955         send_wr.next       = NULL;
956         send_wr.wr_id      = (uintptr_t)tx_desc;
957         send_wr.sg_list    = tx_desc->tx_sg;
958         send_wr.num_sge    = tx_desc->num_sge;
959         send_wr.opcode     = IB_WR_SEND;
960         send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
961
962         ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
963         if (ib_ret)
964                 ISER_ERR("ib_post_send failed, ret:%d", ib_ret);
965
966         return (ib_ret);
967 }