3 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * iSCSI Common Layer for RDMA.
34 #include <sys/cdefs.h>
35 #include <sys/param.h>
36 #include <sys/capsicum.h>
37 #include <sys/condvar.h>
40 #include <sys/kernel.h>
41 #include <sys/kthread.h>
44 #include <sys/mutex.h>
45 #include <sys/module.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
53 #include <sys/taskqueue.h>
56 #include <netinet/in.h>
57 #include <netinet/tcp.h>
58 #include <dev/iscsi/icl.h>
59 #include <dev/iscsi/iscsi_proto.h>
60 #include <icl_conn_if.h>
62 #include <cam/cam_ccb.h>
63 #include <rdma/ib_verbs.h>
64 #include <rdma/ib_fmr_pool.h>
65 #include <rdma/rdma_cm.h>
68 #define ISER_DBG(X, ...) \
70 if (unlikely(iser_debug > 2)) \
71 printf("DEBUG: %s: " X "\n", \
72 __func__, ## __VA_ARGS__); \
75 #define ISER_INFO(X, ...) \
77 if (unlikely(iser_debug > 1)) \
78 printf("INFO: %s: " X "\n", \
79 __func__, ## __VA_ARGS__); \
82 #define ISER_WARN(X, ...) \
84 if (unlikely(iser_debug > 0)) { \
85 printf("WARNING: %s: " X "\n", \
86 __func__, ## __VA_ARGS__); \
/* Error log: always printed, not gated by the iser_debug tunable. */
90 #define ISER_ERR(X, ...) \
91 printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)
/* Sentinel wr_id values: let the completion handler distinguish fastreg
 * local-invalidate and beacon work requests from ordinary descriptors. */
97 #define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL
98 #define ISER_BEACON_WRID 0xfffffffffffffffeULL
/* 4K page size and alignment mask, derived from SHIFT_4K (defined on a line
 * not visible here; presumably 12 -- TODO confirm). */
101 #define SIZE_4K (1ULL << SHIFT_4K)
102 #define MASK_4K (~(SIZE_4K-1))
104 /* support up to 512KB in one RDMA */
/* 512KB expressed as a count of 4K pages. */
105 #define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K)
/* Default ceiling on concurrently outstanding iSCSI commands. */
106 #define ISER_DEF_XMIT_CMDS_MAX 256
108 /* the max RX (recv) WR supported by the iSER QP is defined by *
109 * max_recv_wr = commands_max + recv_beacon */
110 #define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX + 1)
/* Low-water mark: repost RX buffers once posted count falls to 1/4 of max. */
111 #define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2)
114 /* Maximal bounds on received asynchronous PDUs */
/* These counts feed the QP send/recv work-request sizing macros. */
115 #define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */
116 #define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */
118 /* the max TX (send) WR supported by the iSER QP is defined by *
119 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
120 * to have at max for SCSI command. The tx posting & completion handling code *
121 * supports -EAGAIN scheme where tx is suspended till the QP has room for more *
122 * send WR. D=8 comes from 64K/8K */
/* D in the max_send_wr formula above: expected in-flight data-outs per
 * SCSI command (64K transfer / 8K PDU). */
124 #define ISER_INFLIGHT_DATAOUTS 8
126 /* the send_beacon increase the max_send_wr by 1 */
/* max_send_wr = T*(1+D) + C + 1 (beacon), per the comment above. */
127 #define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \
128 (1 + ISER_INFLIGHT_DATAOUTS) + \
129 ISER_MAX_TX_MISC_PDUS + \
130 ISER_MAX_RX_MISC_PDUS + 1)
/* Inverse of the sizing above: how many commands a given send-WR budget
 * supports. */
132 #define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \
133 - ISER_MAX_TX_MISC_PDUS \
134 - ISER_MAX_RX_MISC_PDUS - 1) / \
135 (1 + ISER_INFLIGHT_DATAOUTS))
/* Work completions polled per CQ poll, and the interval (in descriptors)
 * at which send WRs are posted signaled -- presumably to bound unsignaled
 * send queue growth; confirm against the posting code. */
137 #define ISER_WC_BATCH_COUNT 16
138 #define ISER_SIGNAL_CMD_COUNT 32
140 /* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might *
141 * encounter a CQ overrun state. */
/* Worst-case RX/TX work-request totals for a CQ shared by MAX_CONN QPs. */
142 #define ISCSI_ISER_MAX_CONN 8
143 #define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
144 #define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)
145 #define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
/* Capability flag bits (presumably exchanged in the iSER login negotiation
 * -- confirm against the login handling code). */
148 #define ISER_ZBVA_NOT_SUPPORTED 0x80
149 #define ISER_SEND_W_INV_NOT_SUPPORTED 0x40
/* Default MaxRecvDataSegmentLength and BHS opcode extraction mask. */
151 #define ISCSI_DEF_MAX_RECV_SEG_LEN 8192
152 #define ISCSI_OPCODE_MASK 0x3f
/* Upcast from the generic ICL objects embedded in the iSER-specific
 * containers back to those containers. */
154 #define icl_to_iser_conn(ic) \
155 container_of(ic, struct iser_conn, icl_conn)
156 #define icl_to_iser_pdu(ip) \
157 container_of(ip, struct icl_iser_pdu, icl_pdu)
160 * struct iser_hdr - iSER header
162 * @flags: flags support (zbva, remote_inv)
164 * @write_stag: write rkey
165 * @write_va: write virtual address
166 * @read_stag: read rkey
167 * @read_va: read virtual address
176 } __attribute__((packed));
183 /* Constant PDU lengths calculations */
/* Combined iSER header + iSCSI BHS length, prepended to every PDU. */
184 #define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)
186 #define ISER_RECV_DATA_SEG_LEN 128
/* Normal RX buffer: headers + small data segment; login RX buffer must fit
 * the full default MaxRecvDataSegmentLength. */
187 #define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
189 #define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
191 enum iser_conn_state {
192 ISER_CONN_INIT, /* descriptor allocd, no conn */
193 ISER_CONN_PENDING, /* in the process of being established */
194 ISER_CONN_UP, /* up and running */
195 ISER_CONN_TERMINATING, /* in the process of being terminated */
196 ISER_CONN_DOWN, /* shut down */
200 enum iser_task_status {
201 ISER_TASK_STATUS_INIT = 0,
202 ISER_TASK_STATUS_STARTED,
203 ISER_TASK_STATUS_COMPLETED
207 ISER_DIR_IN = 0, /* to initiator */
208 ISER_DIR_OUT, /* from initiator */
213 * struct iser_mem_reg - iSER memory registration info
215 * @sge: memory region sg element
216 * @rkey: memory region remote key
217 * @mem_h: pointer to registration context (FMR/Fastreg)
219 struct iser_mem_reg {
225 enum iser_desc_type {
227 ISCSI_TX_SCSI_COMMAND,
232 * struct iser_data_buf - iSER data buffer
234 * @sg: pointer to the sg list
235 * @size: num entries of this sg
236 * @data_len: total buffer byte len
237 * @dma_nents: returned by dma_map_sg
238 * @copy_buf: allocated copy buf for SGs unaligned
239 * for rdma which are copied
240 * @orig_sg: pointer to the original sg list (in case
242 * @sg_single: SG-ified clone of a non SG SC or
245 struct iser_data_buf {
246 struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
249 unsigned long data_len;
250 unsigned int dma_nents;
252 struct scatterlist *orig_sg;
253 struct scatterlist sg_single;
256 /* fwd declarations */
262 * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
264 * @iser_header: iser header
265 * @iscsi_header: iscsi header (bhs)
266 * @type: command/control/dataout
267 * @dma_addr: header buffer dma_address
268 * @tx_sg: sg[0] points to iser/iscsi headers
269 * sg[1] optionally points to either of immediate data
270 * unsolicited data-out or control
271 * @num_sge: number sges used on this TX task
272 * @mapped: indicates if the descriptor is dma mapped
274 struct iser_tx_desc {
275 struct iser_hdr iser_header;
276 struct iscsi_bhs iscsi_header __attribute__((packed));
277 enum iser_desc_type type;
279 struct ib_sge tx_sg[2];
/* Pad so struct iser_rx_desc (payload + dma_addr u64 + ib_sge) rounds up to
 * 256 bytes -- presumably for alignment/cacheline packing; confirm against
 * the rx_descs allocation. */
284 #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \
285 sizeof(u64) + sizeof(struct ib_sge)))
287 * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
289 * @iser_header: iser header
290 * @iscsi_header: iscsi header
291 * @data: received data segment
292 * @dma_addr: receive buffer dma address
293 * @rx_sg: ib_sge of receive buffer
294 * @pad: for sense data TODO: Modify to maximum sense length supported
296 struct iser_rx_desc {
297 struct iser_hdr iser_header;
298 struct iscsi_bhs iscsi_header;
299 char data[ISER_RECV_DATA_SEG_LEN];
302 char pad[ISER_RX_PAD_SIZE];
303 } __attribute__((packed));
305 struct icl_iser_pdu {
306 struct icl_pdu icl_pdu;
307 struct iser_tx_desc desc;
308 struct iser_conn *iser_conn;
309 enum iser_task_status status;
310 struct ccb_scsiio *csio;
312 int dir[ISER_DIRS_NUM];
313 struct iser_mem_reg rdma_reg[ISER_DIRS_NUM];
314 struct iser_data_buf data[ISER_DIRS_NUM];
318 * struct iser_comp - iSER completion context
320 * @device: pointer to device handle
321 * @cq: completion queue
322 * @wcs: work completion array
323 * @tq: taskqueue handle
324 * @task: task to run task_fn
325 * @active_qps: Number of active QPs attached
326 * to completion context
329 struct iser_device *device;
331 struct ib_wc wcs[ISER_WC_BATCH_COUNT];
332 struct taskqueue *tq;
338 * struct iser_device - iSER device handle
340 * @ib_device: RDMA device
341 * @pd: Protection Domain for this device
342 * @dev_attr: Device attributes container
343 * @mr: Global DMA memory region
344 * @event_handler: IB events handle routine
345 * @ig_list: entry in devices list
346 * @refcount: Reference counter, dominated by open iser connections
347 * @comps_used: Number of completion contexts used, Min between online
348 * cpus and device max completion vectors
349 * @comps: Dynamically allocated array of completion handlers
352 struct ib_device *ib_device;
354 struct ib_device_attr dev_attr;
356 struct ib_event_handler event_handler;
357 struct list_head ig_list;
360 struct iser_comp *comps;
364 * struct iser_reg_resources - Fast registration resources
367 * @mr_valid: is mr valid indicator
369 struct iser_reg_resources {
375 * struct fast_reg_descriptor - Fast registration descriptor
377 * @list: entry in connection fastreg pool
378 * @rsc: data buffer registration resources
380 struct fast_reg_descriptor {
381 struct list_head list;
382 struct iser_reg_resources rsc;
387 * struct iser_beacon - beacon to signal all flush errors were drained
391 * @flush_lock: protects flush_cv
392 * @flush_cv: condition variable for beacon flush
396 struct ib_send_wr send;
397 struct ib_recv_wr recv;
399 struct mtx flush_lock;
404 * struct ib_conn - Infiniband related objects
406 * @cma_id: rdma_cm connection manager handle
407 * @qp: Connection Queue-pair
408 * @device: reference to iser device
409 * @comp: iser completion context
412 struct rdma_cm_id *cma_id;
414 int post_recv_buf_count;
416 struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
417 struct iser_device *device;
418 struct iser_comp *comp;
419 struct iser_beacon beacon;
423 struct ib_fmr_pool *pool;
424 struct iser_page_vec *page_vec;
427 struct list_head pool;
434 struct icl_conn icl_conn;
435 struct ib_conn ib_conn;
437 struct list_head conn_list;
438 struct sx state_mutex;
439 enum iser_conn_state state;
440 int qp_max_recv_dtos;
444 char *login_req_buf, *login_resp_buf;
445 u64 login_req_dma, login_resp_dma;
446 unsigned int rx_desc_head;
447 struct iser_rx_desc *rx_descs;
453 * struct iser_global: iSER global context
455 * @device_list_mutex: protects device_list
456 * @device_list: iser devices global list
457 * @connlist_mutex: protects connlist
458 * @connlist: iser connections global list
459 * @desc_cache: kmem cache for tx dataout
460 * @close_conns_mutex: serializes conns closure
463 struct sx device_list_mutex;
464 struct list_head device_list;
465 struct mtx connlist_mutex;
466 struct list_head connlist;
467 struct sx close_conns_mutex;
470 extern struct iser_global ig;
471 extern int iser_debug;
474 iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);
477 iser_post_recvl(struct iser_conn *);
480 iser_post_recvm(struct iser_conn *, int);
483 iser_alloc_login_buf(struct iser_conn *iser_conn);
486 iser_free_login_buf(struct iser_conn *iser_conn);
489 iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);
492 iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);
495 iser_rcv_completion(struct iser_rx_desc *, unsigned long,
499 iser_pdu_free(struct icl_conn *, struct icl_pdu *);
502 iser_new_pdu(struct icl_conn *ic, int flags);
505 iser_alloc_rx_descriptors(struct iser_conn *, int);
508 iser_free_rx_descriptors(struct iser_conn *);
511 iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);
514 iser_send_control(struct iser_conn *, struct icl_iser_pdu *);
517 iser_send_command(struct iser_conn *, struct icl_iser_pdu *);
520 iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
523 iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
526 iser_create_fastreg_pool(struct ib_conn *, unsigned);
529 iser_free_fastreg_pool(struct ib_conn *);
532 iser_dma_map_task_data(struct icl_iser_pdu *,
533 struct iser_data_buf *, enum iser_data_dir,
534 enum dma_data_direction);
537 iser_conn_terminate(struct iser_conn *);
540 iser_free_ib_conn_res(struct iser_conn *, bool);
543 iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
544 enum dma_data_direction);
547 iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
549 #endif /* !ICL_ISER_H */