2 * Copyright (c) 2012 Chelsio Communications, Inc.
5 * Chelsio T5xx iSCSI driver
7 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
35 #include "opt_inet6.h"
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
41 #include <sys/module.h>
42 #include <sys/systm.h>
45 #include <sys/errno.h>
46 #include <sys/gsb_crc32.h>
47 #include <sys/kthread.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
53 #include <sys/mutex.h>
54 #include <sys/condvar.h>
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/toecore.h>
59 #include <netinet/tcp_var.h>
60 #include <netinet/tcp_fsm.h>
62 #include <cam/scsi/scsi_all.h>
63 #include <cam/scsi/scsi_da.h>
64 #include <cam/ctl/ctl_io.h>
65 #include <cam/ctl/ctl.h>
66 #include <cam/ctl/ctl_backend.h>
67 #include <cam/ctl/ctl_error.h>
68 #include <cam/ctl/ctl_frontend.h>
69 #include <cam/ctl/ctl_debug.h>
70 #include <cam/ctl/ctl_ha.h>
71 #include <cam/ctl/ctl_ioctl.h>
73 #include <dev/iscsi/icl.h>
74 #include <dev/iscsi/iscsi_proto.h>
75 #include <dev/iscsi/iscsi_ioctl.h>
76 #include <dev/iscsi/iscsi.h>
77 #include <cam/ctl/ctl_frontend_iscsi.h>
80 #include <cam/cam_ccb.h>
81 #include <cam/cam_xpt.h>
82 #include <cam/cam_debug.h>
83 #include <cam/cam_sim.h>
84 #include <cam/cam_xpt_sim.h>
85 #include <cam/cam_xpt_periph.h>
86 #include <cam/cam_periph.h>
87 #include <cam/cam_compat.h>
88 #include <cam/scsi/scsi_message.h>
90 #include "common/common.h"
91 #include "common/t4_msg.h"
92 #include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */
93 #include "tom/t4_tom.h"
/* Number of rx worker threads actually running (start_worker_threads() may
 * settle for fewer than requested; 0 if the pool failed to start). */
96 static int worker_thread_count;
/* Per-thread state for the rx worker pool, allocated in start_worker_threads(). */
97 static struct cxgbei_worker_thread_softc *cwt_softc;
/* Kernel process that hosts all of the cxgbei worker threads. */
98 static struct proc *cxgbei_proc;
101 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
102 uint32_t *max_rx_data_len, struct ppod_region *pr)
104 uint32_t tx_len, rx_len, r, v;
106 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
107 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
109 r = t4_read_reg(sc, A_TP_PARA_REG2);
110 rx_len = min(rx_len, G_MAXRXDATA(r));
111 tx_len = min(tx_len, G_MAXRXDATA(r));
113 r = t4_read_reg(sc, A_TP_PARA_REG7);
114 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
115 rx_len = min(rx_len, v);
116 tx_len = min(tx_len, v);
119 * AHS is not supported by the kernel so we'll not account for
120 * it either in our PDU len -> data segment len conversions.
122 rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
123 ISCSI_DATA_DIGEST_SIZE;
124 tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
125 ISCSI_DATA_DIGEST_SIZE;
128 * DDP can place only 4 pages for a single PDU. A single
129 * request might use larger pages than the smallest page size,
130 * but that cannot be guaranteed. Assume the smallest DDP
131 * page size for this limit.
133 rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));
135 if (chip_id(sc) == CHELSIO_T5) {
136 tx_len = min(tx_len, 15360);
138 rx_len = rounddown2(rx_len, 512);
139 tx_len = rounddown2(tx_len, 512);
142 *max_tx_data_len = tx_len;
143 *max_rx_data_len = rx_len;
147 * Initialize the software state of the iSCSI ULP driver.
149 * ENXIO means firmware didn't set up something that it was supposed to.
152 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
154 struct sysctl_oid *oid;
155 struct sysctl_oid_list *children;
156 struct ppod_region *pr;
160 MPASS(sc->vres.iscsi.size > 0);
164 r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
165 rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
167 device_printf(sc->dev,
168 "%s: failed to initialize the iSCSI page pod region: %u.\n",
173 r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
174 r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
175 if (r != pr->pr_tag_mask) {
177 * Recent firmwares are supposed to set up the iSCSI tagmask
178 * but we'll do it ourselves it the computed value doesn't match
179 * what's in the register.
181 device_printf(sc->dev,
182 "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
184 t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
185 V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
188 read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);
190 sysctl_ctx_init(&ci->ctx);
191 oid = device_get_sysctl_tree(sc->dev); /* dev.t5nex.X */
192 children = SYSCTL_CHILDREN(oid);
194 oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
195 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
196 children = SYSCTL_CHILDREN(oid);
198 ci->ddp_threshold = 2048;
199 SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
200 CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
202 SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
203 CTLFLAG_RD, &ci->max_rx_data_len, 0,
204 "Maximum receive data segment length");
205 SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
206 CTLFLAG_RD, &ci->max_tx_data_len, 0,
207 "Maximum transmit data segment length");
213 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
215 struct adapter *sc = iq->adapter;
216 struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
217 u_int tid = GET_TID(cpl);
218 struct toepcb *toep = lookup_tid(sc, tid);
220 struct icl_cxgbei_pdu *icp;
221 uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
222 uint16_t len = be16toh(cpl->len);
225 MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
227 ip = icl_cxgbei_new_pdu(M_NOWAIT);
229 CXGBE_UNIMPLEMENTED("PDU allocation failure");
230 m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
231 ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
233 icp->icp_seq = ntohl(cpl->seq);
234 icp->icp_flags = ICPF_RX_HDR;
236 /* This is the start of a new PDU. There should be no old state. */
237 MPASS(toep->ulpcb2 == NULL);
241 CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
242 __func__, tid, len, len_ddp, icp);
250 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
252 struct adapter *sc = iq->adapter;
253 struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
254 u_int tid = GET_TID(cpl);
255 struct toepcb *toep = lookup_tid(sc, tid);
256 struct icl_cxgbei_pdu *icp = toep->ulpcb2;
260 MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
264 * T6 completion enabled, start of a new pdu. Header
265 * will come in completion CPL.
267 ip = icl_cxgbei_new_pdu(M_NOWAIT);
269 CXGBE_UNIMPLEMENTED("PDU allocation failure");
272 /* T5 mode, header is already received. */
273 MPASS(icp->icp_flags == ICPF_RX_HDR);
274 MPASS(icp->ip.ip_data_mbuf == NULL);
275 MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
278 /* Trim the cpl header from mbuf. */
279 m_adj(m, sizeof(*cpl));
281 icp->icp_flags |= ICPF_RX_FLBUF;
282 icp->ip.ip_data_mbuf = m;
283 toep->ofld_rxq->rx_iscsi_fl_pdus++;
284 toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
287 * For T6, save the icp for further processing in the
288 * completion handler.
290 if (icp->icp_flags == ICPF_RX_FLBUF) {
291 MPASS(toep->ulpcb2 == NULL);
296 CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
297 be16toh(cpl->len), icp);
304 mbuf_crc32c_helper(void *arg, void *data, u_int len)
306 uint32_t *digestp = arg;
308 *digestp = calculate_crc32c(*digestp, data, len);
313 parse_pdus(struct toepcb *toep, struct icl_cxgbei_conn *icc, struct sockbuf *sb)
315 struct iscsi_bhs bhs;
318 u_int ahs_len, data_len, header_len, pdu_len, total_len;
319 uint32_t calc_digest, wire_digest;
321 total_len = sbused(sb);
322 CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
325 m = sbcut_locked(sb, total_len);
326 KASSERT(m_length(m, NULL) == total_len,
327 ("sbcut returned less data (%u vs %u)", total_len,
330 header_len = sizeof(struct iscsi_bhs);
331 if (icc->ic.ic_header_crc32c)
332 header_len += ISCSI_HEADER_DIGEST_SIZE;
334 if (total_len < sizeof(struct iscsi_bhs)) {
335 ICL_WARN("truncated pre-offload PDU with len %u",
340 m_copydata(m, 0, sizeof(struct iscsi_bhs), (caddr_t)&bhs);
342 ahs_len = bhs.bhs_total_ahs_len * 4;
343 data_len = bhs.bhs_data_segment_len[0] << 16 |
344 bhs.bhs_data_segment_len[1] << 8 |
345 bhs.bhs_data_segment_len[0];
346 pdu_len = header_len + ahs_len + roundup2(data_len, 4);
347 if (icc->ic.ic_data_crc32c && data_len != 0)
348 pdu_len += ISCSI_DATA_DIGEST_SIZE;
350 if (total_len < pdu_len) {
351 ICL_WARN("truncated pre-offload PDU len %u vs %u",
358 ICL_WARN("received pre-offload PDU with AHS");
363 if (icc->ic.ic_header_crc32c) {
364 m_copydata(m, sizeof(struct iscsi_bhs),
365 sizeof(wire_digest), (caddr_t)&wire_digest);
367 calc_digest = calculate_crc32c(0xffffffff,
368 (caddr_t)&bhs, sizeof(bhs));
369 calc_digest ^= 0xffffffff;
370 if (calc_digest != wire_digest) {
371 ICL_WARN("received pre-offload PDU 0x%02x "
372 "with invalid header digest (0x%x vs 0x%x)",
373 bhs.bhs_opcode, wire_digest, calc_digest);
374 toep->ofld_rxq->rx_iscsi_header_digest_errors++;
380 m_adj(m, header_len);
382 if (icc->ic.ic_data_crc32c && data_len != 0) {
383 m_copydata(m, data_len, sizeof(wire_digest),
384 (caddr_t)&wire_digest);
386 calc_digest = 0xffffffff;
387 m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
389 calc_digest ^= 0xffffffff;
390 if (calc_digest != wire_digest) {
391 ICL_WARN("received pre-offload PDU 0x%02x "
392 "with invalid data digest (0x%x vs 0x%x)",
393 bhs.bhs_opcode, wire_digest, calc_digest);
394 toep->ofld_rxq->rx_iscsi_data_digest_errors++;
400 ip = icl_cxgbei_new_pdu(M_NOWAIT);
402 CXGBE_UNIMPLEMENTED("PDU allocation failure");
403 icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
405 ip->ip_data_len = data_len;
407 ip->ip_data_mbuf = m;
409 STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
411 total_len -= pdu_len;
412 if (total_len == 0) {
419 m = m_split(m, roundup2(data_len, 4), M_NOWAIT);
421 ICL_WARN("failed to split mbuf chain for "
424 /* Don't free the mbuf chain as 'ip' owns it. */
427 if (icc->ic.ic_data_crc32c)
428 m_adj(m, ISCSI_DATA_DIGEST_SIZE);
/*
 * CPL_RX_DATA_DDP handler: final status for the PDU being assembled in
 * toep->ulpcb2.  Records completion status, accounts for DDP-placed or
 * freelist payload, advances rcv_nxt, and queues the finished PDU for the
 * connection's rx worker thread.
 */
434 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
436 	struct adapter *sc = iq->adapter;
437 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
438 	u_int tid = GET_TID(cpl);
439 	struct toepcb *toep = lookup_tid(sc, tid);
440 	struct inpcb *inp = toep->inp;
444 	struct icl_cxgbei_conn *icc;
446 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
449 	struct epoch_tracker et;
453 	/* Must already be assembling a PDU. */
455 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
456 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
458 	pdu_len = be16toh(cpl->len);	/* includes everything. */
459 	val = be32toh(cpl->ddpvld);
463 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
464 	    __func__, tid, pdu_len, val, icp->icp_flags);
/* This CPL completes the PDU: mark status received. */
467 	icp->icp_flags |= ICPF_RX_STATUS;
/* Hardware-detected padding/digest errors are only counted here; the
 * PDU is dropped further down once the connection state is known. */
469 	if (val & F_DDP_PADDING_ERR) {
470 	ICL_WARN("received PDU 0x%02x with invalid padding",
471 	    ip->ip_bhs->bhs_opcode);
472 	toep->ofld_rxq->rx_iscsi_padding_errors++;
474 	if (val & F_DDP_HDRCRC_ERR) {
475 	ICL_WARN("received PDU 0x%02x with invalid header digest",
476 	    ip->ip_bhs->bhs_opcode);
477 	toep->ofld_rxq->rx_iscsi_header_digest_errors++;
479 	if (val & F_DDP_DATACRC_ERR) {
480 	ICL_WARN("received PDU 0x%02x with invalid data digest",
481 	    ip->ip_bhs->bhs_opcode);
482 	toep->ofld_rxq->rx_iscsi_data_digest_errors++;
/* No freelist data arrived: the payload was placed directly by DDP. */
484 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
485 	MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
486 	MPASS(ip->ip_data_len > 0);
487 	icp->icp_flags |= ICPF_RX_DDP;
488 	toep->ofld_rxq->rx_iscsi_ddp_pdus++;
489 	toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
/* Connection already dropped: discard the PDU. */
493 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
494 	CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
495 	    __func__, tid, pdu_len, inp->inp_flags);
497 	icl_cxgbei_conn_pdu_free(NULL, ip);
503 	 * T6+ does not report data PDUs received via DDP without F
504 	 * set.  This can result in gaps in the TCP sequence space.
507 	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
508 	tp->rcv_nxt = icp->icp_seq + pdu_len;
509 	tp->t_rcvtime = ticks;
512 	 * Don't update the window size or return credits since RX
513 	 * flow control is disabled.
516 	so = inp->inp_socket;
521 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
523 	    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
524 	    __func__, tid, pdu_len, icc, sb->sb_state);
/* Data received after the socket shut down rx: reset the connection. */
528 	CURVNET_SET(so->so_vnet);
531 	tp = tcp_drop(tp, ECONNRESET);
537 	icl_cxgbei_conn_pdu_free(NULL, ip);
541 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
/* Drop PDUs that the hardware flagged as corrupt. */
543 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
544 	    F_DDP_DATACRC_ERR)) != 0) {
548 	icl_cxgbei_conn_pdu_free(NULL, ip);
/* NOTE(review): the ')' placement reads oddly — `__predict_false(sbused(sb))
 * != 0` — but is truth-equivalent to `__predict_false(sbused(sb) != 0)`. */
554 	if (__predict_false(sbused(sb)) != 0) {
556 	 * PDUs were received before the tid transitioned to ULP mode.
557 	 * Convert them to icl_cxgbei_pdus and send them to ICL before
560 	if (!parse_pdus(toep, icc, sb)) {
564 	icl_cxgbei_conn_pdu_free(NULL, ip);
570 	icl_cxgbei_new_pdu_set_conn(ip, ic);
/* Queue the PDU and wake the connection's worker thread if idle. */
572 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
573 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
574 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
576 	mtx_lock(&cwt->cwt_lock);
577 	icc->rx_flags |= RXF_ACTIVE;
578 	TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
579 	if (cwt->cwt_state == CWT_SLEEPING) {
580 	cwt->cwt_state = CWT_RUNNING;
581 	cv_signal(&cwt->cwt_cv);
583 	mtx_unlock(&cwt->cwt_lock);
/*
 * CPL_RX_ISCSI_CMP handler (T6+ completion mode): carries the BHS and
 * final status for a PDU (or a DDP-placed burst of Data-In/Data-Out
 * PDUs).  For DDP bursts the BHS is rewritten to describe one "large"
 * PDU covering the whole burst before it is handed to ICL.
 */
594 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
596 	struct epoch_tracker et;
597 	struct adapter *sc = iq->adapter;
598 	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
599 	u_int tid = GET_TID(cpl);
600 	struct toepcb *toep = lookup_tid(sc, tid);
601 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
603 	struct cxgbei_cmp *cmp;
604 	struct inpcb *inp = toep->inp;
606 	uint16_t len = be16toh(cpl->len);
607 	u_int data_digest_len;
612 	struct icl_cxgbei_conn *icc;
614 	struct iscsi_bhs_data_out *bhsdo;
615 	u_int val = be32toh(cpl->ddpvld);
616 	u_int npdus, pdu_len;
617 	uint32_t prev_seg_len;
620 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
622 	if ((val & F_DDP_PDU) == 0) {
624 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
629 	/* T6 completion enabled, start of a new PDU. */
630 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
632 	CXGBE_UNIMPLEMENTED("PDU allocation failure");
635 	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
639 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
640 	    __func__, tid, pdu_len, val, icp);
/* Copy the BHS out of the CPL and decode the 24-bit big-endian
 * DataSegmentLength field. */
644 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
645 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
646 	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
647 	    bhsdo->bhsdo_data_segment_len[1] << 8 |
648 	    bhsdo->bhsdo_data_segment_len[2];
649 	icp->icp_seq = ntohl(cpl->seq);
650 	icp->icp_flags |= ICPF_RX_HDR;
651 	icp->icp_flags |= ICPF_RX_STATUS;
653 	if (val & F_DDP_PADDING_ERR) {
654 	ICL_WARN("received PDU 0x%02x with invalid padding",
655 	    ip->ip_bhs->bhs_opcode);
656 	toep->ofld_rxq->rx_iscsi_padding_errors++;
658 	if (val & F_DDP_HDRCRC_ERR) {
659 	ICL_WARN("received PDU 0x%02x with invalid header digest",
660 	    ip->ip_bhs->bhs_opcode);
661 	toep->ofld_rxq->rx_iscsi_header_digest_errors++;
663 	if (val & F_DDP_DATACRC_ERR) {
664 	ICL_WARN("received PDU 0x%02x with invalid data digest",
665 	    ip->ip_bhs->bhs_opcode);
666 	toep->ofld_rxq->rx_iscsi_data_digest_errors++;
/* Connection already dropped: discard the PDU. */
670 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
671 	CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
672 	    __func__, tid, pdu_len, inp->inp_flags);
674 	icl_cxgbei_conn_pdu_free(NULL, ip);
683 	 * If icc is NULL, the connection is being closed in
684 	 * icl_cxgbei_conn_close(), just drop this data.
687 	if (__predict_false(icc == NULL)) {
688 	CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
689 	    __func__, tid, pdu_len, icc);
692 	 * Update rcv_nxt so the sequence number of the FIN
693 	 * doesn't appear wrong.
695 	tp->rcv_nxt = icp->icp_seq + pdu_len;
696 	tp->t_rcvtime = ticks;
699 	icl_cxgbei_conn_pdu_free(NULL, ip);
705 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
/* Drop PDUs that the hardware flagged as corrupt. */
707 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
708 	    F_DDP_DATACRC_ERR)) != 0) {
711 	icl_cxgbei_conn_pdu_free(NULL, ip);
/* Sanity: padded data segment length must account for the whole PDU. */
719 	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
720 	    ISCSI_DATA_DIGEST_SIZE : 0;
721 	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
/* Payload was DDP-placed: fold the whole burst into this PDU. */
724 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
725 	MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
726 	MPASS(ip->ip_data_len > 0);
727 	icp->icp_flags |= ICPF_RX_DDP;
728 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
/* Look up the command this data belongs to via its ITT/TTT. */
730 	switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
731 	case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
732 	cmp = cxgbei_find_cmp(icc,
733 	    be32toh(bhsdo->bhsdo_initiator_task_tag));
735 	case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
736 	cmp = cxgbei_find_cmp(icc,
737 	    be32toh(bhsdo->bhsdo_target_transfer_tag));
740 	__assert_unreachable();
745 	 * The difference between the end of the last burst
746 	 * and the offset of the last PDU in this burst is
747 	 * the additional data received via DDP.
749 	prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
750 	    cmp->next_buffer_offset;
752 	if (prev_seg_len != 0) {
753 	uint32_t orig_datasn;
756 	 * Return a "large" PDU representing the burst
757 	 * of PDUs.  Adjust the offset and length of
758 	 * this PDU to represent the entire burst.
760 	ip->ip_data_len += prev_seg_len;
761 	bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
762 	bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
763 	bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
764 	bhsdo->bhsdo_buffer_offset =
765 	    htobe32(cmp->next_buffer_offset);
/* NOTE(review): htobe32() and be32toh() are the same byte swap, but
 * be32toh() would express the intent (wire -> host) more clearly here. */
767 	orig_datasn = htobe32(bhsdo->bhsdo_datasn);
768 	npdus = orig_datasn - cmp->last_datasn;
769 	bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
770 	cmp->last_datasn = orig_datasn;
771 	ip->ip_additional_pdus = npdus - 1;
773 	MPASS(htobe32(bhsdo->bhsdo_datasn) ==
774 	    cmp->last_datasn + 1);
776 	cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
779 	cmp->next_buffer_offset += ip->ip_data_len;
780 	toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
781 	toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
783 	MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
784 	MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
787 	tp->rcv_nxt = icp->icp_seq + pdu_len;
788 	tp->t_rcvtime = ticks;
791 	 * Don't update the window size or return credits since RX
792 	 * flow control is disabled.
795 	so = inp->inp_socket;
798 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
800 	    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
801 	    __func__, tid, pdu_len, icc, sb->sb_state);
/* Data received after the socket shut down rx: reset the connection. */
805 	CURVNET_SET(so->so_vnet);
808 	tp = tcp_drop(tp, ECONNRESET);
814 	icl_cxgbei_conn_pdu_free(NULL, ip);
820 	if (__predict_false(sbused(sb)) != 0) {
822 	 * PDUs were received before the tid transitioned to ULP mode.
823 	 * Convert them to icl_cxgbei_pdus and send them to ICL before
826 	if (!parse_pdus(toep, icc, sb)) {
830 	icl_cxgbei_conn_pdu_free(NULL, ip);
836 	icl_cxgbei_new_pdu_set_conn(ip, ic);
838 	/* Enqueue the PDU to the received pdus queue. */
839 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
/* Wake the connection's worker thread if it is not already scheduled. */
840 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
841 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
843 	mtx_lock(&cwt->cwt_lock);
844 	icc->rx_flags |= RXF_ACTIVE;
845 	TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
846 	if (cwt->cwt_state == CWT_SLEEPING) {
847 	cwt->cwt_state = CWT_RUNNING;
848 	cv_signal(&cwt->cwt_cv);
850 	mtx_unlock(&cwt->cwt_lock);
862 cxgbei_activate(struct adapter *sc)
864 struct cxgbei_data *ci;
867 ASSERT_SYNCHRONIZED_OP(sc);
869 if (uld_active(sc, ULD_ISCSI)) {
870 KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
875 if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
876 device_printf(sc->dev,
877 "not iSCSI offload capable, or capability disabled.\n");
881 /* per-adapter softc for iSCSI */
882 ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
886 rc = cxgbei_init(sc, ci);
892 sc->iscsi_ulp_softc = ci;
898 cxgbei_deactivate(struct adapter *sc)
900 struct cxgbei_data *ci = sc->iscsi_ulp_softc;
902 ASSERT_SYNCHRONIZED_OP(sc);
905 sysctl_ctx_free(&ci->ctx);
906 t4_free_ppod_region(&ci->pr);
908 sc->iscsi_ulp_softc = NULL;
915 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
918 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
921 /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
922 if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
923 (void) t4_activate_uld(sc, ULD_ISCSI);
925 end_synchronized_op(sc, 0);
929 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
932 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
935 if (uld_active(sc, ULD_ISCSI))
936 (void) t4_deactivate_uld(sc, ULD_ISCSI);
938 end_synchronized_op(sc, 0);
941 static struct uld_info cxgbei_uld_info = {
943 .activate = cxgbei_activate,
944 .deactivate = cxgbei_deactivate,
/*
 * Body of an rx worker thread: pull connections off this thread's rx_head
 * list, hand their queued PDUs to ICL, and sleep on cwt_cv when idle.
 * Exits when cwt_state is set to CWT_STOP, acknowledging with CWT_STOPPED.
 */
950 	struct cxgbei_worker_thread_softc *cwt = arg;
951 	struct icl_cxgbei_conn *icc = NULL;
955 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
/* Tell start_worker_threads() that this thread is up (it waits for
 * cwt_state to leave 0). */
959 	mtx_lock(&cwt->cwt_lock);
960 	MPASS(cwt->cwt_state == 0);
961 	cwt->cwt_state = CWT_RUNNING;
962 	cv_signal(&cwt->cwt_cv);
964 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
965 	cwt->cwt_state = CWT_RUNNING;
966 	while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
967 	TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
968 	mtx_unlock(&cwt->cwt_lock);
971 	sb = &ic->ic_socket->so_rcv;
974 	MPASS(icc->rx_flags & RXF_ACTIVE);
975 	if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
/* Grab the whole batch in one O(1) swap so the connection lock is
 * not held while handing PDUs to ICL. */
976 	MPASS(STAILQ_EMPTY(&rx_pdus));
977 	STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
980 	/* Hand over PDUs to ICL. */
981 	while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
982 	STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
987 	MPASS(STAILQ_EMPTY(&rx_pdus));
989 	MPASS(icc->rx_flags & RXF_ACTIVE);
990 	if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
991 	    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
992 	icc->rx_flags &= ~RXF_ACTIVE;
995 	 * More PDUs were received while we were busy
996 	 * handing over the previous batch to ICL.
997 	 * Re-add this connection to the end of the
1000 	mtx_lock(&cwt->cwt_lock);
1001 	TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
1003 	mtx_unlock(&cwt->cwt_lock);
1007 	mtx_lock(&cwt->cwt_lock);
1010 	/* Inner loop doesn't check for CWT_STOP, do that first. */
1011 	if (__predict_false(cwt->cwt_state == CWT_STOP))
1013 	cwt->cwt_state = CWT_SLEEPING;
1014 	cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
/* Tell stop_worker_threads() we're done. */
1017 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
1018 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
1019 	cwt->cwt_state = CWT_STOPPED;
1020 	cv_signal(&cwt->cwt_cv);
1021 	mtx_unlock(&cwt->cwt_lock);
/*
 * Create the pool of rx worker threads (min(mp_ncpus, 32) requested).
 * A failure partway through is not fatal: the pool is trimmed to the
 * threads that did start; only a failure to start any thread at all
 * tears everything down.
 */
1026 start_worker_threads(void)
1029 	struct cxgbei_worker_thread_softc *cwt;
1031 	worker_thread_count = min(mp_ncpus, 32);
1032 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
1035 	MPASS(cxgbei_proc == NULL);
1036 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
1037 	mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
1038 	cv_init(&cwt->cwt_cv, "cwt cv");
1039 	TAILQ_INIT(&cwt->rx_head);
1040 	rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
/* Thread creation failed: undo this slot's state. */
1043 	printf("cxgbei: failed to start thread #%d/%d (%d)\n",
1044 	    i + 1, worker_thread_count, rc);
1045 	mtx_destroy(&cwt->cwt_lock);
1046 	cv_destroy(&cwt->cwt_cv);
1047 	bzero(cwt, sizeof(*cwt));
/* First thread failed: give up entirely. */
1049 	free(cwt_softc, M_CXGBE);
1050 	worker_thread_count = 0;
1055 	/* Not fatal, carry on with fewer threads. */
1056 	worker_thread_count = i;
1061 	/* Wait for thread to start before moving on to the next one. */
1062 	mtx_lock(&cwt->cwt_lock);
/* cwt_main() sets cwt_state to CWT_RUNNING once it is up. */
1063 	while (cwt->cwt_state == 0)
1064 	cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
1065 	mtx_unlock(&cwt->cwt_lock);
1068 	MPASS(cwt_softc != NULL);
1069 	MPASS(worker_thread_count > 0);
1074 stop_worker_threads(void)
1077 struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
1079 MPASS(worker_thread_count >= 0);
1081 for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
1082 mtx_lock(&cwt->cwt_lock);
1083 MPASS(cwt->cwt_state == CWT_RUNNING ||
1084 cwt->cwt_state == CWT_SLEEPING);
1085 cwt->cwt_state = CWT_STOP;
1086 cv_signal(&cwt->cwt_cv);
1088 cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
1089 } while (cwt->cwt_state != CWT_STOPPED);
1090 mtx_unlock(&cwt->cwt_lock);
1091 mtx_destroy(&cwt->cwt_lock);
1092 cv_destroy(&cwt->cwt_cv);
1094 free(cwt_softc, M_CXGBE);
1097 /* Select a worker thread for a connection. */
1099 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
1101 struct adapter *sc = icc->sc;
1102 struct toepcb *toep = icc->toep;
1105 n = worker_thread_count / sc->sge.nofldrxq;
1107 i = toep->vi->pi->port_id * n + arc4random() % n;
1109 i = arc4random() % worker_thread_count;
1111 CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
1117 cxgbei_mod_load(void)
1121 t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
1122 t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
1123 t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
1124 t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
1126 rc = start_worker_threads();
1130 rc = t4_register_uld(&cxgbei_uld_info);
1132 stop_worker_threads();
1136 t4_iterate(cxgbei_activate_all, NULL);
1142 cxgbei_mod_unload(void)
1145 t4_iterate(cxgbei_deactivate_all, NULL);
1147 if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
1150 stop_worker_threads();
1152 t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
1153 t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
1154 t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
1155 t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
1162 cxgbei_modevent(module_t mod, int cmd, void *arg)
1169 rc = cxgbei_mod_load();
1171 rc = icl_cxgbei_mod_load();
1175 rc = icl_cxgbei_mod_unload();
1177 rc = cxgbei_mod_unload();
1184 printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
1191 static moduledata_t cxgbei_mod = {
1197 MODULE_VERSION(cxgbei, 1);
1198 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1199 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
1200 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
1201 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);