]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/iscsi/icl_soft.c
ZFS: MFV 2.0-rc1-ga00c61
[FreeBSD/FreeBSD.git] / sys / dev / iscsi / icl_soft.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Edward Tomasz Napierala under sponsorship
8  * from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  */
32
33 /*
34  * Software implementation of iSCSI Common Layer kobj(9) interface.
35  */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include <sys/param.h>
41 #include <sys/capsicum.h>
42 #include <sys/condvar.h>
43 #include <sys/conf.h>
44 #include <sys/gsb_crc32.h>
45 #include <sys/file.h>
46 #include <sys/kernel.h>
47 #include <sys/kthread.h>
48 #include <sys/lock.h>
49 #include <sys/mbuf.h>
50 #include <sys/mutex.h>
51 #include <sys/module.h>
52 #include <sys/protosw.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sys/sysctl.h>
56 #include <sys/systm.h>
57 #include <sys/sx.h>
58 #include <sys/uio.h>
59 #include <vm/uma.h>
60 #include <netinet/in.h>
61 #include <netinet/tcp.h>
62
63 #include <dev/iscsi/icl.h>
64 #include <dev/iscsi/iscsi_proto.h>
65 #include <icl_conn_if.h>
66
67 struct icl_soft_pdu {
68         struct icl_pdu   ip;
69
70         /* soft specific stuff goes here. */
71         u_int            ref_cnt;
72         icl_pdu_cb       cb;
73         int              error;
74 };
75
76 static int coalesce = 1;
77 SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
78     &coalesce, 0, "Try to coalesce PDUs before sending");
79 static int partial_receive_len = 128 * 1024;
80 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
81     &partial_receive_len, 0, "Minimum read size for partially received "
82     "data segment");
83 static int sendspace = 1048576;
84 SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
85     &sendspace, 0, "Default send socket buffer size");
86 static int recvspace = 1048576;
87 SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
88     &recvspace, 0, "Default receive socket buffer size");
89
90 static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend");
91 static uma_zone_t icl_soft_pdu_zone;
92
93 static volatile u_int   icl_ncons;
94
/*
 * Helpers for the connection mutex reached through ic_lock.  The macro
 * argument is parenthesized so that any pointer-valued expression, not
 * just a plain identifier, expands correctly.
 */
#define ICL_CONN_LOCK(X)                mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)              mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)         mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)     mtx_assert((X)->ic_lock, MA_NOTOWNED)

/* Head type for singly-linked tail queues of PDUs. */
STAILQ_HEAD(icl_pdu_stailq, icl_pdu);
101
102 static icl_conn_new_pdu_t       icl_soft_conn_new_pdu;
103 static icl_conn_pdu_free_t      icl_soft_conn_pdu_free;
104 static icl_conn_pdu_data_segment_length_t
105                                     icl_soft_conn_pdu_data_segment_length;
106 static icl_conn_pdu_append_data_t       icl_soft_conn_pdu_append_data;
107 static icl_conn_pdu_get_data_t  icl_soft_conn_pdu_get_data;
108 static icl_conn_pdu_queue_t     icl_soft_conn_pdu_queue;
109 static icl_conn_pdu_queue_cb_t  icl_soft_conn_pdu_queue_cb;
110 static icl_conn_handoff_t       icl_soft_conn_handoff;
111 static icl_conn_free_t          icl_soft_conn_free;
112 static icl_conn_close_t         icl_soft_conn_close;
113 static icl_conn_task_setup_t    icl_soft_conn_task_setup;
114 static icl_conn_task_done_t     icl_soft_conn_task_done;
115 static icl_conn_transfer_setup_t        icl_soft_conn_transfer_setup;
116 static icl_conn_transfer_done_t icl_soft_conn_transfer_done;
117 #ifdef ICL_KERNEL_PROXY
118 static icl_conn_connect_t       icl_soft_conn_connect;
119 #endif
120
121 static kobj_method_t icl_soft_methods[] = {
122         KOBJMETHOD(icl_conn_new_pdu, icl_soft_conn_new_pdu),
123         KOBJMETHOD(icl_conn_pdu_free, icl_soft_conn_pdu_free),
124         KOBJMETHOD(icl_conn_pdu_data_segment_length,
125             icl_soft_conn_pdu_data_segment_length),
126         KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data),
127         KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data),
128         KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue),
129         KOBJMETHOD(icl_conn_pdu_queue_cb, icl_soft_conn_pdu_queue_cb),
130         KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff),
131         KOBJMETHOD(icl_conn_free, icl_soft_conn_free),
132         KOBJMETHOD(icl_conn_close, icl_soft_conn_close),
133         KOBJMETHOD(icl_conn_task_setup, icl_soft_conn_task_setup),
134         KOBJMETHOD(icl_conn_task_done, icl_soft_conn_task_done),
135         KOBJMETHOD(icl_conn_transfer_setup, icl_soft_conn_transfer_setup),
136         KOBJMETHOD(icl_conn_transfer_done, icl_soft_conn_transfer_done),
137 #ifdef ICL_KERNEL_PROXY
138         KOBJMETHOD(icl_conn_connect, icl_soft_conn_connect),
139 #endif
140         { 0, 0 }
141 };
142
143 DEFINE_CLASS(icl_soft, icl_soft_methods, sizeof(struct icl_conn));
144
145 static void
146 icl_conn_fail(struct icl_conn *ic)
147 {
148         if (ic->ic_socket == NULL)
149                 return;
150
151         /*
152          * XXX
153          */
154         ic->ic_socket->so_error = EDOOFUS;
155         (ic->ic_error)(ic);
156 }
157
158 static struct mbuf *
159 icl_conn_receive(struct icl_conn *ic, size_t len)
160 {
161         struct uio uio;
162         struct socket *so;
163         struct mbuf *m;
164         int error, flags;
165
166         so = ic->ic_socket;
167
168         memset(&uio, 0, sizeof(uio));
169         uio.uio_resid = len;
170
171         flags = MSG_DONTWAIT;
172         error = soreceive(so, NULL, &uio, &m, NULL, &flags);
173         if (error != 0) {
174                 ICL_DEBUG("soreceive error %d", error);
175                 return (NULL);
176         }
177         if (uio.uio_resid != 0) {
178                 m_freem(m);
179                 ICL_DEBUG("short read");
180                 return (NULL);
181         }
182
183         return (m);
184 }
185
186 static int
187 icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len)
188 {
189         struct iovec iov[1];
190         struct uio uio;
191         struct socket *so;
192         int error, flags;
193
194         so = ic->ic_socket;
195
196         memset(&uio, 0, sizeof(uio));
197         iov[0].iov_base = buf;
198         iov[0].iov_len = len;
199         uio.uio_iov = iov;
200         uio.uio_iovcnt = 1;
201         uio.uio_offset = 0;
202         uio.uio_resid = len;
203         uio.uio_segflg = UIO_SYSSPACE;
204         uio.uio_rw = UIO_READ;
205
206         flags = MSG_DONTWAIT;
207         error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
208         if (error != 0) {
209                 ICL_DEBUG("soreceive error %d", error);
210                 return (-1);
211         }
212         if (uio.uio_resid != 0) {
213                 ICL_DEBUG("short read");
214                 return (-1);
215         }
216
217         return (0);
218 }
219
220 static void
221 icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
222 {
223         struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
224
225         KASSERT(isp->ref_cnt == 0, ("freeing active PDU"));
226         m_freem(ip->ip_bhs_mbuf);
227         m_freem(ip->ip_ahs_mbuf);
228         m_freem(ip->ip_data_mbuf);
229         uma_zfree(icl_soft_pdu_zone, isp);
230 #ifdef DIAGNOSTIC
231         refcount_release(&ic->ic_outstanding_pdus);
232 #endif
233 }
234
235 static void
236 icl_soft_pdu_call_cb(struct icl_pdu *ip)
237 {
238         struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
239
240         if (isp->cb != NULL)
241                 isp->cb(ip, isp->error);
242 #ifdef DIAGNOSTIC
243         refcount_release(&ip->ip_conn->ic_outstanding_pdus);
244 #endif
245         uma_zfree(icl_soft_pdu_zone, isp);
246 }
247
248 static void
249 icl_soft_pdu_done(struct icl_pdu *ip, int error)
250 {
251         struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
252
253         if (error != 0)
254                 isp->error = error;
255
256         m_freem(ip->ip_bhs_mbuf);
257         ip->ip_bhs_mbuf = NULL;
258         m_freem(ip->ip_ahs_mbuf);
259         ip->ip_ahs_mbuf = NULL;
260         m_freem(ip->ip_data_mbuf);
261         ip->ip_data_mbuf = NULL;
262
263         if (atomic_fetchadd_int(&isp->ref_cnt, -1) == 1)
264                 icl_soft_pdu_call_cb(ip);
265 }
266
267 static void
268 icl_soft_mbuf_done(struct mbuf *mb)
269 {
270         struct icl_soft_pdu *isp = (struct icl_soft_pdu *)mb->m_ext.ext_arg1;
271
272         icl_soft_pdu_call_cb(&isp->ip);
273 }
274
275 /*
276  * Allocate icl_pdu with empty BHS to fill up by the caller.
277  */
278 struct icl_pdu *
279 icl_soft_conn_new_pdu(struct icl_conn *ic, int flags)
280 {
281         struct icl_soft_pdu *isp;
282         struct icl_pdu *ip;
283
284 #ifdef DIAGNOSTIC
285         refcount_acquire(&ic->ic_outstanding_pdus);
286 #endif
287         isp = uma_zalloc(icl_soft_pdu_zone, flags | M_ZERO);
288         if (isp == NULL) {
289                 ICL_WARN("failed to allocate soft PDU");
290 #ifdef DIAGNOSTIC
291                 refcount_release(&ic->ic_outstanding_pdus);
292 #endif
293                 return (NULL);
294         }
295         ip = &isp->ip;
296         ip->ip_conn = ic;
297
298         CTASSERT(sizeof(struct iscsi_bhs) <= MHLEN);
299         ip->ip_bhs_mbuf = m_gethdr(flags, MT_DATA);
300         if (ip->ip_bhs_mbuf == NULL) {
301                 ICL_WARN("failed to allocate BHS mbuf");
302                 icl_soft_conn_pdu_free(ic, ip);
303                 return (NULL);
304         }
305         ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
306         memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
307         ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
308
309         return (ip);
310 }
311
312 static int
313 icl_pdu_ahs_length(const struct icl_pdu *request)
314 {
315
316         return (request->ip_bhs->bhs_total_ahs_len * 4);
317 }
318
319 static size_t
320 icl_pdu_data_segment_length(const struct icl_pdu *request)
321 {
322         uint32_t len = 0;
323
324         len += request->ip_bhs->bhs_data_segment_len[0];
325         len <<= 8;
326         len += request->ip_bhs->bhs_data_segment_len[1];
327         len <<= 8;
328         len += request->ip_bhs->bhs_data_segment_len[2];
329
330         return (len);
331 }
332
333 size_t
334 icl_soft_conn_pdu_data_segment_length(struct icl_conn *ic,
335     const struct icl_pdu *request)
336 {
337
338         return (icl_pdu_data_segment_length(request));
339 }
340
341 static void
342 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
343 {
344
345         response->ip_bhs->bhs_data_segment_len[2] = len;
346         response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
347         response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
348 }
349
350 static size_t
351 icl_pdu_padding(const struct icl_pdu *ip)
352 {
353
354         if ((ip->ip_data_len % 4) != 0)
355                 return (4 - (ip->ip_data_len % 4));
356
357         return (0);
358 }
359
360 static size_t
361 icl_pdu_size(const struct icl_pdu *response)
362 {
363         size_t len;
364
365         KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
366
367         len = sizeof(struct iscsi_bhs) + response->ip_data_len +
368             icl_pdu_padding(response);
369         if (response->ip_conn->ic_header_crc32c)
370                 len += ISCSI_HEADER_DIGEST_SIZE;
371         if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
372                 len += ISCSI_DATA_DIGEST_SIZE;
373
374         return (len);
375 }
376
377 static int
378 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
379 {
380
381         if (icl_conn_receive_buf(request->ip_conn,
382             request->ip_bhs, sizeof(struct iscsi_bhs))) {
383                 ICL_DEBUG("failed to receive BHS");
384                 return (-1);
385         }
386
387         *availablep -= sizeof(struct iscsi_bhs);
388         return (0);
389 }
390
391 static int
392 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
393 {
394
395         request->ip_ahs_len = icl_pdu_ahs_length(request);
396         if (request->ip_ahs_len == 0)
397                 return (0);
398
399         request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
400             request->ip_ahs_len);
401         if (request->ip_ahs_mbuf == NULL) {
402                 ICL_DEBUG("failed to receive AHS");
403                 return (-1);
404         }
405
406         *availablep -= request->ip_ahs_len;
407         return (0);
408 }
409
410 static uint32_t
411 icl_mbuf_to_crc32c(const struct mbuf *m0)
412 {
413         uint32_t digest = 0xffffffff;
414         const struct mbuf *m;
415
416         for (m = m0; m != NULL; m = m->m_next)
417                 digest = calculate_crc32c(digest,
418                     mtod(m, const void *), m->m_len);
419
420         digest = digest ^ 0xffffffff;
421
422         return (digest);
423 }
424
425 static int
426 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
427 {
428         uint32_t received_digest, valid_digest;
429
430         if (request->ip_conn->ic_header_crc32c == false)
431                 return (0);
432
433         CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
434         if (icl_conn_receive_buf(request->ip_conn,
435             &received_digest, ISCSI_HEADER_DIGEST_SIZE)) {
436                 ICL_DEBUG("failed to receive header digest");
437                 return (-1);
438         }
439         *availablep -= ISCSI_HEADER_DIGEST_SIZE;
440
441         /* Temporary attach AHS to BHS to calculate header digest. */
442         request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf;
443         valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
444         request->ip_bhs_mbuf->m_next = NULL;
445         if (received_digest != valid_digest) {
446                 ICL_WARN("header digest check failed; got 0x%x, "
447                     "should be 0x%x", received_digest, valid_digest);
448                 return (-1);
449         }
450
451         return (0);
452 }
453
454 /*
455  * Return the number of bytes that should be waiting in the receive socket
456  * before icl_pdu_receive_data_segment() gets called.
457  */
458 static size_t
459 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
460 {
461         size_t len;
462
463         len = icl_pdu_data_segment_length(request);
464         if (len == 0)
465                 return (0);
466
467         /*
468          * Account for the parts of data segment already read from
469          * the socket buffer.
470          */
471         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
472         len -= request->ip_data_len;
473
474         /*
475          * Don't always wait for the full data segment to be delivered
476          * to the socket; this might badly affect performance due to
477          * TCP window scaling.
478          */
479         if (len > partial_receive_len) {
480 #if 0
481                 ICL_DEBUG("need %zd bytes of data, limiting to %zd",
482                     len, partial_receive_len));
483 #endif
484                 len = partial_receive_len;
485
486                 return (len);
487         }
488
489         /*
490          * Account for padding.  Note that due to the way code is written,
491          * the icl_pdu_receive_data_segment() must always receive padding
492          * along with the last part of data segment, because it would be
493          * impossible to tell whether we've already received the full data
494          * segment including padding, or without it.
495          */
496         if ((len % 4) != 0)
497                 len += 4 - (len % 4);
498
499 #if 0
500         ICL_DEBUG("need %zd bytes of data", len));
501 #endif
502
503         return (len);
504 }
505
506 static int
507 icl_pdu_receive_data_segment(struct icl_pdu *request,
508     size_t *availablep, bool *more_neededp)
509 {
510         struct icl_conn *ic;
511         size_t len, padding = 0;
512         struct mbuf *m;
513
514         ic = request->ip_conn;
515
516         *more_neededp = false;
517         ic->ic_receive_len = 0;
518
519         len = icl_pdu_data_segment_length(request);
520         if (len == 0)
521                 return (0);
522
523         if ((len % 4) != 0)
524                 padding = 4 - (len % 4);
525
526         /*
527          * Account for already received parts of data segment.
528          */
529         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
530         len -= request->ip_data_len;
531
532         if (len + padding > *availablep) {
533                 /*
534                  * Not enough data in the socket buffer.  Receive as much
535                  * as we can.  Don't receive padding, since, obviously, it's
536                  * not the end of data segment yet.
537                  */
538 #if 0
539                 ICL_DEBUG("limited from %zd to %zd",
540                     len + padding, *availablep - padding));
541 #endif
542                 len = *availablep - padding;
543                 *more_neededp = true;
544                 padding = 0;
545         }
546
547         /*
548          * Must not try to receive padding without at least one byte
549          * of actual data segment.
550          */
551         if (len > 0) {
552                 m = icl_conn_receive(request->ip_conn, len + padding);
553                 if (m == NULL) {
554                         ICL_DEBUG("failed to receive data segment");
555                         return (-1);
556                 }
557
558                 if (request->ip_data_mbuf == NULL)
559                         request->ip_data_mbuf = m;
560                 else
561                         m_cat(request->ip_data_mbuf, m);
562
563                 request->ip_data_len += len;
564                 *availablep -= len + padding;
565         } else
566                 ICL_DEBUG("len 0");
567
568         if (*more_neededp)
569                 ic->ic_receive_len =
570                     icl_pdu_data_segment_receive_len(request);
571
572         return (0);
573 }
574
575 static int
576 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
577 {
578         uint32_t received_digest, valid_digest;
579
580         if (request->ip_conn->ic_data_crc32c == false)
581                 return (0);
582
583         if (request->ip_data_len == 0)
584                 return (0);
585
586         CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
587         if (icl_conn_receive_buf(request->ip_conn,
588             &received_digest, ISCSI_DATA_DIGEST_SIZE)) {
589                 ICL_DEBUG("failed to receive data digest");
590                 return (-1);
591         }
592         *availablep -= ISCSI_DATA_DIGEST_SIZE;
593
594         /*
595          * Note that ip_data_mbuf also contains padding; since digest
596          * calculation is supposed to include that, we iterate over
597          * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
598          */
599         valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
600         if (received_digest != valid_digest) {
601                 ICL_WARN("data digest check failed; got 0x%x, "
602                     "should be 0x%x", received_digest, valid_digest);
603                 return (-1);
604         }
605
606         return (0);
607 }
608
609 /*
610  * Somewhat contrary to the name, this attempts to receive only one
611  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
612  */
613 static struct icl_pdu *
614 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
615 {
616         struct icl_pdu *request;
617         struct socket *so;
618         size_t len;
619         int error;
620         bool more_needed;
621
622         so = ic->ic_socket;
623
624         if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
625                 KASSERT(ic->ic_receive_pdu == NULL,
626                     ("ic->ic_receive_pdu != NULL"));
627                 request = icl_soft_conn_new_pdu(ic, M_NOWAIT);
628                 if (request == NULL) {
629                         ICL_DEBUG("failed to allocate PDU; "
630                             "dropping connection");
631                         icl_conn_fail(ic);
632                         return (NULL);
633                 }
634                 ic->ic_receive_pdu = request;
635         } else {
636                 KASSERT(ic->ic_receive_pdu != NULL,
637                     ("ic->ic_receive_pdu == NULL"));
638                 request = ic->ic_receive_pdu;
639         }
640
641         if (*availablep < ic->ic_receive_len) {
642 #if 0
643                 ICL_DEBUG("not enough data; need %zd, "
644                     "have %zd", ic->ic_receive_len, *availablep);
645 #endif
646                 return (NULL);
647         }
648
649         switch (ic->ic_receive_state) {
650         case ICL_CONN_STATE_BHS:
651                 //ICL_DEBUG("receiving BHS");
652                 error = icl_pdu_receive_bhs(request, availablep);
653                 if (error != 0) {
654                         ICL_DEBUG("failed to receive BHS; "
655                             "dropping connection");
656                         break;
657                 }
658
659                 /*
660                  * We don't enforce any limit for AHS length;
661                  * its length is stored in 8 bit field.
662                  */
663
664                 len = icl_pdu_data_segment_length(request);
665                 if (len > ic->ic_max_data_segment_length) {
666                         ICL_WARN("received data segment "
667                             "length %zd is larger than negotiated "
668                             "MaxDataSegmentLength %zd; "
669                             "dropping connection",
670                             len, ic->ic_max_data_segment_length);
671                         error = EINVAL;
672                         break;
673                 }
674
675                 ic->ic_receive_state = ICL_CONN_STATE_AHS;
676                 ic->ic_receive_len = icl_pdu_ahs_length(request);
677                 break;
678
679         case ICL_CONN_STATE_AHS:
680                 //ICL_DEBUG("receiving AHS");
681                 error = icl_pdu_receive_ahs(request, availablep);
682                 if (error != 0) {
683                         ICL_DEBUG("failed to receive AHS; "
684                             "dropping connection");
685                         break;
686                 }
687                 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
688                 if (ic->ic_header_crc32c == false)
689                         ic->ic_receive_len = 0;
690                 else
691                         ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
692                 break;
693
694         case ICL_CONN_STATE_HEADER_DIGEST:
695                 //ICL_DEBUG("receiving header digest");
696                 error = icl_pdu_check_header_digest(request, availablep);
697                 if (error != 0) {
698                         ICL_DEBUG("header digest failed; "
699                             "dropping connection");
700                         break;
701                 }
702
703                 ic->ic_receive_state = ICL_CONN_STATE_DATA;
704                 ic->ic_receive_len =
705                     icl_pdu_data_segment_receive_len(request);
706                 break;
707
708         case ICL_CONN_STATE_DATA:
709                 //ICL_DEBUG("receiving data segment");
710                 error = icl_pdu_receive_data_segment(request, availablep,
711                     &more_needed);
712                 if (error != 0) {
713                         ICL_DEBUG("failed to receive data segment;"
714                             "dropping connection");
715                         break;
716                 }
717
718                 if (more_needed)
719                         break;
720
721                 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
722                 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
723                         ic->ic_receive_len = 0;
724                 else
725                         ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
726                 break;
727
728         case ICL_CONN_STATE_DATA_DIGEST:
729                 //ICL_DEBUG("receiving data digest");
730                 error = icl_pdu_check_data_digest(request, availablep);
731                 if (error != 0) {
732                         ICL_DEBUG("data digest failed; "
733                             "dropping connection");
734                         break;
735                 }
736
737                 /*
738                  * We've received complete PDU; reset the receive state machine
739                  * and return the PDU.
740                  */
741                 ic->ic_receive_state = ICL_CONN_STATE_BHS;
742                 ic->ic_receive_len = sizeof(struct iscsi_bhs);
743                 ic->ic_receive_pdu = NULL;
744                 return (request);
745
746         default:
747                 panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
748         }
749
750         if (error != 0) {
751                 /*
752                  * Don't free the PDU; it's pointed to by ic->ic_receive_pdu
753                  * and will get freed in icl_soft_conn_close().
754                  */
755                 icl_conn_fail(ic);
756         }
757
758         return (NULL);
759 }
760
761 static void
762 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
763 {
764         struct icl_pdu *response;
765         struct socket *so;
766
767         so = ic->ic_socket;
768
769         /*
770          * This can never happen; we're careful to only mess with ic->ic_socket
771          * pointer when the send/receive threads are not running.
772          */
773         KASSERT(so != NULL, ("NULL socket"));
774
775         for (;;) {
776                 if (ic->ic_disconnecting)
777                         return;
778
779                 if (so->so_error != 0) {
780                         ICL_DEBUG("connection error %d; "
781                             "dropping connection", so->so_error);
782                         icl_conn_fail(ic);
783                         return;
784                 }
785
786                 /*
787                  * Loop until we have a complete PDU or there is not enough
788                  * data in the socket buffer.
789                  */
790                 if (available < ic->ic_receive_len) {
791 #if 0
792                         ICL_DEBUG("not enough data; have %zd, "
793                             "need %zd", available,
794                             ic->ic_receive_len);
795 #endif
796                         return;
797                 }
798
799                 response = icl_conn_receive_pdu(ic, &available);
800                 if (response == NULL)
801                         continue;
802
803                 if (response->ip_ahs_len > 0) {
804                         ICL_WARN("received PDU with unsupported "
805                             "AHS; opcode 0x%x; dropping connection",
806                             response->ip_bhs->bhs_opcode);
807                         icl_soft_conn_pdu_free(ic, response);
808                         icl_conn_fail(ic);
809                         return;
810                 }
811
812                 (ic->ic_receive)(response);
813         }
814 }
815
816 static void
817 icl_receive_thread(void *arg)
818 {
819         struct icl_conn *ic;
820         size_t available;
821         struct socket *so;
822
823         ic = arg;
824         so = ic->ic_socket;
825
826         for (;;) {
827                 if (ic->ic_disconnecting) {
828                         //ICL_DEBUG("terminating");
829                         break;
830                 }
831
832                 /*
833                  * Set the low watermark, to be checked by
834                  * soreadable() in icl_soupcall_receive()
835                  * to avoid unnecessary wakeups until there
836                  * is enough data received to read the PDU.
837                  */
838                 SOCKBUF_LOCK(&so->so_rcv);
839                 available = sbavail(&so->so_rcv);
840                 if (available < ic->ic_receive_len) {
841                         so->so_rcv.sb_lowat = ic->ic_receive_len;
842                         cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
843                 } else
844                         so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
845                 SOCKBUF_UNLOCK(&so->so_rcv);
846
847                 icl_conn_receive_pdus(ic, available);
848         }
849
850         ICL_CONN_LOCK(ic);
851         ic->ic_receive_running = false;
852         cv_signal(&ic->ic_send_cv);
853         ICL_CONN_UNLOCK(ic);
854         kthread_exit();
855 }
856
857 static int
858 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
859 {
860         struct icl_conn *ic;
861
862         if (!soreadable(so))
863                 return (SU_OK);
864
865         ic = arg;
866         cv_signal(&ic->ic_receive_cv);
867         return (SU_OK);
868 }
869
870 static int
871 icl_pdu_finalize(struct icl_pdu *request)
872 {
873         size_t padding, pdu_len;
874         uint32_t digest, zero = 0;
875         int ok;
876         struct icl_conn *ic;
877
878         ic = request->ip_conn;
879
880         icl_pdu_set_data_segment_length(request, request->ip_data_len);
881
882         pdu_len = icl_pdu_size(request);
883
884         if (ic->ic_header_crc32c) {
885                 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
886                 ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
887                     (void *)&digest);
888                 if (ok != 1) {
889                         ICL_WARN("failed to append header digest");
890                         return (1);
891                 }
892         }
893
894         if (request->ip_data_len != 0) {
895                 padding = icl_pdu_padding(request);
896                 if (padding > 0) {
897                         ok = m_append(request->ip_data_mbuf, padding,
898                             (void *)&zero);
899                         if (ok != 1) {
900                                 ICL_WARN("failed to append padding");
901                                 return (1);
902                         }
903                 }
904
905                 if (ic->ic_data_crc32c) {
906                         digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
907
908                         ok = m_append(request->ip_data_mbuf, sizeof(digest),
909                             (void *)&digest);
910                         if (ok != 1) {
911                                 ICL_WARN("failed to append data digest");
912                                 return (1);
913                         }
914                 }
915
916                 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
917                 request->ip_data_mbuf = NULL;
918         }
919
920         request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
921
922         return (0);
923 }
924
925 static void
926 icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
927 {
928         struct icl_pdu *request, *request2;
929         struct socket *so;
930         long available, size, size2;
931         int coalesced, error;
932
933         ICL_CONN_LOCK_ASSERT_NOT(ic);
934
935         so = ic->ic_socket;
936
937         SOCKBUF_LOCK(&so->so_snd);
938         /*
939          * Check how much space do we have for transmit.  We can't just
940          * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
941          * as it always frees the mbuf chain passed to it, even in case
942          * of error.
943          */
944         available = sbspace(&so->so_snd);
945
946         /*
947          * Notify the socket upcall that we don't need wakeups
948          * for the time being.
949          */
950         so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
951         SOCKBUF_UNLOCK(&so->so_snd);
952
953         while (!STAILQ_EMPTY(queue)) {
954                 request = STAILQ_FIRST(queue);
955                 size = icl_pdu_size(request);
956                 if (available < size) {
957
958                         /*
959                          * Set the low watermark, to be checked by
960                          * sowriteable() in icl_soupcall_send()
961                          * to avoid unnecessary wakeups until there
962                          * is enough space for the PDU to fit.
963                          */
964                         SOCKBUF_LOCK(&so->so_snd);
965                         available = sbspace(&so->so_snd);
966                         if (available < size) {
967 #if 1
968                                 ICL_DEBUG("no space to send; "
969                                     "have %ld, need %ld",
970                                     available, size);
971 #endif
972                                 so->so_snd.sb_lowat = max(size,
973                                     so->so_snd.sb_hiwat / 8);
974                                 SOCKBUF_UNLOCK(&so->so_snd);
975                                 return;
976                         }
977                         SOCKBUF_UNLOCK(&so->so_snd);
978                 }
979                 STAILQ_REMOVE_HEAD(queue, ip_next);
980                 error = icl_pdu_finalize(request);
981                 if (error != 0) {
982                         ICL_DEBUG("failed to finalize PDU; "
983                             "dropping connection");
984                         icl_soft_pdu_done(request, EIO);
985                         icl_conn_fail(ic);
986                         return;
987                 }
988                 if (coalesce) {
989                         coalesced = 1;
990                         for (;;) {
991                                 request2 = STAILQ_FIRST(queue);
992                                 if (request2 == NULL)
993                                         break;
994                                 size2 = icl_pdu_size(request2);
995                                 if (available < size + size2)
996                                         break;
997                                 STAILQ_REMOVE_HEAD(queue, ip_next);
998                                 error = icl_pdu_finalize(request2);
999                                 if (error != 0) {
1000                                         ICL_DEBUG("failed to finalize PDU; "
1001                                             "dropping connection");
1002                                         icl_soft_pdu_done(request, EIO);
1003                                         icl_soft_pdu_done(request2, EIO);
1004                                         icl_conn_fail(ic);
1005                                         return;
1006                                 }
1007                                 m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf);
1008                                 request2->ip_bhs_mbuf = NULL;
1009                                 request->ip_bhs_mbuf->m_pkthdr.len += size2;
1010                                 size += size2;
1011                                 STAILQ_REMOVE_AFTER(queue, request, ip_next);
1012                                 icl_soft_pdu_done(request2, 0);
1013                                 coalesced++;
1014                         }
1015 #if 0
1016                         if (coalesced > 1) {
1017                                 ICL_DEBUG("coalesced %d PDUs into %ld bytes",
1018                                     coalesced, size);
1019                         }
1020 #endif
1021                 }
1022                 available -= size;
1023                 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
1024                     NULL, MSG_DONTWAIT, curthread);
1025                 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
1026                 if (error != 0) {
1027                         ICL_DEBUG("failed to send PDU, error %d; "
1028                             "dropping connection", error);
1029                         icl_soft_pdu_done(request, error);
1030                         icl_conn_fail(ic);
1031                         return;
1032                 }
1033                 icl_soft_pdu_done(request, 0);
1034         }
1035 }
1036
1037 static void
1038 icl_send_thread(void *arg)
1039 {
1040         struct icl_conn *ic;
1041         struct icl_pdu_stailq queue;
1042
1043         ic = arg;
1044
1045         STAILQ_INIT(&queue);
1046
1047         ICL_CONN_LOCK(ic);
1048         for (;;) {
1049                 for (;;) {
1050                         /*
1051                          * If the local queue is empty, populate it from
1052                          * the main one.  This way the icl_conn_send_pdus()
1053                          * can go through all the queued PDUs without holding
1054                          * any locks.
1055                          */
1056                         if (STAILQ_EMPTY(&queue))
1057                                 STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu);
1058
1059                         ic->ic_check_send_space = false;
1060                         ICL_CONN_UNLOCK(ic);
1061                         icl_conn_send_pdus(ic, &queue);
1062                         ICL_CONN_LOCK(ic);
1063
1064                         /*
1065                          * The icl_soupcall_send() was called since the last
1066                          * call to sbspace(); go around;
1067                          */
1068                         if (ic->ic_check_send_space)
1069                                 continue;
1070
1071                         /*
1072                          * Local queue is empty, but we still have PDUs
1073                          * in the main one; go around.
1074                          */
1075                         if (STAILQ_EMPTY(&queue) &&
1076                             !STAILQ_EMPTY(&ic->ic_to_send))
1077                                 continue;
1078
1079                         /*
1080                          * There might be some stuff in the local queue,
1081                          * which didn't get sent due to not having enough send
1082                          * space.  Wait for socket upcall.
1083                          */
1084                         break;
1085                 }
1086
1087                 if (ic->ic_disconnecting) {
1088                         //ICL_DEBUG("terminating");
1089                         break;
1090                 }
1091
1092                 cv_wait(&ic->ic_send_cv, ic->ic_lock);
1093         }
1094
1095         /*
1096          * We're exiting; move PDUs back to the main queue, so they can
1097          * get freed properly.  At this point ordering doesn't matter.
1098          */
1099         STAILQ_CONCAT(&ic->ic_to_send, &queue);
1100
1101         ic->ic_send_running = false;
1102         cv_signal(&ic->ic_send_cv);
1103         ICL_CONN_UNLOCK(ic);
1104         kthread_exit();
1105 }
1106
1107 static int
1108 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
1109 {
1110         struct icl_conn *ic;
1111
1112         if (!sowriteable(so))
1113                 return (SU_OK);
1114
1115         ic = arg;
1116
1117         ICL_CONN_LOCK(ic);
1118         ic->ic_check_send_space = true;
1119         ICL_CONN_UNLOCK(ic);
1120
1121         cv_signal(&ic->ic_send_cv);
1122
1123         return (SU_OK);
1124 }
1125
1126 static int
1127 icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
1128     const void *addr, size_t len, int flags)
1129 {
1130         struct icl_soft_pdu *isp = (struct icl_soft_pdu *)request;
1131         struct mbuf *mb, *newmb;
1132         size_t copylen, off = 0;
1133
1134         KASSERT(len > 0, ("len == 0"));
1135
1136         if (flags & ICL_NOCOPY) {
1137                 newmb = m_get(flags & ~ICL_NOCOPY, MT_DATA);
1138                 if (newmb == NULL) {
1139                         ICL_WARN("failed to allocate mbuf");
1140                         return (ENOMEM);
1141                 }
1142
1143                 newmb->m_flags |= M_RDONLY;
1144                 m_extaddref(newmb, __DECONST(char *, addr), len, &isp->ref_cnt,
1145                     icl_soft_mbuf_done, isp, NULL);
1146                 newmb->m_len = len;
1147         } else {
1148                 newmb = m_getm2(NULL, len, flags, MT_DATA, 0);
1149                 if (newmb == NULL) {
1150                         ICL_WARN("failed to allocate mbuf for %zd bytes", len);
1151                         return (ENOMEM);
1152                 }
1153
1154                 for (mb = newmb; mb != NULL; mb = mb->m_next) {
1155                         copylen = min(M_TRAILINGSPACE(mb), len - off);
1156                         memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
1157                         mb->m_len = copylen;
1158                         off += copylen;
1159                 }
1160                 KASSERT(off == len, ("%s: off != len", __func__));
1161         }
1162
1163         if (request->ip_data_mbuf == NULL) {
1164                 request->ip_data_mbuf = newmb;
1165                 request->ip_data_len = len;
1166         } else {
1167                 m_cat(request->ip_data_mbuf, newmb);
1168                 request->ip_data_len += len;
1169         }
1170
1171         return (0);
1172 }
1173
1174 void
1175 icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
1176     size_t off, void *addr, size_t len)
1177 {
1178
1179         m_copydata(ip->ip_data_mbuf, off, len, addr);
1180 }
1181
1182 static void
1183 icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
1184 {
1185
1186         icl_soft_conn_pdu_queue_cb(ic, ip, NULL);
1187 }
1188
1189 static void
1190 icl_soft_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
1191     icl_pdu_cb cb)
1192 {
1193         struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
1194
1195         ICL_CONN_LOCK_ASSERT(ic);
1196         isp->ref_cnt++;
1197         isp->cb = cb;
1198
1199         if (ic->ic_disconnecting || ic->ic_socket == NULL) {
1200                 ICL_DEBUG("icl_pdu_queue on closed connection");
1201                 icl_soft_pdu_done(ip, ENOTCONN);
1202                 return;
1203         }
1204
1205         if (!STAILQ_EMPTY(&ic->ic_to_send)) {
1206                 STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1207                 /*
1208                  * If the queue is not empty, someone else had already
1209                  * signaled the send thread; no need to do that again,
1210                  * just return.
1211                  */
1212                 return;
1213         }
1214
1215         STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1216         cv_signal(&ic->ic_send_cv);
1217 }
1218
1219 static struct icl_conn *
1220 icl_soft_new_conn(const char *name, struct mtx *lock)
1221 {
1222         struct icl_conn *ic;
1223
1224         refcount_acquire(&icl_ncons);
1225
1226         ic = (struct icl_conn *)kobj_create(&icl_soft_class, M_ICL_SOFT, M_WAITOK | M_ZERO);
1227
1228         STAILQ_INIT(&ic->ic_to_send);
1229         ic->ic_lock = lock;
1230         cv_init(&ic->ic_send_cv, "icl_tx");
1231         cv_init(&ic->ic_receive_cv, "icl_rx");
1232 #ifdef DIAGNOSTIC
1233         refcount_init(&ic->ic_outstanding_pdus, 0);
1234 #endif
1235         ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
1236         ic->ic_name = name;
1237         ic->ic_offload = "None";
1238         ic->ic_unmapped = false;
1239
1240         return (ic);
1241 }
1242
1243 void
1244 icl_soft_conn_free(struct icl_conn *ic)
1245 {
1246
1247 #ifdef DIAGNOSTIC
1248         KASSERT(ic->ic_outstanding_pdus == 0,
1249             ("destroying session with %d outstanding PDUs",
1250              ic->ic_outstanding_pdus));
1251 #endif
1252         cv_destroy(&ic->ic_send_cv);
1253         cv_destroy(&ic->ic_receive_cv);
1254         kobj_delete((struct kobj *)ic, M_ICL_SOFT);
1255         refcount_release(&icl_ncons);
1256 }
1257
1258 static int
1259 icl_conn_start(struct icl_conn *ic)
1260 {
1261         size_t minspace;
1262         struct sockopt opt;
1263         int error, one = 1;
1264
1265         ICL_CONN_LOCK(ic);
1266
1267         /*
1268          * XXX: Ugly hack.
1269          */
1270         if (ic->ic_socket == NULL) {
1271                 ICL_CONN_UNLOCK(ic);
1272                 return (EINVAL);
1273         }
1274
1275         ic->ic_receive_state = ICL_CONN_STATE_BHS;
1276         ic->ic_receive_len = sizeof(struct iscsi_bhs);
1277         ic->ic_disconnecting = false;
1278
1279         ICL_CONN_UNLOCK(ic);
1280
1281         /*
1282          * For sendspace, this is required because the current code cannot
1283          * send a PDU in pieces; thus, the minimum buffer size is equal
1284          * to the maximum PDU size.  "+4" is to account for possible padding.
1285          *
1286          * What we should actually do here is to use autoscaling, but set
1287          * some minimal buffer size to "minspace".  I don't know a way to do
1288          * that, though.
1289          */
1290         minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
1291             ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
1292         if (sendspace < minspace) {
1293                 ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
1294                     minspace);
1295                 sendspace = minspace;
1296         }
1297         if (recvspace < minspace) {
1298                 ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
1299                     minspace);
1300                 recvspace = minspace;
1301         }
1302
1303         error = soreserve(ic->ic_socket, sendspace, recvspace);
1304         if (error != 0) {
1305                 ICL_WARN("soreserve failed with error %d", error);
1306                 icl_soft_conn_close(ic);
1307                 return (error);
1308         }
1309         ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE;
1310         ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE;
1311
1312         /*
1313          * Disable Nagle.
1314          */
1315         bzero(&opt, sizeof(opt));
1316         opt.sopt_dir = SOPT_SET;
1317         opt.sopt_level = IPPROTO_TCP;
1318         opt.sopt_name = TCP_NODELAY;
1319         opt.sopt_val = &one;
1320         opt.sopt_valsize = sizeof(one);
1321         error = sosetopt(ic->ic_socket, &opt);
1322         if (error != 0) {
1323                 ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1324                 icl_soft_conn_close(ic);
1325                 return (error);
1326         }
1327
1328         /*
1329          * Register socket upcall, to get notified about incoming PDUs
1330          * and free space to send outgoing ones.
1331          */
1332         SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1333         soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1334         SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1335         SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1336         soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1337         SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1338
1339         /*
1340          * Start threads.
1341          */
1342         ICL_CONN_LOCK(ic);
1343         ic->ic_send_running = ic->ic_receive_running = true;
1344         ICL_CONN_UNLOCK(ic);
1345         error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1346             ic->ic_name);
1347         if (error != 0) {
1348                 ICL_WARN("kthread_add(9) failed with error %d", error);
1349                 ICL_CONN_LOCK(ic);
1350                 ic->ic_send_running = ic->ic_receive_running = false;
1351                 cv_signal(&ic->ic_send_cv);
1352                 ICL_CONN_UNLOCK(ic);
1353                 icl_soft_conn_close(ic);
1354                 return (error);
1355         }
1356         error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1357             ic->ic_name);
1358         if (error != 0) {
1359                 ICL_WARN("kthread_add(9) failed with error %d", error);
1360                 ICL_CONN_LOCK(ic);
1361                 ic->ic_receive_running = false;
1362                 cv_signal(&ic->ic_send_cv);
1363                 ICL_CONN_UNLOCK(ic);
1364                 icl_soft_conn_close(ic);
1365                 return (error);
1366         }
1367
1368         return (0);
1369 }
1370
1371 int
1372 icl_soft_conn_handoff(struct icl_conn *ic, int fd)
1373 {
1374         struct file *fp;
1375         struct socket *so;
1376         cap_rights_t rights;
1377         int error;
1378
1379         ICL_CONN_LOCK_ASSERT_NOT(ic);
1380
1381 #ifdef ICL_KERNEL_PROXY
1382         /*
1383          * We're transitioning to Full Feature phase, and we don't
1384          * really care.
1385          */
1386         if (fd == 0) {
1387                 ICL_CONN_LOCK(ic);
1388                 if (ic->ic_socket == NULL) {
1389                         ICL_CONN_UNLOCK(ic);
1390                         ICL_WARN("proxy handoff without connect"); 
1391                         return (EINVAL);
1392                 }
1393                 ICL_CONN_UNLOCK(ic);
1394                 return (0);
1395         }
1396 #endif
1397
1398         /*
1399          * Steal the socket from userland.
1400          */
1401         error = fget(curthread, fd,
1402             cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1403         if (error != 0)
1404                 return (error);
1405         if (fp->f_type != DTYPE_SOCKET) {
1406                 fdrop(fp, curthread);
1407                 return (EINVAL);
1408         }
1409         so = fp->f_data;
1410         if (so->so_type != SOCK_STREAM) {
1411                 fdrop(fp, curthread);
1412                 return (EINVAL);
1413         }
1414
1415         ICL_CONN_LOCK(ic);
1416
1417         if (ic->ic_socket != NULL) {
1418                 ICL_CONN_UNLOCK(ic);
1419                 fdrop(fp, curthread);
1420                 return (EBUSY);
1421         }
1422
1423         ic->ic_socket = fp->f_data;
1424         fp->f_ops = &badfileops;
1425         fp->f_data = NULL;
1426         fdrop(fp, curthread);
1427         ICL_CONN_UNLOCK(ic);
1428
1429         error = icl_conn_start(ic);
1430
1431         return (error);
1432 }
1433
1434 void
1435 icl_soft_conn_close(struct icl_conn *ic)
1436 {
1437         struct icl_pdu *pdu;
1438         struct socket *so;
1439
1440         ICL_CONN_LOCK(ic);
1441
1442         /*
1443          * Wake up the threads, so they can properly terminate.
1444          */
1445         ic->ic_disconnecting = true;
1446         while (ic->ic_receive_running || ic->ic_send_running) {
1447                 cv_signal(&ic->ic_receive_cv);
1448                 cv_signal(&ic->ic_send_cv);
1449                 cv_wait(&ic->ic_send_cv, ic->ic_lock);
1450         }
1451
1452         /* Some other thread could close the connection same time. */
1453         so = ic->ic_socket;
1454         if (so == NULL) {
1455                 ICL_CONN_UNLOCK(ic);
1456                 return;
1457         }
1458         ic->ic_socket = NULL;
1459
1460         /*
1461          * Deregister socket upcalls.
1462          */
1463         ICL_CONN_UNLOCK(ic);
1464         SOCKBUF_LOCK(&so->so_snd);
1465         if (so->so_snd.sb_upcall != NULL)
1466                 soupcall_clear(so, SO_SND);
1467         SOCKBUF_UNLOCK(&so->so_snd);
1468         SOCKBUF_LOCK(&so->so_rcv);
1469         if (so->so_rcv.sb_upcall != NULL)
1470                 soupcall_clear(so, SO_RCV);
1471         SOCKBUF_UNLOCK(&so->so_rcv);
1472         soclose(so);
1473         ICL_CONN_LOCK(ic);
1474
1475         if (ic->ic_receive_pdu != NULL) {
1476                 //ICL_DEBUG("freeing partially received PDU");
1477                 icl_soft_conn_pdu_free(ic, ic->ic_receive_pdu);
1478                 ic->ic_receive_pdu = NULL;
1479         }
1480
1481         /*
1482          * Remove any outstanding PDUs from the send queue.
1483          */
1484         while (!STAILQ_EMPTY(&ic->ic_to_send)) {
1485                 pdu = STAILQ_FIRST(&ic->ic_to_send);
1486                 STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
1487                 icl_soft_pdu_done(pdu, ENOTCONN);
1488         }
1489
1490         KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
1491             ("destroying session with non-empty send queue"));
1492         ICL_CONN_UNLOCK(ic);
1493 }
1494
1495 int
1496 icl_soft_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
1497     struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp)
1498 {
1499
1500         return (0);
1501 }
1502
/*
 * Task completion hook.  Intentionally empty in this implementation.
 */
void
icl_soft_conn_task_done(struct icl_conn *ic, void *prv)
{
}
1507
1508 int
1509 icl_soft_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
1510     uint32_t *transfer_tag, void **prvp)
1511 {
1512
1513         return (0);
1514 }
1515
/*
 * Transfer completion hook.  Intentionally empty in this implementation.
 */
void
icl_soft_conn_transfer_done(struct icl_conn *ic, void *prv)
{
}
1520
1521 static int
1522 icl_soft_limits(struct icl_drv_limits *idl)
1523 {
1524
1525         idl->idl_max_recv_data_segment_length = 128 * 1024;
1526         idl->idl_max_send_data_segment_length = 128 * 1024;
1527         idl->idl_max_burst_length = 262144;
1528         idl->idl_first_burst_length = 65536;
1529
1530         return (0);
1531 }
1532
1533 #ifdef ICL_KERNEL_PROXY
/*
 * Kernel-proxy connect entry point; forwards all arguments unchanged to
 * icl_soft_proxy_connect() and returns its result.
 */
int
icl_soft_conn_connect(struct icl_conn *ic, int domain, int socktype,
    int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
{
        int error;

        error = icl_soft_proxy_connect(ic, domain, socktype, protocol,
            from_sa, to_sa);

        return (error);
}
1542
1543 int
1544 icl_soft_handoff_sock(struct icl_conn *ic, struct socket *so)
1545 {
1546         int error;
1547
1548         ICL_CONN_LOCK_ASSERT_NOT(ic);
1549
1550         if (so->so_type != SOCK_STREAM)
1551                 return (EINVAL);
1552
1553         ICL_CONN_LOCK(ic);
1554         if (ic->ic_socket != NULL) {
1555                 ICL_CONN_UNLOCK(ic);
1556                 return (EBUSY);
1557         }
1558         ic->ic_socket = so;
1559         ICL_CONN_UNLOCK(ic);
1560
1561         error = icl_conn_start(ic);
1562
1563         return (error);
1564 }
1565 #endif /* ICL_KERNEL_PROXY */
1566
1567 static int
1568 icl_soft_load(void)
1569 {
1570         int error;
1571
1572         icl_soft_pdu_zone = uma_zcreate("icl_soft_pdu",
1573             sizeof(struct icl_soft_pdu), NULL, NULL, NULL, NULL,
1574             UMA_ALIGN_PTR, 0);
1575         refcount_init(&icl_ncons, 0);
1576
1577         /*
1578          * The reason we call this "none" is that to the user,
1579          * it's known as "offload driver"; "offload driver: soft"
1580          * doesn't make much sense.
1581          */
1582         error = icl_register("none", false, 0,
1583             icl_soft_limits, icl_soft_new_conn);
1584         KASSERT(error == 0, ("failed to register"));
1585
1586 #if defined(ICL_KERNEL_PROXY) && 0
1587         /*
1588          * Debugging aid for kernel proxy functionality.
1589          */
1590         error = icl_register("proxytest", true, 0,
1591             icl_soft_limits, icl_soft_new_conn);
1592         KASSERT(error == 0, ("failed to register"));
1593 #endif
1594
1595         return (error);
1596 }
1597
1598 static int
1599 icl_soft_unload(void)
1600 {
1601
1602         if (icl_ncons != 0)
1603                 return (EBUSY);
1604
1605         icl_unregister("none", false);
1606 #if defined(ICL_KERNEL_PROXY) && 0
1607         icl_unregister("proxytest", true);
1608 #endif
1609
1610         uma_zdestroy(icl_soft_pdu_zone);
1611
1612         return (0);
1613 }
1614
1615 static int
1616 icl_soft_modevent(module_t mod, int what, void *arg)
1617 {
1618
1619         switch (what) {
1620         case MOD_LOAD:
1621                 return (icl_soft_load());
1622         case MOD_UNLOAD:
1623                 return (icl_soft_unload());
1624         default:
1625                 return (EINVAL);
1626         }
1627 }
1628
1629 moduledata_t icl_soft_data = {
1630         "icl_soft",
1631         icl_soft_modevent,
1632         0
1633 };
1634
1635 DECLARE_MODULE(icl_soft, icl_soft_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
1636 MODULE_DEPEND(icl_soft, icl, 1, 1, 1);
1637 MODULE_VERSION(icl_soft, 1);