]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/iscsi/icl.c
Get rid of ICL lock; use upper-layer (initiator or target) lock instead.
[FreeBSD/FreeBSD.git] / sys / dev / iscsi / icl.c
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 /*
33  * iSCSI Common Layer.  It's used by both the initiator and target to send
34  * and receive iSCSI PDUs.
35  */
36
37 #include <sys/param.h>
38 #include <sys/capsicum.h>
39 #include <sys/condvar.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/mbuf.h>
46 #include <sys/mutex.h>
47 #include <sys/module.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/sx.h>
53 #include <sys/uio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57
58 #include "icl.h"
59 #include "iscsi_proto.h"
60
61 SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62 static int debug = 1;
63 TUNABLE_INT("kern.icl.debug", &debug);
64 SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65     &debug, 1, "Enable debug messages");
66 static int partial_receive_len = 1 * 1024; /* XXX: More? */
67 TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69     &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70     "data segment");
71
72 static uma_zone_t icl_conn_zone;
73 static uma_zone_t icl_pdu_zone;
74
75 static volatile u_int   icl_ncons;
76
/*
 * Verbose trace message: prints "<function>: <message>" followed by a
 * newline, but only when the debug level is 2 or higher.  X must be a
 * string literal, since it is pasted between two other literals.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug >= 2) {					\
			printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
		}							\
	} while (0)
82
/*
 * Warning message: prints "WARNING: <function>: <message>" followed by a
 * newline whenever the debug level is 1 or higher.  X must be a string
 * literal, since it is pasted between two other literals.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug >= 1)						\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
	} while (0)
90
/*
 * Connection locking helpers.  The mutex is not owned by ICL; ic_lock is
 * the pointer handed to icl_conn_new() by the caller.  The argument is
 * parenthesized so that any pointer expression (not just a plain
 * identifier) can be passed safely.
 */
#define	ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define	ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define	ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define	ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
95
96 static void
97 icl_conn_fail(struct icl_conn *ic)
98 {
99         if (ic->ic_socket == NULL)
100                 return;
101
102         /*
103          * XXX
104          */
105         ic->ic_socket->so_error = EDOOFUS;
106         (ic->ic_error)(ic);
107 }
108
109 static struct mbuf *
110 icl_conn_receive(struct icl_conn *ic, size_t len)
111 {
112         struct uio uio;
113         struct socket *so;
114         struct mbuf *m;
115         int error, flags;
116
117         so = ic->ic_socket;
118
119         memset(&uio, 0, sizeof(uio));
120         uio.uio_resid = len;
121
122         flags = MSG_DONTWAIT;
123         error = soreceive(so, NULL, &uio, &m, NULL, &flags);
124         if (error != 0) {
125                 ICL_DEBUG("soreceive error %d", error);
126                 return (NULL);
127         }
128         if (uio.uio_resid != 0) {
129                 m_freem(m);
130                 ICL_DEBUG("short read");
131                 return (NULL);
132         }
133
134         return (m);
135 }
136
137 static struct icl_pdu *
138 icl_pdu_new(struct icl_conn *ic, int flags)
139 {
140         struct icl_pdu *ip;
141
142 #ifdef DIAGNOSTIC
143         refcount_acquire(&ic->ic_outstanding_pdus);
144 #endif
145         ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
146         if (ip == NULL) {
147                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
148 #ifdef DIAGNOSTIC
149                 refcount_release(&ic->ic_outstanding_pdus);
150 #endif
151                 return (NULL);
152         }
153
154         ip->ip_conn = ic;
155
156         return (ip);
157 }
158
159 void
160 icl_pdu_free(struct icl_pdu *ip)
161 {
162         struct icl_conn *ic;
163
164         ic = ip->ip_conn;
165
166         m_freem(ip->ip_bhs_mbuf);
167         m_freem(ip->ip_ahs_mbuf);
168         m_freem(ip->ip_data_mbuf);
169         uma_zfree(icl_pdu_zone, ip);
170 #ifdef DIAGNOSTIC
171         refcount_release(&ic->ic_outstanding_pdus);
172 #endif
173 }
174
175 /*
176  * Allocate icl_pdu with empty BHS to fill up by the caller.
177  */
178 struct icl_pdu *
179 icl_pdu_new_bhs(struct icl_conn *ic, int flags)
180 {
181         struct icl_pdu *ip;
182
183         ip = icl_pdu_new(ic, flags);
184         if (ip == NULL)
185                 return (NULL);
186
187         ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
188             flags, MT_DATA, M_PKTHDR);
189         if (ip->ip_bhs_mbuf == NULL) {
190                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
191                 icl_pdu_free(ip);
192                 return (NULL);
193         }
194         ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
195         memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
196         ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
197
198         return (ip);
199 }
200
201 static int
202 icl_pdu_ahs_length(const struct icl_pdu *request)
203 {
204
205         return (request->ip_bhs->bhs_total_ahs_len * 4);
206 }
207
208 size_t
209 icl_pdu_data_segment_length(const struct icl_pdu *request)
210 {
211         uint32_t len = 0;
212
213         len += request->ip_bhs->bhs_data_segment_len[0];
214         len <<= 8;
215         len += request->ip_bhs->bhs_data_segment_len[1];
216         len <<= 8;
217         len += request->ip_bhs->bhs_data_segment_len[2];
218
219         return (len);
220 }
221
222 static void
223 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
224 {
225
226         response->ip_bhs->bhs_data_segment_len[2] = len;
227         response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
228         response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
229 }
230
231 static size_t
232 icl_pdu_padding(const struct icl_pdu *ip)
233 {
234
235         if ((ip->ip_data_len % 4) != 0)
236                 return (4 - (ip->ip_data_len % 4));
237
238         return (0);
239 }
240
241 static size_t
242 icl_pdu_size(const struct icl_pdu *response)
243 {
244         size_t len;
245
246         KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
247
248         len = sizeof(struct iscsi_bhs) + response->ip_data_len +
249             icl_pdu_padding(response);
250         if (response->ip_conn->ic_header_crc32c)
251                 len += ISCSI_HEADER_DIGEST_SIZE;
252         if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
253                 len += ISCSI_DATA_DIGEST_SIZE;
254
255         return (len);
256 }
257
258 static int
259 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
260 {
261         struct mbuf *m;
262
263         m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
264         if (m == NULL) {
265                 ICL_DEBUG("failed to receive BHS");
266                 return (-1);
267         }
268
269         request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
270         if (request->ip_bhs_mbuf == NULL) {
271                 ICL_WARN("m_pullup failed");
272                 return (-1);
273         }
274         request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
275
276         /*
277          * XXX: For architectures with strict alignment requirements
278          *      we may need to allocate ip_bhs and copy the data into it.
279          *      For some reason, though, not doing this doesn't seem
280          *      to cause problems; tested on sparc64.
281          */
282
283         *availablep -= sizeof(struct iscsi_bhs);
284         return (0);
285 }
286
287 static int
288 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
289 {
290
291         request->ip_ahs_len = icl_pdu_ahs_length(request);
292         if (request->ip_ahs_len == 0)
293                 return (0);
294
295         request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
296             request->ip_ahs_len);
297         if (request->ip_ahs_mbuf == NULL) {
298                 ICL_DEBUG("failed to receive AHS");
299                 return (-1);
300         }
301
302         *availablep -= request->ip_ahs_len;
303         return (0);
304 }
305
306 static uint32_t
307 icl_mbuf_to_crc32c(const struct mbuf *m0)
308 {
309         uint32_t digest = 0xffffffff;
310         const struct mbuf *m;
311
312         for (m = m0; m != NULL; m = m->m_next)
313                 digest = calculate_crc32c(digest,
314                     mtod(m, const void *), m->m_len);
315
316         digest = digest ^ 0xffffffff;
317
318         return (digest);
319 }
320
321 static int
322 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
323 {
324         struct mbuf *m;
325         uint32_t received_digest, valid_digest;
326
327         if (request->ip_conn->ic_header_crc32c == false)
328                 return (0);
329
330         m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
331         if (m == NULL) {
332                 ICL_DEBUG("failed to receive header digest");
333                 return (-1);
334         }
335
336         CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
337         m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
338         m_freem(m);
339
340         *availablep -= ISCSI_HEADER_DIGEST_SIZE;
341
342         /*
343          * XXX: Handle AHS.
344          */
345         valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
346         if (received_digest != valid_digest) {
347                 ICL_WARN("header digest check failed; got 0x%x, "
348                     "should be 0x%x", received_digest, valid_digest);
349                 return (-1);
350         }
351
352         return (0);
353 }
354
355 /*
356  * Return the number of bytes that should be waiting in the receive socket
357  * before icl_pdu_receive_data_segment() gets called.
358  */
359 static size_t
360 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
361 {
362         size_t len;
363
364         len = icl_pdu_data_segment_length(request);
365         if (len == 0)
366                 return (0);
367
368         /*
369          * Account for the parts of data segment already read from
370          * the socket buffer.
371          */
372         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
373         len -= request->ip_data_len;
374
375         /*
376          * Don't always wait for the full data segment to be delivered
377          * to the socket; this might badly affect performance due to
378          * TCP window scaling.
379          */
380         if (len > partial_receive_len) {
381 #if 0
382                 ICL_DEBUG("need %zd bytes of data, limiting to %zd",
383                     len, partial_receive_len));
384 #endif
385                 len = partial_receive_len;
386
387                 return (len);
388         }
389
390         /*
391          * Account for padding.  Note that due to the way code is written,
392          * the icl_pdu_receive_data_segment() must always receive padding
393          * along with the last part of data segment, because it would be
394          * impossible to tell whether we've already received the full data
395          * segment including padding, or without it.
396          */
397         if ((len % 4) != 0)
398                 len += 4 - (len % 4);
399
400 #if 0
401         ICL_DEBUG("need %zd bytes of data", len));
402 #endif
403
404         return (len);
405 }
406
407 static int
408 icl_pdu_receive_data_segment(struct icl_pdu *request,
409     size_t *availablep, bool *more_neededp)
410 {
411         struct icl_conn *ic;
412         size_t len, padding = 0;
413         struct mbuf *m;
414
415         ic = request->ip_conn;
416
417         *more_neededp = false;
418         ic->ic_receive_len = 0;
419
420         len = icl_pdu_data_segment_length(request);
421         if (len == 0)
422                 return (0);
423
424         if ((len % 4) != 0)
425                 padding = 4 - (len % 4);
426
427         /*
428          * Account for already received parts of data segment.
429          */
430         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
431         len -= request->ip_data_len;
432
433         if (len + padding > *availablep) {
434                 /*
435                  * Not enough data in the socket buffer.  Receive as much
436                  * as we can.  Don't receive padding, since, obviously, it's
437                  * not the end of data segment yet.
438                  */
439 #if 0
440                 ICL_DEBUG("limited from %zd to %zd",
441                     len + padding, *availablep - padding));
442 #endif
443                 len = *availablep - padding;
444                 *more_neededp = true;
445                 padding = 0;
446         }
447
448         /*
449          * Must not try to receive padding without at least one byte
450          * of actual data segment.
451          */
452         if (len > 0) {
453                 m = icl_conn_receive(request->ip_conn, len + padding);
454                 if (m == NULL) {
455                         ICL_DEBUG("failed to receive data segment");
456                         return (-1);
457                 }
458
459                 if (request->ip_data_mbuf == NULL)
460                         request->ip_data_mbuf = m;
461                 else
462                         m_cat(request->ip_data_mbuf, m);
463
464                 request->ip_data_len += len;
465                 *availablep -= len + padding;
466         } else
467                 ICL_DEBUG("len 0");
468
469         if (*more_neededp)
470                 ic->ic_receive_len =
471                     icl_pdu_data_segment_receive_len(request);
472
473         return (0);
474 }
475
476 static int
477 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
478 {
479         struct mbuf *m;
480         uint32_t received_digest, valid_digest;
481
482         if (request->ip_conn->ic_data_crc32c == false)
483                 return (0);
484
485         if (request->ip_data_len == 0)
486                 return (0);
487
488         m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
489         if (m == NULL) {
490                 ICL_DEBUG("failed to receive data digest");
491                 return (-1);
492         }
493
494         CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
495         m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
496         m_freem(m);
497
498         *availablep -= ISCSI_DATA_DIGEST_SIZE;
499
500         /*
501          * Note that ip_data_mbuf also contains padding; since digest
502          * calculation is supposed to include that, we iterate over
503          * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
504          */
505         valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
506         if (received_digest != valid_digest) {
507                 ICL_WARN("data digest check failed; got 0x%x, "
508                     "should be 0x%x", received_digest, valid_digest);
509                 return (-1);
510         }
511
512         return (0);
513 }
514
515 /*
516  * Somewhat contrary to the name, this attempts to receive only one
517  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
518  */
519 static struct icl_pdu *
520 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
521 {
522         struct icl_pdu *request;
523         struct socket *so;
524         size_t len;
525         int error;
526         bool more_needed;
527
528         so = ic->ic_socket;
529
530         if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
531                 KASSERT(ic->ic_receive_pdu == NULL,
532                     ("ic->ic_receive_pdu != NULL"));
533                 request = icl_pdu_new(ic, M_NOWAIT);
534                 if (request == NULL) {
535                         ICL_DEBUG("failed to allocate PDU; "
536                             "dropping connection");
537                         icl_conn_fail(ic);
538                         return (NULL);
539                 }
540                 ic->ic_receive_pdu = request;
541         } else {
542                 KASSERT(ic->ic_receive_pdu != NULL,
543                     ("ic->ic_receive_pdu == NULL"));
544                 request = ic->ic_receive_pdu;
545         }
546
547         if (*availablep < ic->ic_receive_len) {
548 #if 0
549                 ICL_DEBUG("not enough data; need %zd, "
550                     "have %zd", ic->ic_receive_len, *availablep);
551 #endif
552                 return (NULL);
553         }
554
555         switch (ic->ic_receive_state) {
556         case ICL_CONN_STATE_BHS:
557                 //ICL_DEBUG("receiving BHS");
558                 error = icl_pdu_receive_bhs(request, availablep);
559                 if (error != 0) {
560                         ICL_DEBUG("failed to receive BHS; "
561                             "dropping connection");
562                         break;
563                 }
564
565                 /*
566                  * We don't enforce any limit for AHS length;
567                  * its length is stored in 8 bit field.
568                  */
569
570                 len = icl_pdu_data_segment_length(request);
571                 if (len > ic->ic_max_data_segment_length) {
572                         ICL_WARN("received data segment "
573                             "length %zd is larger than negotiated "
574                             "MaxDataSegmentLength %zd; "
575                             "dropping connection",
576                             len, ic->ic_max_data_segment_length);
577                         error = EINVAL;
578                         break;
579                 }
580
581                 ic->ic_receive_state = ICL_CONN_STATE_AHS;
582                 ic->ic_receive_len = icl_pdu_ahs_length(request);
583                 break;
584
585         case ICL_CONN_STATE_AHS:
586                 //ICL_DEBUG("receiving AHS");
587                 error = icl_pdu_receive_ahs(request, availablep);
588                 if (error != 0) {
589                         ICL_DEBUG("failed to receive AHS; "
590                             "dropping connection");
591                         break;
592                 }
593                 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
594                 if (ic->ic_header_crc32c == false)
595                         ic->ic_receive_len = 0;
596                 else
597                         ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
598                 break;
599
600         case ICL_CONN_STATE_HEADER_DIGEST:
601                 //ICL_DEBUG("receiving header digest");
602                 error = icl_pdu_check_header_digest(request, availablep);
603                 if (error != 0) {
604                         ICL_DEBUG("header digest failed; "
605                             "dropping connection");
606                         break;
607                 }
608
609                 ic->ic_receive_state = ICL_CONN_STATE_DATA;
610                 ic->ic_receive_len =
611                     icl_pdu_data_segment_receive_len(request);
612                 break;
613
614         case ICL_CONN_STATE_DATA:
615                 //ICL_DEBUG("receiving data segment");
616                 error = icl_pdu_receive_data_segment(request, availablep,
617                     &more_needed);
618                 if (error != 0) {
619                         ICL_DEBUG("failed to receive data segment;"
620                             "dropping connection");
621                         break;
622                 }
623
624                 if (more_needed)
625                         break;
626
627                 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
628                 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
629                         ic->ic_receive_len = 0;
630                 else
631                         ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
632                 break;
633
634         case ICL_CONN_STATE_DATA_DIGEST:
635                 //ICL_DEBUG("receiving data digest");
636                 error = icl_pdu_check_data_digest(request, availablep);
637                 if (error != 0) {
638                         ICL_DEBUG("data digest failed; "
639                             "dropping connection");
640                         break;
641                 }
642
643                 /*
644                  * We've received complete PDU; reset the receive state machine
645                  * and return the PDU.
646                  */
647                 ic->ic_receive_state = ICL_CONN_STATE_BHS;
648                 ic->ic_receive_len = sizeof(struct iscsi_bhs);
649                 ic->ic_receive_pdu = NULL;
650                 return (request);
651
652         default:
653                 panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
654         }
655
656         if (error != 0) {
657                 icl_pdu_free(request);
658                 icl_conn_fail(ic);
659         }
660
661         return (NULL);
662 }
663
664 static void
665 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
666 {
667         struct icl_pdu *response;
668         struct socket *so;
669
670         so = ic->ic_socket;
671
672         /*
673          * This can never happen; we're careful to only mess with ic->ic_socket
674          * pointer when the send/receive threads are not running.
675          */
676         KASSERT(so != NULL, ("NULL socket"));
677
678         for (;;) {
679                 if (ic->ic_disconnecting)
680                         return;
681
682                 if (so->so_error != 0) {
683                         ICL_DEBUG("connection error %d; "
684                             "dropping connection", so->so_error);
685                         icl_conn_fail(ic);
686                         return;
687                 }
688
689                 /*
690                  * Loop until we have a complete PDU or there is not enough
691                  * data in the socket buffer.
692                  */
693                 if (available < ic->ic_receive_len) {
694 #if 0
695                         ICL_DEBUG("not enough data; have %zd, "
696                             "need %zd", available,
697                             ic->ic_receive_len);
698 #endif
699                         return;
700                 }
701
702                 response = icl_conn_receive_pdu(ic, &available);
703                 if (response == NULL)
704                         continue;
705
706                 if (response->ip_ahs_len > 0) {
707                         ICL_WARN("received PDU with unsupported "
708                             "AHS; opcode 0x%x; dropping connection",
709                             response->ip_bhs->bhs_opcode);
710                         icl_pdu_free(response);
711                         icl_conn_fail(ic);
712                         return;
713                 }
714
715                 (ic->ic_receive)(response);
716         }
717 }
718
719 static void
720 icl_receive_thread(void *arg)
721 {
722         struct icl_conn *ic;
723         size_t available;
724         struct socket *so;
725
726         ic = arg;
727         so = ic->ic_socket;
728
729         ICL_CONN_LOCK(ic);
730         ic->ic_receive_running = true;
731         ICL_CONN_UNLOCK(ic);
732
733         for (;;) {
734                 if (ic->ic_disconnecting) {
735                         //ICL_DEBUG("terminating");
736                         break;
737                 }
738
739                 SOCKBUF_LOCK(&so->so_rcv);
740                 available = so->so_rcv.sb_cc;
741                 if (available < ic->ic_receive_len) {
742                         so->so_rcv.sb_lowat = ic->ic_receive_len;
743                         cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
744                 }
745                 SOCKBUF_UNLOCK(&so->so_rcv);
746
747                 icl_conn_receive_pdus(ic, available);
748         }
749
750         ICL_CONN_LOCK(ic);
751         ic->ic_receive_running = false;
752         ICL_CONN_UNLOCK(ic);
753         kthread_exit();
754 }
755
756 static int
757 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
758 {
759         struct icl_conn *ic;
760
761         ic = arg;
762         cv_signal(&ic->ic_receive_cv);
763         return (SU_OK);
764 }
765
766 static int
767 icl_pdu_send(struct icl_pdu *request)
768 {
769         size_t padding, pdu_len;
770         uint32_t digest, zero = 0;
771         int error, ok;
772         struct socket *so;
773         struct icl_conn *ic;
774
775         ic = request->ip_conn;
776         so = request->ip_conn->ic_socket;
777
778         ICL_CONN_LOCK_ASSERT(ic);
779
780         icl_pdu_set_data_segment_length(request, request->ip_data_len);
781
782         pdu_len = icl_pdu_size(request);
783
784         if (ic->ic_header_crc32c) {
785                 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
786                 ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
787                     (void *)&digest);
788                 if (ok != 1) {
789                         ICL_WARN("failed to append header digest");
790                         return (1);
791                 }
792         }
793
794         if (request->ip_data_len != 0) {
795                 padding = icl_pdu_padding(request);
796                 if (padding > 0) {
797                         ok = m_append(request->ip_data_mbuf, padding,
798                             (void *)&zero);
799                         if (ok != 1) {
800                                 ICL_WARN("failed to append padding");
801                                 return (1);
802                         }
803                 }
804
805                 if (ic->ic_data_crc32c) {
806                         digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
807
808                         ok = m_append(request->ip_data_mbuf, sizeof(digest),
809                             (void *)&digest);
810                         if (ok != 1) {
811                                 ICL_WARN("failed to append header digest");
812                                 return (1);
813                         }
814                 }
815
816                 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
817                 request->ip_data_mbuf = NULL;
818         }
819
820         request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
821
822         error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
823             NULL, MSG_DONTWAIT, curthread);
824         request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
825         if (error != 0) {
826                 ICL_DEBUG("sosend error %d", error);
827                 return (error);
828         }
829
830         return (0);
831 }
832
833 static void
834 icl_conn_send_pdus(struct icl_conn *ic)
835 {
836         struct icl_pdu *request;
837         struct socket *so;
838         size_t available, size;
839         int error;
840
841         ICL_CONN_LOCK_ASSERT(ic);
842
843         so = ic->ic_socket;
844
845         SOCKBUF_LOCK(&so->so_snd);
846         available = sbspace(&so->so_snd);
847         SOCKBUF_UNLOCK(&so->so_snd);
848
849         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
850                 if (ic->ic_disconnecting)
851                         return;
852
853                 request = TAILQ_FIRST(&ic->ic_to_send);
854                 size = icl_pdu_size(request);
855                 if (available < size) {
856                         /*
857                          * Set the low watermark on the socket,
858                          * to avoid waking up until there is enough
859                          * space.
860                          */
861                         SOCKBUF_LOCK(&so->so_snd);
862                         so->so_snd.sb_lowat = size;
863                         SOCKBUF_UNLOCK(&so->so_snd);
864 #if 1
865                         ICL_DEBUG("no space to send; "
866                             "have %zd, need %zd",
867                             available, size);
868 #endif
869                         return;
870                 }
871                 available -= size;
872                 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
873                 error = icl_pdu_send(request);
874                 if (error != 0) {
875                         ICL_DEBUG("failed to send PDU; "
876                             "dropping connection");
877                         icl_conn_fail(ic);
878                         return;
879                 } 
880                 icl_pdu_free(request);
881         }
882 }
883
884 static void
885 icl_send_thread(void *arg)
886 {
887         struct icl_conn *ic;
888
889         ic = arg;
890
891         ICL_CONN_LOCK(ic);
892         ic->ic_send_running = true;
893
894         for (;;) {
895                 if (ic->ic_disconnecting) {
896                         //ICL_DEBUG("terminating");
897                         break;
898                 }
899                 icl_conn_send_pdus(ic);
900                 cv_wait(&ic->ic_send_cv, ic->ic_lock);
901         }
902
903         ic->ic_send_running = false;
904         ICL_CONN_UNLOCK(ic);
905         kthread_exit();
906 }
907
908 static int
909 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
910 {
911         struct icl_conn *ic;
912
913         ic = arg;
914         cv_signal(&ic->ic_send_cv);
915         return (SU_OK);
916 }
917
918 int
919 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
920 {
921         struct mbuf *mb, *newmb;
922         size_t copylen, off = 0;
923
924         KASSERT(len > 0, ("len == 0"));
925
926         newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
927         if (newmb == NULL) {
928                 ICL_WARN("failed to allocate mbuf for %zd bytes", len);
929                 return (ENOMEM);
930         }
931
932         for (mb = newmb; mb != NULL; mb = mb->m_next) {
933                 copylen = min(M_TRAILINGSPACE(mb), len - off);
934                 memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
935                 mb->m_len = copylen;
936                 off += copylen;
937         }
938         KASSERT(off == len, ("%s: off != len", __func__));
939
940         if (request->ip_data_mbuf == NULL) {
941                 request->ip_data_mbuf = newmb;
942                 request->ip_data_len = len;
943         } else {
944                 m_cat(request->ip_data_mbuf, newmb);
945                 request->ip_data_len += len;
946         }
947
948         return (0);
949 }
950
951 void
952 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
953 {
954
955         m_copydata(ip->ip_data_mbuf, off, len, addr);
956 }
957
958 void
959 icl_pdu_queue(struct icl_pdu *ip)
960 {
961         struct icl_conn *ic;
962
963         ic = ip->ip_conn;
964
965         ICL_CONN_LOCK_ASSERT(ic);
966
967         if (ic->ic_disconnecting || ic->ic_socket == NULL) {
968                 ICL_DEBUG("icl_pdu_queue on closed connection");
969                 icl_pdu_free(ip);
970                 return;
971         }
972         TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
973         cv_signal(&ic->ic_send_cv);
974 }
975
976 struct icl_conn *
977 icl_conn_new(struct mtx *lock)
978 {
979         struct icl_conn *ic;
980
981         refcount_acquire(&icl_ncons);
982
983         ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
984
985         TAILQ_INIT(&ic->ic_to_send);
986         ic->ic_lock = lock;
987         cv_init(&ic->ic_send_cv, "icl_tx");
988         cv_init(&ic->ic_receive_cv, "icl_rx");
989 #ifdef DIAGNOSTIC
990         refcount_init(&ic->ic_outstanding_pdus, 0);
991 #endif
992         ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
993
994         return (ic);
995 }
996
997 void
998 icl_conn_free(struct icl_conn *ic)
999 {
1000
1001         cv_destroy(&ic->ic_send_cv);
1002         cv_destroy(&ic->ic_receive_cv);
1003         uma_zfree(icl_conn_zone, ic);
1004         refcount_release(&icl_ncons);
1005 }
1006
1007 static int
1008 icl_conn_start(struct icl_conn *ic)
1009 {
1010         size_t bufsize;
1011         struct sockopt opt;
1012         int error, one = 1;
1013
1014         ICL_CONN_LOCK(ic);
1015
1016         /*
1017          * XXX: Ugly hack.
1018          */
1019         if (ic->ic_socket == NULL) {
1020                 ICL_CONN_UNLOCK(ic);
1021                 return (EINVAL);
1022         }
1023
1024         ic->ic_receive_state = ICL_CONN_STATE_BHS;
1025         ic->ic_receive_len = sizeof(struct iscsi_bhs);
1026         ic->ic_disconnecting = false;
1027
1028         ICL_CONN_UNLOCK(ic);
1029
1030         /*
1031          * Use max available sockbuf size for sending.  Do it manually
1032          * instead of sbreserve(9) to work around resource limits.
1033          *
1034          * XXX: This kind of sucks.  On one hand, we don't currently support
1035          *      sending a part of data segment; we always do it in one piece,
1036          *      so we have to make sure it can fit in the socket buffer.
1037          *      Once I've implemented partial send, we'll get rid of this
1038          *      and use autoscaling.
1039          */
1040         bufsize = (sizeof(struct iscsi_bhs) +
1041             ic->ic_max_data_segment_length) * 8;
1042         error = soreserve(ic->ic_socket, bufsize, bufsize);
1043         if (error != 0) {
1044                 ICL_WARN("soreserve failed with error %d", error);
1045                 icl_conn_close(ic);
1046                 return (error);
1047         }
1048
1049         /*
1050          * Disable Nagle.
1051          */
1052         bzero(&opt, sizeof(opt));
1053         opt.sopt_dir = SOPT_SET;
1054         opt.sopt_level = IPPROTO_TCP;
1055         opt.sopt_name = TCP_NODELAY;
1056         opt.sopt_val = &one;
1057         opt.sopt_valsize = sizeof(one);
1058         error = sosetopt(ic->ic_socket, &opt);
1059         if (error != 0) {
1060                 ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1061                 icl_conn_close(ic);
1062                 return (error);
1063         }
1064
1065         /*
1066          * Start threads.
1067          */
1068         error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1069         if (error != 0) {
1070                 ICL_WARN("kthread_add(9) failed with error %d", error);
1071                 icl_conn_close(ic);
1072                 return (error);
1073         }
1074
1075         error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1076         if (error != 0) {
1077                 ICL_WARN("kthread_add(9) failed with error %d", error);
1078                 icl_conn_close(ic);
1079                 return (error);
1080         }
1081
1082         /*
1083          * Register socket upcall, to get notified about incoming PDUs
1084          * and free space to send outgoing ones.
1085          */
1086         SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1087         soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1088         SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1089         SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1090         soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1091         SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1092
1093         return (0);
1094 }
1095
1096 int
1097 icl_conn_handoff(struct icl_conn *ic, int fd)
1098 {
1099         struct file *fp;
1100         struct socket *so;
1101         cap_rights_t rights;
1102         int error;
1103
1104         ICL_CONN_LOCK_ASSERT_NOT(ic);
1105
1106         /*
1107          * Steal the socket from userland.
1108          */
1109         error = fget(curthread, fd,
1110             cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1111         if (error != 0)
1112                 return (error);
1113         if (fp->f_type != DTYPE_SOCKET) {
1114                 fdrop(fp, curthread);
1115                 return (EINVAL);
1116         }
1117         so = fp->f_data;
1118         if (so->so_type != SOCK_STREAM) {
1119                 fdrop(fp, curthread);
1120                 return (EINVAL);
1121         }
1122
1123         ICL_CONN_LOCK(ic);
1124
1125         if (ic->ic_socket != NULL) {
1126                 ICL_CONN_UNLOCK(ic);
1127                 fdrop(fp, curthread);
1128                 return (EBUSY);
1129         }
1130
1131         ic->ic_socket = fp->f_data;
1132         fp->f_ops = &badfileops;
1133         fp->f_data = NULL;
1134         fdrop(fp, curthread);
1135         ICL_CONN_UNLOCK(ic);
1136
1137         error = icl_conn_start(ic);
1138
1139         return (error);
1140 }
1141
1142 void
1143 icl_conn_shutdown(struct icl_conn *ic)
1144 {
1145         ICL_CONN_LOCK_ASSERT_NOT(ic);
1146
1147         ICL_CONN_LOCK(ic);
1148         if (ic->ic_socket == NULL) {
1149                 ICL_CONN_UNLOCK(ic);
1150                 return;
1151         }
1152         ICL_CONN_UNLOCK(ic);
1153
1154         soshutdown(ic->ic_socket, SHUT_RDWR);
1155 }
1156
1157 void
1158 icl_conn_close(struct icl_conn *ic)
1159 {
1160         struct icl_pdu *pdu;
1161
1162         ICL_CONN_LOCK_ASSERT_NOT(ic);
1163
1164         ICL_CONN_LOCK(ic);
1165         if (ic->ic_socket == NULL) {
1166                 ICL_CONN_UNLOCK(ic);
1167                 return;
1168         }
1169
1170         ic->ic_disconnecting = true;
1171
1172         /*
1173          * Wake up the threads, so they can properly terminate.
1174          */
1175         cv_signal(&ic->ic_receive_cv);
1176         cv_signal(&ic->ic_send_cv);
1177         while (ic->ic_receive_running || ic->ic_send_running) {
1178                 //ICL_DEBUG("waiting for send/receive threads to terminate");
1179                 ICL_CONN_UNLOCK(ic);
1180                 cv_signal(&ic->ic_receive_cv);
1181                 cv_signal(&ic->ic_send_cv);
1182                 pause("icl_close", 1 * hz);
1183                 ICL_CONN_LOCK(ic);
1184         }
1185         //ICL_DEBUG("send/receive threads terminated");
1186
1187         soclose(ic->ic_socket);
1188         ic->ic_socket = NULL;
1189
1190         if (ic->ic_receive_pdu != NULL) {
1191                 //ICL_DEBUG("freeing partially received PDU");
1192                 icl_pdu_free(ic->ic_receive_pdu);
1193                 ic->ic_receive_pdu = NULL;
1194         }
1195
1196         /*
1197          * Remove any outstanding PDUs from the send queue.
1198          */
1199         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1200                 pdu = TAILQ_FIRST(&ic->ic_to_send);
1201                 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1202                 icl_pdu_free(pdu);
1203         }
1204
1205         KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1206             ("destroying session with non-empty send queue"));
1207         /*
1208          * XXX
1209          */
1210 #if 0
1211         KASSERT(ic->ic_outstanding_pdus == 0,
1212             ("destroying session with %d outstanding PDUs",
1213              ic->ic_outstanding_pdus));
1214 #endif
1215         ICL_CONN_UNLOCK(ic);
1216 }
1217
1218 bool
1219 icl_conn_connected(struct icl_conn *ic)
1220 {
1221         ICL_CONN_LOCK_ASSERT_NOT(ic);
1222
1223         ICL_CONN_LOCK(ic);
1224         if (ic->ic_socket == NULL) {
1225                 ICL_CONN_UNLOCK(ic);
1226                 return (false);
1227         }
1228         if (ic->ic_socket->so_error != 0) {
1229                 ICL_CONN_UNLOCK(ic);
1230                 return (false);
1231         }
1232         ICL_CONN_UNLOCK(ic);
1233         return (true);
1234 }
1235
#ifdef ICL_KERNEL_PROXY
/*
 * Kernel-proxy variant of icl_conn_handoff(): attach an in-kernel
 * socket to the connection and start it.  Must be called without the
 * connection lock held.
 *
 * Returns EINVAL if so is not a SOCK_STREAM socket, EBUSY if the
 * connection already has a socket, and otherwise whatever
 * icl_conn_start() returns.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1260
1261 static int
1262 icl_unload(void)
1263 {
1264
1265         if (icl_ncons != 0)
1266                 return (EBUSY);
1267
1268         uma_zdestroy(icl_conn_zone);
1269         uma_zdestroy(icl_pdu_zone);
1270
1271         return (0);
1272 }
1273
1274 static void
1275 icl_load(void)
1276 {
1277
1278         icl_conn_zone = uma_zcreate("icl_conn",
1279             sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1280             UMA_ALIGN_PTR, 0);
1281         icl_pdu_zone = uma_zcreate("icl_pdu",
1282             sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1283             UMA_ALIGN_PTR, 0);
1284
1285         refcount_init(&icl_ncons, 0);
1286 }
1287
1288 static int
1289 icl_modevent(module_t mod, int what, void *arg)
1290 {
1291
1292         switch (what) {
1293         case MOD_LOAD:
1294                 icl_load();
1295                 return (0);
1296         case MOD_UNLOAD:
1297                 return (icl_unload());
1298         default:
1299                 return (EINVAL);
1300         }
1301 }
1302
1303 moduledata_t icl_data = {
1304         "icl",
1305         icl_modevent,
1306         0
1307 };
1308
1309 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1310 MODULE_VERSION(icl, 1);