CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - sys/dev/iscsi/icl.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / sys / dev / iscsi / icl.c
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 /*
33  * iSCSI Common Layer.  It's used by both the initiator and target to send
34  * and receive iSCSI PDUs.
35  */
36
37 #include <sys/param.h>
38 #include <sys/capability.h>
39 #include <sys/condvar.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/mbuf.h>
46 #include <sys/mutex.h>
47 #include <sys/module.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/sx.h>
53 #include <sys/uio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57
58 #include "icl.h"
59 #include "iscsi_proto.h"
60
/* Root of the kern.icl sysctl subtree. */
SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
/*
 * Verbosity level: ICL_WARN() prints when debug > 0, ICL_DEBUG() prints
 * when debug > 1.  Settable as the kern.icl.debug tunable and sysctl.
 */
static int debug = 1;
TUNABLE_INT("kern.icl.debug", &debug);
SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
    &debug, 1, "Enable debug messages");
/*
 * Upper bound on the number of bytes icl_pdu_data_segment_receive_len()
 * asks for when a data segment has more than this many bytes left.
 */
static int partial_receive_len = 1 * 1024; /* XXX: More? */
TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
    &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
    "data segment");

/* UMA zones backing struct icl_conn and struct icl_pdu allocations. */
static uma_zone_t icl_conn_zone;
static uma_zone_t icl_pdu_zone;

/* Connection count; bumped in icl_conn_new(), dropped in icl_conn_free(). */
static volatile u_int   icl_ncons;
76
/*
 * Print a debug message, prefixed with the calling function's name, when
 * the "debug" level is above 1.
 *
 * Wrapped in do { } while (0) so the expansion is exactly one statement
 * and can be used safely as the body of an unbraced if/else.
 */
#define ICL_DEBUG(X, ...)                                       \
        do {                                                    \
                if (debug > 1) {                                \
                        printf("%s: " X "\n",                   \
                            __func__, ## __VA_ARGS__);          \
                }                                               \
        } while (0)
81
/*
 * Print a warning, prefixed with "WARNING: " and the calling function's
 * name, when the "debug" level is above 0.
 *
 * Wrapped in do { } while (0) so the expansion is exactly one statement
 * and can be used safely as the body of an unbraced if/else.
 */
#define ICL_WARN(X, ...)                                        \
        do {                                                    \
                if (debug > 0) {                                \
                        printf("WARNING: %s: " X "\n",          \
                            __func__, ## __VA_ARGS__);          \
                }                                               \
        } while (0)
87
/*
 * Helpers for the per-connection mutex (ic_lock).  The argument is
 * parenthesized so that any pointer expression, not just a plain
 * identifier, expands correctly.
 */
#define ICL_CONN_LOCK(X)                mtx_lock(&(X)->ic_lock)
#define ICL_CONN_UNLOCK(X)              mtx_unlock(&(X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)         mtx_assert(&(X)->ic_lock, MA_OWNED)
91
92 static void
93 icl_conn_fail(struct icl_conn *ic)
94 {
95         if (ic->ic_socket == NULL)
96                 return;
97
98         /*
99          * XXX
100          */
101         ic->ic_socket->so_error = EDOOFUS;
102         (ic->ic_error)(ic);
103 }
104
105 static struct mbuf *
106 icl_conn_receive(struct icl_conn *ic, size_t len)
107 {
108         struct uio uio;
109         struct socket *so;
110         struct mbuf *m;
111         int error, flags;
112
113         so = ic->ic_socket;
114
115         memset(&uio, 0, sizeof(uio));
116         uio.uio_resid = len;
117
118         flags = MSG_DONTWAIT;
119         error = soreceive(so, NULL, &uio, &m, NULL, &flags);
120         if (error != 0) {
121                 ICL_DEBUG("soreceive error %d", error);
122                 return (NULL);
123         }
124         if (uio.uio_resid != 0) {
125                 m_freem(m);
126                 ICL_DEBUG("short read");
127                 return (NULL);
128         }
129
130         return (m);
131 }
132
133 static struct icl_pdu *
134 icl_pdu_new(struct icl_conn *ic, int flags)
135 {
136         struct icl_pdu *ip;
137
138         refcount_acquire(&ic->ic_outstanding_pdus);
139         ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
140         if (ip == NULL) {
141                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
142                 refcount_release(&ic->ic_outstanding_pdus);
143                 return (NULL);
144         }
145
146         ip->ip_conn = ic;
147
148         return (ip);
149 }
150
151 void
152 icl_pdu_free(struct icl_pdu *ip)
153 {
154         struct icl_conn *ic;
155
156         ic = ip->ip_conn;
157
158         m_freem(ip->ip_bhs_mbuf);
159         m_freem(ip->ip_ahs_mbuf);
160         m_freem(ip->ip_data_mbuf);
161         uma_zfree(icl_pdu_zone, ip);
162         refcount_release(&ic->ic_outstanding_pdus);
163 }
164
165 /*
166  * Allocate icl_pdu with empty BHS to fill up by the caller.
167  */
168 struct icl_pdu *
169 icl_pdu_new_bhs(struct icl_conn *ic, int flags)
170 {
171         struct icl_pdu *ip;
172
173         ip = icl_pdu_new(ic, flags);
174         if (ip == NULL)
175                 return (NULL);
176
177         ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
178             flags, MT_DATA, M_PKTHDR);
179         if (ip->ip_bhs_mbuf == NULL) {
180                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
181                 icl_pdu_free(ip);
182                 return (NULL);
183         }
184         ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
185         memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
186         ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
187
188         return (ip);
189 }
190
191 static int
192 icl_pdu_ahs_length(const struct icl_pdu *request)
193 {
194
195         return (request->ip_bhs->bhs_total_ahs_len * 4);
196 }
197
198 size_t
199 icl_pdu_data_segment_length(const struct icl_pdu *request)
200 {
201         uint32_t len = 0;
202
203         len += request->ip_bhs->bhs_data_segment_len[0];
204         len <<= 8;
205         len += request->ip_bhs->bhs_data_segment_len[1];
206         len <<= 8;
207         len += request->ip_bhs->bhs_data_segment_len[2];
208
209         return (len);
210 }
211
212 static void
213 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
214 {
215
216         response->ip_bhs->bhs_data_segment_len[2] = len;
217         response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
218         response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
219 }
220
221 static size_t
222 icl_pdu_padding(const struct icl_pdu *ip)
223 {
224
225         if ((ip->ip_data_len % 4) != 0)
226                 return (4 - (ip->ip_data_len % 4));
227
228         return (0);
229 }
230
231 static size_t
232 icl_pdu_size(const struct icl_pdu *response)
233 {
234         size_t len;
235
236         KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
237
238         len = sizeof(struct iscsi_bhs) + response->ip_data_len +
239             icl_pdu_padding(response);
240         if (response->ip_conn->ic_header_crc32c)
241                 len += ISCSI_HEADER_DIGEST_SIZE;
242         if (response->ip_conn->ic_data_crc32c)
243                 len += ISCSI_DATA_DIGEST_SIZE;
244
245         return (len);
246 }
247
248 static int
249 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
250 {
251         struct mbuf *m;
252
253         m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
254         if (m == NULL) {
255                 ICL_DEBUG("failed to receive BHS");
256                 return (-1);
257         }
258
259         request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
260         if (request->ip_bhs_mbuf == NULL) {
261                 ICL_WARN("m_pullup failed");
262                 return (-1);
263         }
264         request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
265
266         /*
267          * XXX: For architectures with strict alignment requirements
268          *      we may need to allocate ip_bhs and copy the data into it.
269          *      For some reason, though, not doing this doesn't seem
270          *      to cause problems; tested on sparc64.
271          */
272
273         *availablep -= sizeof(struct iscsi_bhs);
274         return (0);
275 }
276
277 static int
278 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
279 {
280
281         request->ip_ahs_len = icl_pdu_ahs_length(request);
282         if (request->ip_ahs_len == 0)
283                 return (0);
284
285         request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
286             request->ip_ahs_len);
287         if (request->ip_ahs_mbuf == NULL) {
288                 ICL_DEBUG("failed to receive AHS");
289                 return (-1);
290         }
291
292         *availablep -= request->ip_ahs_len;
293         return (0);
294 }
295
296 static uint32_t
297 icl_mbuf_to_crc32c(const struct mbuf *m0)
298 {
299         uint32_t digest = 0xffffffff;
300         const struct mbuf *m;
301
302         for (m = m0; m != NULL; m = m->m_next)
303                 digest = calculate_crc32c(digest,
304                     mtod(m, const void *), m->m_len);
305
306         digest = digest ^ 0xffffffff;
307
308         return (digest);
309 }
310
311 static int
312 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
313 {
314         struct mbuf *m;
315         uint32_t received_digest, valid_digest;
316
317         if (request->ip_conn->ic_header_crc32c == false)
318                 return (0);
319
320         m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
321         if (m == NULL) {
322                 ICL_DEBUG("failed to receive header digest");
323                 return (-1);
324         }
325
326         CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
327         memcpy(&received_digest, mtod(m, void *), ISCSI_HEADER_DIGEST_SIZE);
328         m_freem(m);
329
330         *availablep -= ISCSI_HEADER_DIGEST_SIZE;
331
332         /*
333          * XXX: Handle AHS.
334          */
335         valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
336         if (received_digest != valid_digest) {
337                 ICL_WARN("header digest check failed; got 0x%x, "
338                     "should be 0x%x", received_digest, valid_digest);
339                 return (-1);
340         }
341
342         return (0);
343 }
344
345 /*
346  * Return the number of bytes that should be waiting in the receive socket
347  * before icl_pdu_receive_data_segment() gets called.
348  */
349 static size_t
350 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
351 {
352         size_t len;
353
354         len = icl_pdu_data_segment_length(request);
355         if (len == 0)
356                 return (0);
357
358         /*
359          * Account for the parts of data segment already read from
360          * the socket buffer.
361          */
362         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
363         len -= request->ip_data_len;
364
365         /*
366          * Don't always wait for the full data segment to be delivered
367          * to the socket; this might badly affect performance due to
368          * TCP window scaling.
369          */
370         if (len > partial_receive_len) {
371 #if 0
372                 ICL_DEBUG("need %zd bytes of data, limiting to %zd",
373                     len, partial_receive_len));
374 #endif
375                 len = partial_receive_len;
376
377                 return (len);
378         }
379
380         /*
381          * Account for padding.  Note that due to the way code is written,
382          * the icl_pdu_receive_data_segment() must always receive padding
383          * along with the last part of data segment, because it would be
384          * impossible to tell whether we've already received the full data
385          * segment including padding, or without it.
386          */
387         if ((len % 4) != 0)
388                 len += 4 - (len % 4);
389
390 #if 0
391         ICL_DEBUG("need %zd bytes of data", len));
392 #endif
393
394         return (len);
395 }
396
397 static int
398 icl_pdu_receive_data_segment(struct icl_pdu *request,
399     size_t *availablep, bool *more_neededp)
400 {
401         struct icl_conn *ic;
402         size_t len, padding = 0;
403         struct mbuf *m;
404
405         ic = request->ip_conn;
406
407         *more_neededp = false;
408         ic->ic_receive_len = 0;
409
410         len = icl_pdu_data_segment_length(request);
411         if (len == 0)
412                 return (0);
413
414         if ((len % 4) != 0)
415                 padding = 4 - (len % 4);
416
417         /*
418          * Account for already received parts of data segment.
419          */
420         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
421         len -= request->ip_data_len;
422
423         if (len + padding > *availablep) {
424                 /*
425                  * Not enough data in the socket buffer.  Receive as much
426                  * as we can.  Don't receive padding, since, obviously, it's
427                  * not the end of data segment yet.
428                  */
429 #if 0
430                 ICL_DEBUG("limited from %zd to %zd",
431                     len + padding, *availablep - padding));
432 #endif
433                 len = *availablep - padding;
434                 *more_neededp = true;
435                 padding = 0;
436         }
437
438         /*
439          * Must not try to receive padding without at least one byte
440          * of actual data segment.
441          */
442         if (len > 0) {
443                 m = icl_conn_receive(request->ip_conn, len + padding);
444                 if (m == NULL) {
445                         ICL_DEBUG("failed to receive data segment");
446                         return (-1);
447                 }
448
449                 if (request->ip_data_mbuf == NULL)
450                         request->ip_data_mbuf = m;
451                 else
452                         m_cat(request->ip_data_mbuf, m);
453
454                 request->ip_data_len += len;
455                 *availablep -= len + padding;
456         } else
457                 ICL_DEBUG("len 0");
458
459         if (*more_neededp)
460                 ic->ic_receive_len =
461                     icl_pdu_data_segment_receive_len(request);
462
463         return (0);
464 }
465
466 static int
467 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
468 {
469         struct mbuf *m;
470         uint32_t received_digest, valid_digest;
471
472         if (request->ip_conn->ic_data_crc32c == false)
473                 return (0);
474
475         if (request->ip_data_len == 0)
476                 return (0);
477
478         m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
479         if (m == NULL) {
480                 ICL_DEBUG("failed to receive data digest");
481                 return (-1);
482         }
483
484         CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
485         memcpy(&received_digest, mtod(m, void *), ISCSI_DATA_DIGEST_SIZE);
486         m_freem(m);
487
488         *availablep -= ISCSI_DATA_DIGEST_SIZE;
489
490         /*
491          * Note that ip_data_mbuf also contains padding; since digest
492          * calculation is supposed to include that, we iterate over
493          * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
494          */
495         valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
496         if (received_digest != valid_digest) {
497                 ICL_WARN("data digest check failed; got 0x%x, "
498                     "should be 0x%x", received_digest, valid_digest);
499                 return (-1);
500         }
501
502         return (0);
503 }
504
505 /*
506  * Somewhat contrary to the name, this attempts to receive only one
507  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
508  */
509 static struct icl_pdu *
510 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
511 {
512         struct icl_pdu *request;
513         struct socket *so;
514         size_t len;
515         int error;
516         bool more_needed;
517
518         so = ic->ic_socket;
519
520         if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
521                 KASSERT(ic->ic_receive_pdu == NULL,
522                     ("ic->ic_receive_pdu != NULL"));
523                 request = icl_pdu_new(ic, M_NOWAIT);
524                 if (request == NULL) {
525                         ICL_DEBUG("failed to allocate PDU; "
526                             "dropping connection");
527                         icl_conn_fail(ic);
528                         return (NULL);
529                 }
530                 ic->ic_receive_pdu = request;
531         } else {
532                 KASSERT(ic->ic_receive_pdu != NULL,
533                     ("ic->ic_receive_pdu == NULL"));
534                 request = ic->ic_receive_pdu;
535         }
536
537         if (*availablep < ic->ic_receive_len) {
538 #if 0
539                 ICL_DEBUG("not enough data; need %zd, "
540                     "have %zd", ic->ic_receive_len, *availablep);
541 #endif
542                 return (NULL);
543         }
544
545         switch (ic->ic_receive_state) {
546         case ICL_CONN_STATE_BHS:
547                 //ICL_DEBUG("receiving BHS");
548                 error = icl_pdu_receive_bhs(request, availablep);
549                 if (error != 0) {
550                         ICL_DEBUG("failed to receive BHS; "
551                             "dropping connection");
552                         break;
553                 }
554
555                 /*
556                  * We don't enforce any limit for AHS length;
557                  * its length is stored in 8 bit field.
558                  */
559
560                 len = icl_pdu_data_segment_length(request);
561                 if (len > ic->ic_max_data_segment_length) {
562                         ICL_WARN("received data segment "
563                             "length %zd is larger than negotiated "
564                             "MaxDataSegmentLength %zd; "
565                             "dropping connection",
566                             len, ic->ic_max_data_segment_length);
567                         error = EINVAL;
568                         break;
569                 }
570
571                 ic->ic_receive_state = ICL_CONN_STATE_AHS;
572                 ic->ic_receive_len = icl_pdu_ahs_length(request);
573                 break;
574
575         case ICL_CONN_STATE_AHS:
576                 //ICL_DEBUG("receiving AHS");
577                 error = icl_pdu_receive_ahs(request, availablep);
578                 if (error != 0) {
579                         ICL_DEBUG("failed to receive AHS; "
580                             "dropping connection");
581                         break;
582                 }
583                 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
584                 if (ic->ic_header_crc32c == false)
585                         ic->ic_receive_len = 0;
586                 else
587                         ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
588                 break;
589
590         case ICL_CONN_STATE_HEADER_DIGEST:
591                 //ICL_DEBUG("receiving header digest");
592                 error = icl_pdu_check_header_digest(request, availablep);
593                 if (error != 0) {
594                         ICL_DEBUG("header digest failed; "
595                             "dropping connection");
596                         break;
597                 }
598
599                 ic->ic_receive_state = ICL_CONN_STATE_DATA;
600                 ic->ic_receive_len =
601                     icl_pdu_data_segment_receive_len(request);
602                 break;
603
604         case ICL_CONN_STATE_DATA:
605                 //ICL_DEBUG("receiving data segment");
606                 error = icl_pdu_receive_data_segment(request, availablep,
607                     &more_needed);
608                 if (error != 0) {
609                         ICL_DEBUG("failed to receive data segment;"
610                             "dropping connection");
611                         break;
612                 }
613
614                 if (more_needed)
615                         break;
616
617                 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
618                 if (ic->ic_data_crc32c == false)
619                         ic->ic_receive_len = 0;
620                 else
621                         ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
622                 break;
623
624         case ICL_CONN_STATE_DATA_DIGEST:
625                 //ICL_DEBUG("receiving data digest");
626                 error = icl_pdu_check_data_digest(request, availablep);
627                 if (error != 0) {
628                         ICL_DEBUG("data digest failed; "
629                             "dropping connection");
630                         break;
631                 }
632
633                 /*
634                  * We've received complete PDU; reset the receive state machine
635                  * and return the PDU.
636                  */
637                 ic->ic_receive_state = ICL_CONN_STATE_BHS;
638                 ic->ic_receive_len = sizeof(struct iscsi_bhs);
639                 ic->ic_receive_pdu = NULL;
640                 return (request);
641
642         default:
643                 panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
644         }
645
646         if (error != 0) {
647                 icl_pdu_free(request);
648                 icl_conn_fail(ic);
649         }
650
651         return (NULL);
652 }
653
654 static void
655 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
656 {
657         struct icl_pdu *response;
658         struct socket *so;
659
660         so = ic->ic_socket;
661
662         /*
663          * This can never happen; we're careful to only mess with ic->ic_socket
664          * pointer when the send/receive threads are not running.
665          */
666         KASSERT(so != NULL, ("NULL socket"));
667
668         for (;;) {
669                 if (ic->ic_disconnecting)
670                         return;
671
672                 if (so->so_error != 0) {
673                         ICL_DEBUG("connection error %d; "
674                             "dropping connection", so->so_error);
675                         icl_conn_fail(ic);
676                         return;
677                 }
678
679                 /*
680                  * Loop until we have a complete PDU or there is not enough
681                  * data in the socket buffer.
682                  */
683                 if (available < ic->ic_receive_len) {
684 #if 0
685                         ICL_DEBUG("not enough data; have %zd, "
686                             "need %zd", available,
687                             ic->ic_receive_len);
688 #endif
689                         return;
690                 }
691
692                 response = icl_conn_receive_pdu(ic, &available);
693                 if (response == NULL)
694                         continue;
695
696                 if (response->ip_ahs_len > 0) {
697                         ICL_WARN("received PDU with unsupported "
698                             "AHS; opcode 0x%x; dropping connection",
699                             response->ip_bhs->bhs_opcode);
700                         icl_pdu_free(response);
701                         icl_conn_fail(ic);
702                         return;
703                 }
704
705                 (ic->ic_receive)(response);
706         }
707 }
708
709 static void
710 icl_receive_thread(void *arg)
711 {
712         struct icl_conn *ic;
713         size_t available;
714         struct socket *so;
715
716         ic = arg;
717         so = ic->ic_socket;
718
719         ICL_CONN_LOCK(ic);
720         ic->ic_receive_running = true;
721         ICL_CONN_UNLOCK(ic);
722
723         for (;;) {
724                 if (ic->ic_disconnecting) {
725                         //ICL_DEBUG("terminating");
726                         break;
727                 }
728
729                 SOCKBUF_LOCK(&so->so_rcv);
730                 available = so->so_rcv.sb_cc;
731                 if (available < ic->ic_receive_len) {
732                         so->so_rcv.sb_lowat = ic->ic_receive_len;
733                         cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
734                 }
735                 SOCKBUF_UNLOCK(&so->so_rcv);
736
737                 icl_conn_receive_pdus(ic, available);
738         }
739
740         ICL_CONN_LOCK(ic);
741         ic->ic_receive_running = false;
742         ICL_CONN_UNLOCK(ic);
743         kthread_exit();
744 }
745
746 static int
747 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
748 {
749         struct icl_conn *ic;
750
751         ic = arg;
752         cv_signal(&ic->ic_receive_cv);
753         return (SU_OK);
754 }
755
756 static int
757 icl_pdu_send(struct icl_pdu *request)
758 {
759         size_t padding, pdu_len;
760         uint32_t digest, zero = 0;
761         int error, ok;
762         struct socket *so;
763         struct icl_conn *ic;
764
765         ic = request->ip_conn;
766         so = request->ip_conn->ic_socket;
767
768         ICL_CONN_LOCK_ASSERT(ic);
769
770         icl_pdu_set_data_segment_length(request, request->ip_data_len);
771
772         pdu_len = icl_pdu_size(request);
773
774         if (ic->ic_header_crc32c) {
775                 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
776                 ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
777                     (void *)&digest);
778                 if (ok != 1) {
779                         ICL_WARN("failed to append header digest");
780                         return (1);
781                 }
782         }
783
784         if (request->ip_data_len != 0) {
785                 padding = icl_pdu_padding(request);
786                 if (padding > 0) {
787                         ok = m_append(request->ip_data_mbuf, padding,
788                             (void *)&zero);
789                         if (ok != 1) {
790                                 ICL_WARN("failed to append padding");
791                                 return (1);
792                         }
793                 }
794
795                 if (ic->ic_data_crc32c) {
796                         digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
797
798                         ok = m_append(request->ip_data_mbuf, sizeof(digest),
799                             (void *)&digest);
800                         if (ok != 1) {
801                                 ICL_WARN("failed to append header digest");
802                                 return (1);
803                         }
804                 }
805
806                 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
807                 request->ip_data_mbuf = NULL;
808         }
809
810         request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
811
812         error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
813             NULL, MSG_DONTWAIT, curthread);
814         request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
815         if (error != 0) {
816                 ICL_DEBUG("sosend error %d", error);
817                 return (error);
818         }
819
820         return (0);
821 }
822
823 static void
824 icl_conn_send_pdus(struct icl_conn *ic)
825 {
826         struct icl_pdu *request;
827         struct socket *so;
828         size_t available, size;
829         int error;
830
831         ICL_CONN_LOCK_ASSERT(ic);
832
833         so = ic->ic_socket;
834
835         SOCKBUF_LOCK(&so->so_snd);
836         available = sbspace(&so->so_snd);
837         SOCKBUF_UNLOCK(&so->so_snd);
838
839         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
840                 if (ic->ic_disconnecting)
841                         return;
842
843                 request = TAILQ_FIRST(&ic->ic_to_send);
844                 size = icl_pdu_size(request);
845                 if (available < size) {
846                         /*
847                          * Set the low watermark on the socket,
848                          * to avoid waking up until there is enough
849                          * space.
850                          */
851                         SOCKBUF_LOCK(&so->so_snd);
852                         so->so_snd.sb_lowat = size;
853                         SOCKBUF_UNLOCK(&so->so_snd);
854 #if 1
855                         ICL_DEBUG("no space to send; "
856                             "have %zd, need %zd",
857                             available, size);
858 #endif
859                         return;
860                 }
861                 available -= size;
862                 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
863                 error = icl_pdu_send(request);
864                 if (error != 0) {
865                         ICL_DEBUG("failed to send PDU; "
866                             "dropping connection");
867                         icl_conn_fail(ic);
868                         return;
869                 } 
870                 icl_pdu_free(request);
871         }
872 }
873
874 static void
875 icl_send_thread(void *arg)
876 {
877         struct icl_conn *ic;
878
879         ic = arg;
880
881         ICL_CONN_LOCK(ic);
882         ic->ic_send_running = true;
883
884         for (;;) {
885                 if (ic->ic_disconnecting) {
886                         //ICL_DEBUG("terminating");
887                         break;
888                 }
889                 icl_conn_send_pdus(ic);
890                 cv_wait(&ic->ic_send_cv, &ic->ic_lock);
891         }
892
893         ic->ic_send_running = false;
894         ICL_CONN_UNLOCK(ic);
895         kthread_exit();
896 }
897
898 static int
899 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
900 {
901         struct icl_conn *ic;
902
903         ic = arg;
904         cv_signal(&ic->ic_send_cv);
905         return (SU_OK);
906 }
907
908 int
909 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
910 {
911         struct mbuf *mb, *newmb;
912         size_t copylen, off = 0;
913
914         KASSERT(len > 0, ("len == 0"));
915
916         newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
917         if (newmb == NULL) {
918                 ICL_WARN("failed to allocate mbuf for %zd bytes", len);
919                 return (ENOMEM);
920         }
921
922         for (mb = newmb; mb != NULL; mb = mb->m_next) {
923                 copylen = min(M_TRAILINGSPACE(mb), len - off);
924                 memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
925                 mb->m_len = copylen;
926                 off += copylen;
927         }
928         KASSERT(off == len, ("%s: off != len", __func__));
929
930         if (request->ip_data_mbuf == NULL) {
931                 request->ip_data_mbuf = newmb;
932                 request->ip_data_len = len;
933         } else {
934                 m_cat(request->ip_data_mbuf, newmb);
935                 request->ip_data_len += len;
936         }
937
938         return (0);
939 }
940
941 void
942 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
943 {
944
945         m_copydata(ip->ip_data_mbuf, off, len, addr);
946 }
947
948 void
949 icl_pdu_queue(struct icl_pdu *ip)
950 {
951         struct icl_conn *ic;
952
953         ic = ip->ip_conn;
954
955         ICL_CONN_LOCK(ic);
956         if (ic->ic_disconnecting || ic->ic_socket == NULL) {
957                 ICL_DEBUG("icl_pdu_queue on closed connection");
958                 ICL_CONN_UNLOCK(ic);
959                 icl_pdu_free(ip);
960                 return;
961         }
962         TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
963         ICL_CONN_UNLOCK(ic);
964         cv_signal(&ic->ic_send_cv);
965 }
966
967 struct icl_conn *
968 icl_conn_new(void)
969 {
970         struct icl_conn *ic;
971
972         refcount_acquire(&icl_ncons);
973
974         ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
975
976         TAILQ_INIT(&ic->ic_to_send);
977         mtx_init(&ic->ic_lock, "icl_lock", NULL, MTX_DEF);
978         cv_init(&ic->ic_send_cv, "icl_tx");
979         cv_init(&ic->ic_receive_cv, "icl_rx");
980         refcount_init(&ic->ic_outstanding_pdus, 0);
981         ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
982
983         return (ic);
984 }
985
986 void
987 icl_conn_free(struct icl_conn *ic)
988 {
989
990         mtx_destroy(&ic->ic_lock);
991         cv_destroy(&ic->ic_send_cv);
992         cv_destroy(&ic->ic_receive_cv);
993         uma_zfree(icl_conn_zone, ic);
994         refcount_release(&icl_ncons);
995 }
996
997 static int
998 icl_conn_start(struct icl_conn *ic)
999 {
1000         size_t bufsize;
1001         struct sockopt opt;
1002         int error, one = 1;
1003
1004         ICL_CONN_LOCK(ic);
1005
1006         /*
1007          * XXX: Ugly hack.
1008          */
1009         if (ic->ic_socket == NULL) {
1010                 ICL_CONN_UNLOCK(ic);
1011                 return (EINVAL);
1012         }
1013
1014         ic->ic_receive_state = ICL_CONN_STATE_BHS;
1015         ic->ic_receive_len = sizeof(struct iscsi_bhs);
1016         ic->ic_disconnecting = false;
1017
1018         ICL_CONN_UNLOCK(ic);
1019
1020         /*
1021          * Use max available sockbuf size for sending.  Do it manually
1022          * instead of sbreserve(9) to work around resource limits.
1023          *
1024          * XXX: This kind of sucks.  On one hand, we don't currently support
1025          *      sending a part of data segment; we always do it in one piece,
1026          *      so we have to make sure it can fit in the socket buffer.
1027          *      Once I've implemented partial send, we'll get rid of this
1028          *      and use autoscaling.
1029          */
1030         bufsize = (sizeof(struct iscsi_bhs) +
1031             ic->ic_max_data_segment_length) * 8;
1032         error = soreserve(ic->ic_socket, bufsize, bufsize);
1033         if (error != 0) {
1034                 ICL_WARN("soreserve failed with error %d", error);
1035                 icl_conn_close(ic);
1036                 return (error);
1037         }
1038
1039         /*
1040          * Disable Nagle.
1041          */
1042         bzero(&opt, sizeof(opt));
1043         opt.sopt_dir = SOPT_SET;
1044         opt.sopt_level = IPPROTO_TCP;
1045         opt.sopt_name = TCP_NODELAY;
1046         opt.sopt_val = &one;
1047         opt.sopt_valsize = sizeof(one);
1048         error = sosetopt(ic->ic_socket, &opt);
1049         if (error != 0) {
1050                 ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1051                 icl_conn_close(ic);
1052                 return (error);
1053         }
1054
1055         /*
1056          * Start threads.
1057          */
1058         error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1059         if (error != 0) {
1060                 ICL_WARN("kthread_add(9) failed with error %d", error);
1061                 icl_conn_close(ic);
1062                 return (error);
1063         }
1064
1065         error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1066         if (error != 0) {
1067                 ICL_WARN("kthread_add(9) failed with error %d", error);
1068                 icl_conn_close(ic);
1069                 return (error);
1070         }
1071
1072         /*
1073          * Register socket upcall, to get notified about incoming PDUs
1074          * and free space to send outgoing ones.
1075          */
1076         SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1077         soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1078         SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1079         SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1080         soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1081         SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1082
1083         return (0);
1084 }
1085
1086 int
1087 icl_conn_handoff(struct icl_conn *ic, int fd)
1088 {
1089         struct file *fp;
1090         struct socket *so;
1091         cap_rights_t rights;
1092         int error;
1093
1094         /*
1095          * Steal the socket from userland.
1096          */
1097         error = fget(curthread, fd,
1098             cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1099         if (error != 0)
1100                 return (error);
1101         if (fp->f_type != DTYPE_SOCKET) {
1102                 fdrop(fp, curthread);
1103                 return (EINVAL);
1104         }
1105         so = fp->f_data;
1106         if (so->so_type != SOCK_STREAM) {
1107                 fdrop(fp, curthread);
1108                 return (EINVAL);
1109         }
1110
1111         ICL_CONN_LOCK(ic);
1112
1113         if (ic->ic_socket != NULL) {
1114                 ICL_CONN_UNLOCK(ic);
1115                 fdrop(fp, curthread);
1116                 return (EBUSY);
1117         }
1118
1119         ic->ic_socket = fp->f_data;
1120         fp->f_ops = &badfileops;
1121         fp->f_data = NULL;
1122         fdrop(fp, curthread);
1123         ICL_CONN_UNLOCK(ic);
1124
1125         error = icl_conn_start(ic);
1126
1127         return (error);
1128 }
1129
1130 void
1131 icl_conn_shutdown(struct icl_conn *ic)
1132 {
1133
1134         ICL_CONN_LOCK(ic);
1135         if (ic->ic_socket == NULL) {
1136                 ICL_CONN_UNLOCK(ic);
1137                 return;
1138         }
1139         ICL_CONN_UNLOCK(ic);
1140
1141         soshutdown(ic->ic_socket, SHUT_RDWR);
1142 }
1143
1144 void
1145 icl_conn_close(struct icl_conn *ic)
1146 {
1147         struct icl_pdu *pdu;
1148
1149         ICL_CONN_LOCK(ic);
1150         if (ic->ic_socket == NULL) {
1151                 ICL_CONN_UNLOCK(ic);
1152                 return;
1153         }
1154
1155         ic->ic_disconnecting = true;
1156
1157         /*
1158          * Wake up the threads, so they can properly terminate.
1159          */
1160         cv_signal(&ic->ic_receive_cv);
1161         cv_signal(&ic->ic_send_cv);
1162         while (ic->ic_receive_running || ic->ic_send_running) {
1163                 //ICL_DEBUG("waiting for send/receive threads to terminate");
1164                 ICL_CONN_UNLOCK(ic);
1165                 cv_signal(&ic->ic_receive_cv);
1166                 cv_signal(&ic->ic_send_cv);
1167                 pause("icl_close", 1 * hz);
1168                 ICL_CONN_LOCK(ic);
1169         }
1170         //ICL_DEBUG("send/receive threads terminated");
1171
1172         soclose(ic->ic_socket);
1173         ic->ic_socket = NULL;
1174
1175         if (ic->ic_receive_pdu != NULL) {
1176                 //ICL_DEBUG("freeing partially received PDU");
1177                 icl_pdu_free(ic->ic_receive_pdu);
1178                 ic->ic_receive_pdu = NULL;
1179         }
1180
1181         /*
1182          * Remove any outstanding PDUs from the send queue.
1183          */
1184         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1185                 pdu = TAILQ_FIRST(&ic->ic_to_send);
1186                 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1187                 icl_pdu_free(pdu);
1188         }
1189
1190         KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1191             ("destroying session with non-empty send queue"));
1192         /*
1193          * XXX
1194          */
1195 #if 0
1196         KASSERT(ic->ic_outstanding_pdus == 0,
1197             ("destroying session with %d outstanding PDUs",
1198              ic->ic_outstanding_pdus));
1199 #endif
1200         ICL_CONN_UNLOCK(ic);
1201 }
1202
1203 bool
1204 icl_conn_connected(struct icl_conn *ic)
1205 {
1206
1207         ICL_CONN_LOCK(ic);
1208         if (ic->ic_socket == NULL) {
1209                 ICL_CONN_UNLOCK(ic);
1210                 return (false);
1211         }
1212         if (ic->ic_socket->so_error != 0) {
1213                 ICL_CONN_UNLOCK(ic);
1214                 return (false);
1215         }
1216         ICL_CONN_UNLOCK(ic);
1217         return (true);
1218 }
1219
1220 #ifdef ICL_KERNEL_PROXY
1221 int
1222 icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
1223 {
1224         int error;
1225
1226         if (so->so_type != SOCK_STREAM)
1227                 return (EINVAL);
1228
1229         ICL_CONN_LOCK(ic);
1230         if (ic->ic_socket != NULL) {
1231                 ICL_CONN_UNLOCK(ic);
1232                 return (EBUSY);
1233         }
1234         ic->ic_socket = so;
1235         ICL_CONN_UNLOCK(ic);
1236
1237         error = icl_conn_start(ic);
1238
1239         return (error);
1240 }
1241 #endif /* ICL_KERNEL_PROXY */
1242
1243 static int
1244 icl_unload(void)
1245 {
1246
1247         if (icl_ncons != 0)
1248                 return (EBUSY);
1249
1250         uma_zdestroy(icl_conn_zone);
1251         uma_zdestroy(icl_pdu_zone);
1252
1253         return (0);
1254 }
1255
1256 static void
1257 icl_load(void)
1258 {
1259
1260         icl_conn_zone = uma_zcreate("icl_conn",
1261             sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1262             UMA_ALIGN_PTR, 0);
1263         icl_pdu_zone = uma_zcreate("icl_pdu",
1264             sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1265             UMA_ALIGN_PTR, 0);
1266
1267         refcount_init(&icl_ncons, 0);
1268 }
1269
1270 static int
1271 icl_modevent(module_t mod, int what, void *arg)
1272 {
1273
1274         switch (what) {
1275         case MOD_LOAD:
1276                 icl_load();
1277                 return (0);
1278         case MOD_UNLOAD:
1279                 return (icl_unload());
1280         default:
1281                 return (EINVAL);
1282         }
1283 }
1284
1285 moduledata_t icl_data = {
1286         "icl",
1287         icl_modevent,
1288         0
1289 };
1290
1291 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1292 MODULE_VERSION(icl, 1);