]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/iscsi/icl.c
MFV r254750:
[FreeBSD/FreeBSD.git] / sys / dev / iscsi / icl.c
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 /*
33  * iSCSI Common Layer.  It's used by both the initiator and target to send
34  * and receive iSCSI PDUs.
35  */
36
37 #include <sys/param.h>
38 #include <sys/capability.h>
39 #include <sys/condvar.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/mbuf.h>
46 #include <sys/mutex.h>
47 #include <sys/module.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/sx.h>
53 #include <sys/uio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57
58 #include "icl.h"
59 #include "iscsi_proto.h"
60
61 SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62 static int debug = 1;
63 TUNABLE_INT("kern.icl.debug", &debug);
64 SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65     &debug, 1, "Enable debug messages");
66 static int partial_receive_len = 1 * 1024; /* XXX: More? */
67 TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69     &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70     "data segment");
71
72 static uma_zone_t icl_conn_zone;
73 static uma_zone_t icl_pdu_zone;
74
75 static volatile u_int   icl_ncons;
76
/*
 * Print a debug message, prefixed with the name of the calling function,
 * when the debug level is above 1.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; this keeps it safe as the sole body of an
 * unbraced if/else.
 */
#define	ICL_DEBUG(X, ...)					\
	do {							\
		if (debug > 1) {				\
			printf("%s: " X "\n",			\
			    __func__, ## __VA_ARGS__);		\
		}						\
	} while (0)
81
/*
 * Print a warning, prefixed with "WARNING: " and the name of the calling
 * function, when the debug level is above 0.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; this keeps it safe as the sole body of an
 * unbraced if/else.
 */
#define	ICL_WARN(X, ...)					\
	do {							\
		if (debug > 0) {				\
			printf("WARNING: %s: " X "\n",		\
			    __func__, ## __VA_ARGS__);		\
		}						\
	} while (0)
87
/*
 * Helpers for the per-connection mutex (ic_lock).  The argument is
 * parenthesized so that any pointer-valued expression, not just a plain
 * identifier, can be passed.
 */
#define	ICL_CONN_LOCK(X)		mtx_lock(&(X)->ic_lock)
#define	ICL_CONN_UNLOCK(X)		mtx_unlock(&(X)->ic_lock)
#define	ICL_CONN_LOCK_ASSERT(X)		mtx_assert(&(X)->ic_lock, MA_OWNED)
91
92 static void
93 icl_conn_fail(struct icl_conn *ic)
94 {
95         if (ic->ic_socket == NULL)
96                 return;
97
98         /*
99          * XXX
100          */
101         ic->ic_socket->so_error = EDOOFUS;
102         (ic->ic_error)(ic);
103 }
104
105 static struct mbuf *
106 icl_conn_receive(struct icl_conn *ic, size_t len)
107 {
108         struct uio uio;
109         struct socket *so;
110         struct mbuf *m;
111         int error, flags;
112
113         so = ic->ic_socket;
114
115         memset(&uio, 0, sizeof(uio));
116         uio.uio_resid = len;
117
118         flags = MSG_DONTWAIT;
119         error = soreceive(so, NULL, &uio, &m, NULL, &flags);
120         if (error != 0) {
121                 ICL_DEBUG("soreceive error %d", error);
122                 return (NULL);
123         }
124         if (uio.uio_resid != 0) {
125                 m_freem(m);
126                 ICL_DEBUG("short read");
127                 return (NULL);
128         }
129
130         return (m);
131 }
132
133 static struct icl_pdu *
134 icl_pdu_new(struct icl_conn *ic, int flags)
135 {
136         struct icl_pdu *ip;
137
138         refcount_acquire(&ic->ic_outstanding_pdus);
139         ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
140         if (ip == NULL) {
141                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
142                 refcount_release(&ic->ic_outstanding_pdus);
143                 return (NULL);
144         }
145
146         ip->ip_conn = ic;
147
148         return (ip);
149 }
150
151 void
152 icl_pdu_free(struct icl_pdu *ip)
153 {
154         struct icl_conn *ic;
155
156         ic = ip->ip_conn;
157
158         m_freem(ip->ip_bhs_mbuf);
159         m_freem(ip->ip_ahs_mbuf);
160         m_freem(ip->ip_data_mbuf);
161         uma_zfree(icl_pdu_zone, ip);
162         refcount_release(&ic->ic_outstanding_pdus);
163 }
164
165 /*
166  * Allocate icl_pdu with empty BHS to fill up by the caller.
167  */
168 struct icl_pdu *
169 icl_pdu_new_bhs(struct icl_conn *ic, int flags)
170 {
171         struct icl_pdu *ip;
172
173         ip = icl_pdu_new(ic, flags);
174         if (ip == NULL)
175                 return (NULL);
176
177         ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
178             flags, MT_DATA, M_PKTHDR);
179         if (ip->ip_bhs_mbuf == NULL) {
180                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
181                 icl_pdu_free(ip);
182                 return (NULL);
183         }
184         ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
185         memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
186         ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
187
188         return (ip);
189 }
190
191 static int
192 icl_pdu_ahs_length(const struct icl_pdu *request)
193 {
194
195         return (request->ip_bhs->bhs_total_ahs_len * 4);
196 }
197
198 size_t
199 icl_pdu_data_segment_length(const struct icl_pdu *request)
200 {
201         uint32_t len = 0;
202
203         len += request->ip_bhs->bhs_data_segment_len[0];
204         len <<= 8;
205         len += request->ip_bhs->bhs_data_segment_len[1];
206         len <<= 8;
207         len += request->ip_bhs->bhs_data_segment_len[2];
208
209         return (len);
210 }
211
212 static void
213 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
214 {
215
216         response->ip_bhs->bhs_data_segment_len[2] = len;
217         response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
218         response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
219 }
220
221 static size_t
222 icl_pdu_padding(const struct icl_pdu *ip)
223 {
224
225         if ((ip->ip_data_len % 4) != 0)
226                 return (4 - (ip->ip_data_len % 4));
227
228         return (0);
229 }
230
231 static size_t
232 icl_pdu_size(const struct icl_pdu *response)
233 {
234         size_t len;
235
236         KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
237
238         len = sizeof(struct iscsi_bhs) + response->ip_data_len +
239             icl_pdu_padding(response);
240         if (response->ip_conn->ic_header_crc32c)
241                 len += ISCSI_HEADER_DIGEST_SIZE;
242         if (response->ip_conn->ic_data_crc32c)
243                 len += ISCSI_DATA_DIGEST_SIZE;
244
245         return (len);
246 }
247
248 static int
249 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
250 {
251         struct mbuf *m;
252
253         m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
254         if (m == NULL) {
255                 ICL_DEBUG("failed to receive BHS");
256                 return (-1);
257         }
258
259         request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
260         if (request->ip_bhs_mbuf == NULL) {
261                 ICL_WARN("m_pullup failed");
262                 return (-1);
263         }
264         request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
265
266         /*
267          * XXX: For architectures with strict alignment requirements
268          *      we may need to allocate ip_bhs and copy the data into it.
269          *      For some reason, though, not doing this doesn't seem
270          *      to cause problems; tested on sparc64.
271          */
272
273         *availablep -= sizeof(struct iscsi_bhs);
274         return (0);
275 }
276
277 static int
278 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
279 {
280
281         request->ip_ahs_len = icl_pdu_ahs_length(request);
282         if (request->ip_ahs_len == 0)
283                 return (0);
284
285         request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
286             request->ip_ahs_len);
287         if (request->ip_ahs_mbuf == NULL) {
288                 ICL_DEBUG("failed to receive AHS");
289                 return (-1);
290         }
291
292         *availablep -= request->ip_ahs_len;
293         return (0);
294 }
295
296 static uint32_t
297 icl_mbuf_to_crc32c(const struct mbuf *m0)
298 {
299         uint32_t digest = 0xffffffff;
300         const struct mbuf *m;
301
302         for (m = m0; m != NULL; m = m->m_next)
303                 digest = calculate_crc32c(digest,
304                     mtod(m, const void *), m->m_len);
305
306         digest = digest ^ 0xffffffff;
307
308         return (digest);
309 }
310
311 static int
312 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
313 {
314         struct mbuf *m;
315         uint32_t received_digest, valid_digest;
316
317         if (request->ip_conn->ic_header_crc32c == false)
318                 return (0);
319
320         m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
321         if (m == NULL) {
322                 ICL_DEBUG("failed to receive header digest");
323                 return (-1);
324         }
325
326         CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
327         memcpy(&received_digest, mtod(m, void *), ISCSI_HEADER_DIGEST_SIZE);
328         m_freem(m);
329
330         *availablep -= ISCSI_HEADER_DIGEST_SIZE;
331
332         /*
333          * XXX: Handle AHS.
334          */
335         valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
336         if (received_digest != valid_digest) {
337                 ICL_WARN("header digest check failed; got 0x%x, "
338                     "should be 0x%x", received_digest, valid_digest);
339                 return (-1);
340         }
341
342         return (0);
343 }
344
345 /*
346  * Return the number of bytes that should be waiting in the receive socket
347  * before icl_pdu_receive_data_segment() gets called.
348  */
349 static size_t
350 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
351 {
352         size_t len;
353
354         len = icl_pdu_data_segment_length(request);
355         if (len == 0)
356                 return (0);
357
358         /*
359          * Account for the parts of data segment already read from
360          * the socket buffer.
361          */
362         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
363         len -= request->ip_data_len;
364
365         /*
366          * Don't always wait for the full data segment to be delivered
367          * to the socket; this might badly affect performance due to
368          * TCP window scaling.
369          */
370         if (len > partial_receive_len) {
371 #if 0
372                 ICL_DEBUG("need %zd bytes of data, limiting to %zd",
373                     len, partial_receive_len));
374 #endif
375                 len = partial_receive_len;
376
377                 return (len);
378         }
379
380         /*
381          * Account for padding.  Note that due to the way code is written,
382          * the icl_pdu_receive_data_segment() must always receive padding
383          * along with the last part of data segment, because it would be
384          * impossible to tell whether we've already received the full data
385          * segment including padding, or without it.
386          */
387         if ((len % 4) != 0)
388                 len += 4 - (len % 4);
389
390 #if 0
391         ICL_DEBUG("need %zd bytes of data", len));
392 #endif
393
394         return (len);
395 }
396
397 static int
398 icl_pdu_receive_data_segment(struct icl_pdu *request,
399     size_t *availablep, bool *more_neededp)
400 {
401         struct icl_conn *ic;
402         size_t len, padding = 0;
403         struct mbuf *m;
404
405         ic = request->ip_conn;
406
407         *more_neededp = false;
408         ic->ic_receive_len = 0;
409
410         len = icl_pdu_data_segment_length(request);
411         if (len == 0)
412                 return (0);
413
414         if ((len % 4) != 0)
415                 padding = 4 - (len % 4);
416
417         /*
418          * Account for already received parts of data segment.
419          */
420         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
421         len -= request->ip_data_len;
422
423         if (len + padding > *availablep) {
424                 /*
425                  * Not enough data in the socket buffer.  Receive as much
426                  * as we can.  Don't receive padding, since, obviously, it's
427                  * not the end of data segment yet.
428                  */
429 #if 0
430                 ICL_DEBUG("limited from %zd to %zd",
431                     len + padding, *availablep - padding));
432 #endif
433                 len = *availablep - padding;
434                 *more_neededp = true;
435                 padding = 0;
436         }
437
438         /*
439          * Must not try to receive padding without at least one byte
440          * of actual data segment.
441          */
442         if (len > 0) {
443                 m = icl_conn_receive(request->ip_conn, len + padding);
444                 if (m == NULL) {
445                         ICL_DEBUG("failed to receive data segment");
446                         return (-1);
447                 }
448
449                 if (request->ip_data_mbuf == NULL)
450                         request->ip_data_mbuf = m;
451                 else
452                         m_cat(request->ip_data_mbuf, m);
453
454                 request->ip_data_len += len;
455                 *availablep -= len + padding;
456         } else
457                 ICL_DEBUG("len 0");
458
459         if (*more_neededp)
460                 ic->ic_receive_len =
461                     icl_pdu_data_segment_receive_len(request);
462
463         return (0);
464 }
465
466 static int
467 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
468 {
469         struct mbuf *m;
470         uint32_t received_digest, valid_digest;
471
472         if (request->ip_conn->ic_data_crc32c == false)
473                 return (0);
474
475         if (request->ip_data_len == 0)
476                 return (0);
477
478         m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
479         if (m == NULL) {
480                 ICL_DEBUG("failed to receive data digest");
481                 return (-1);
482         }
483
484         CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
485         memcpy(&received_digest, mtod(m, void *), ISCSI_DATA_DIGEST_SIZE);
486         m_freem(m);
487
488         *availablep -= ISCSI_DATA_DIGEST_SIZE;
489
490         /*
491          * Note that ip_data_mbuf also contains padding; since digest
492          * calculation is supposed to include that, we iterate over
493          * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
494          */
495         valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
496         if (received_digest != valid_digest) {
497                 ICL_WARN("data digest check failed; got 0x%x, "
498                     "should be 0x%x", received_digest, valid_digest);
499                 return (-1);
500         }
501
502         return (0);
503 }
504
505 /*
506  * Somewhat contrary to the name, this attempts to receive only one
507  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
508  */
509 static struct icl_pdu *
510 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
511 {
512         struct icl_pdu *request;
513         struct socket *so;
514         size_t len;
515         int error;
516         bool more_needed;
517
518         so = ic->ic_socket;
519
520         if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
521                 KASSERT(ic->ic_receive_pdu == NULL,
522                     ("ic->ic_receive_pdu != NULL"));
523                 request = icl_pdu_new(ic, M_NOWAIT);
524                 if (request == NULL) {
525                         ICL_DEBUG("failed to allocate PDU; "
526                             "dropping connection");
527                         icl_conn_fail(ic);
528                         return (NULL);
529                 }
530                 ic->ic_receive_pdu = request;
531         } else {
532                 KASSERT(ic->ic_receive_pdu != NULL,
533                     ("ic->ic_receive_pdu == NULL"));
534                 request = ic->ic_receive_pdu;
535         }
536
537         if (*availablep < ic->ic_receive_len) {
538 #if 0
539                 ICL_DEBUG("not enough data; need %zd, "
540                     "have %zd", ic->ic_receive_len, *availablep);
541 #endif
542                 return (NULL);
543         }
544
545         switch (ic->ic_receive_state) {
546         case ICL_CONN_STATE_BHS:
547                 //ICL_DEBUG("receiving BHS");
548                 error = icl_pdu_receive_bhs(request, availablep);
549                 if (error != 0) {
550                         ICL_DEBUG("failed to receive BHS; "
551                             "dropping connection");
552                         break;
553                 }
554
555                 /*
556                  * We don't enforce any limit for AHS length;
557                  * its length is stored in 8 bit field.
558                  */
559
560                 len = icl_pdu_data_segment_length(request);
561                 if (len > ic->ic_max_data_segment_length) {
562                         ICL_WARN("received data segment "
563                             "length %zd is larger than negotiated "
564                             "MaxDataSegmentLength %zd; "
565                             "dropping connection",
566                             len, ic->ic_max_data_segment_length);
567                         break;
568                 }
569
570                 ic->ic_receive_state = ICL_CONN_STATE_AHS;
571                 ic->ic_receive_len = icl_pdu_ahs_length(request);
572                 break;
573
574         case ICL_CONN_STATE_AHS:
575                 //ICL_DEBUG("receiving AHS");
576                 error = icl_pdu_receive_ahs(request, availablep);
577                 if (error != 0) {
578                         ICL_DEBUG("failed to receive AHS; "
579                             "dropping connection");
580                         break;
581                 }
582                 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
583                 if (ic->ic_header_crc32c == false)
584                         ic->ic_receive_len = 0;
585                 else
586                         ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
587                 break;
588
589         case ICL_CONN_STATE_HEADER_DIGEST:
590                 //ICL_DEBUG("receiving header digest");
591                 error = icl_pdu_check_header_digest(request, availablep);
592                 if (error != 0) {
593                         ICL_DEBUG("header digest failed; "
594                             "dropping connection");
595                         break;
596                 }
597
598                 ic->ic_receive_state = ICL_CONN_STATE_DATA;
599                 ic->ic_receive_len =
600                     icl_pdu_data_segment_receive_len(request);
601                 break;
602
603         case ICL_CONN_STATE_DATA:
604                 //ICL_DEBUG("receiving data segment");
605                 error = icl_pdu_receive_data_segment(request, availablep,
606                     &more_needed);
607                 if (error != 0) {
608                         ICL_DEBUG("failed to receive data segment;"
609                             "dropping connection");
610                         break;
611                 }
612
613                 if (more_needed)
614                         break;
615
616                 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
617                 if (ic->ic_data_crc32c == false)
618                         ic->ic_receive_len = 0;
619                 else
620                         ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
621                 break;
622
623         case ICL_CONN_STATE_DATA_DIGEST:
624                 //ICL_DEBUG("receiving data digest");
625                 error = icl_pdu_check_data_digest(request, availablep);
626                 if (error != 0) {
627                         ICL_DEBUG("data digest failed; "
628                             "dropping connection");
629                         break;
630                 }
631
632                 /*
633                  * We've received complete PDU; reset the receive state machine
634                  * and return the PDU.
635                  */
636                 ic->ic_receive_state = ICL_CONN_STATE_BHS;
637                 ic->ic_receive_len = sizeof(struct iscsi_bhs);
638                 ic->ic_receive_pdu = NULL;
639                 return (request);
640
641         default:
642                 panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
643         }
644
645         if (error != 0) {
646                 icl_pdu_free(request);
647                 icl_conn_fail(ic);
648         }
649
650         return (NULL);
651 }
652
653 static void
654 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
655 {
656         struct icl_pdu *response;
657         struct socket *so;
658
659         so = ic->ic_socket;
660
661         /*
662          * This can never happen; we're careful to only mess with ic->ic_socket
663          * pointer when the send/receive threads are not running.
664          */
665         KASSERT(so != NULL, ("NULL socket"));
666
667         for (;;) {
668                 if (ic->ic_disconnecting)
669                         return;
670
671                 if (so->so_error != 0) {
672                         ICL_DEBUG("connection error %d; "
673                             "dropping connection", so->so_error);
674                         icl_conn_fail(ic);
675                         return;
676                 }
677
678                 /*
679                  * Loop until we have a complete PDU or there is not enough
680                  * data in the socket buffer.
681                  */
682                 if (available < ic->ic_receive_len) {
683 #if 0
684                         ICL_DEBUG("not enough data; have %zd, "
685                             "need %zd", available,
686                             ic->ic_receive_len);
687 #endif
688                         return;
689                 }
690
691                 response = icl_conn_receive_pdu(ic, &available);
692                 if (response == NULL)
693                         continue;
694
695                 if (response->ip_ahs_len > 0) {
696                         ICL_WARN("received PDU with unsupported "
697                             "AHS; opcode 0x%x; dropping connection",
698                             response->ip_bhs->bhs_opcode);
699                         icl_pdu_free(response);
700                         icl_conn_fail(ic);
701                         return;
702                 }
703
704                 (ic->ic_receive)(response);
705         }
706 }
707
708 static void
709 icl_receive_thread(void *arg)
710 {
711         struct icl_conn *ic;
712         size_t available;
713         struct socket *so;
714
715         ic = arg;
716         so = ic->ic_socket;
717
718         ICL_CONN_LOCK(ic);
719         ic->ic_receive_running = true;
720         ICL_CONN_UNLOCK(ic);
721
722         for (;;) {
723                 if (ic->ic_disconnecting) {
724                         //ICL_DEBUG("terminating");
725                         ICL_CONN_LOCK(ic);
726                         ic->ic_receive_running = false;
727                         ICL_CONN_UNLOCK(ic);
728                         kthread_exit();
729                         return;
730                 }
731
732                 SOCKBUF_LOCK(&so->so_rcv);
733                 available = so->so_rcv.sb_cc;
734                 if (available < ic->ic_receive_len) {
735                         so->so_rcv.sb_lowat = ic->ic_receive_len;
736                         cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
737                 }
738                 SOCKBUF_UNLOCK(&so->so_rcv);
739
740                 icl_conn_receive_pdus(ic, available);
741         }
742 }
743
744 static int
745 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
746 {
747         struct icl_conn *ic;
748
749         ic = arg;
750         cv_signal(&ic->ic_receive_cv);
751         return (SU_OK);
752 }
753
754 static int
755 icl_pdu_send(struct icl_pdu *request)
756 {
757         size_t padding, pdu_len;
758         uint32_t digest, zero = 0;
759         int error, ok;
760         struct socket *so;
761         struct icl_conn *ic;
762
763         ic = request->ip_conn;
764         so = request->ip_conn->ic_socket;
765
766         ICL_CONN_LOCK_ASSERT(ic);
767
768         icl_pdu_set_data_segment_length(request, request->ip_data_len);
769
770         pdu_len = icl_pdu_size(request);
771
772         if (ic->ic_header_crc32c) {
773                 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
774                 ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
775                     (void *)&digest);
776                 if (ok != 1) {
777                         ICL_WARN("failed to append header digest");
778                         return (1);
779                 }
780         }
781
782         if (request->ip_data_len != 0) {
783                 padding = icl_pdu_padding(request);
784                 if (padding > 0) {
785                         ok = m_append(request->ip_data_mbuf, padding,
786                             (void *)&zero);
787                         if (ok != 1) {
788                                 ICL_WARN("failed to append padding");
789                                 return (1);
790                         }
791                 }
792
793                 if (ic->ic_data_crc32c) {
794                         digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
795
796                         ok = m_append(request->ip_data_mbuf, sizeof(digest),
797                             (void *)&digest);
798                         if (ok != 1) {
799                                 ICL_WARN("failed to append header digest");
800                                 return (1);
801                         }
802                 }
803
804                 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
805                 request->ip_data_mbuf = NULL;
806         }
807
808         request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
809
810         error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
811             NULL, MSG_DONTWAIT, curthread);
812         request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
813         if (error != 0) {
814                 ICL_DEBUG("sosend error %d", error);
815                 return (error);
816         }
817
818         return (0);
819 }
820
821 static void
822 icl_conn_send_pdus(struct icl_conn *ic)
823 {
824         struct icl_pdu *request;
825         struct socket *so;
826         size_t available, size;
827         int error;
828
829         ICL_CONN_LOCK_ASSERT(ic);
830
831         so = ic->ic_socket;
832
833         SOCKBUF_LOCK(&so->so_snd);
834         available = sbspace(&so->so_snd);
835         SOCKBUF_UNLOCK(&so->so_snd);
836
837         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
838                 if (ic->ic_disconnecting)
839                         return;
840
841                 request = TAILQ_FIRST(&ic->ic_to_send);
842                 size = icl_pdu_size(request);
843                 if (available < size) {
844                         /*
845                          * Set the low watermark on the socket,
846                          * to avoid waking up until there is enough
847                          * space.
848                          */
849                         SOCKBUF_LOCK(&so->so_snd);
850                         so->so_snd.sb_lowat = size;
851                         SOCKBUF_UNLOCK(&so->so_snd);
852 #if 1
853                         ICL_DEBUG("no space to send; "
854                             "have %zd, need %zd",
855                             available, size);
856 #endif
857                         return;
858                 }
859                 available -= size;
860                 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
861                 error = icl_pdu_send(request);
862                 if (error != 0) {
863                         ICL_DEBUG("failed to send PDU; "
864                             "dropping connection");
865                         icl_conn_fail(ic);
866                         return;
867                 } 
868                 icl_pdu_free(request);
869         }
870 }
871
872 static void
873 icl_send_thread(void *arg)
874 {
875         struct icl_conn *ic;
876
877         ic = arg;
878
879         ICL_CONN_LOCK(ic);
880         ic->ic_send_running = true;
881         ICL_CONN_UNLOCK(ic);
882
883         for (;;) {
884                 ICL_CONN_LOCK(ic);
885                 if (ic->ic_disconnecting) {
886                         //ICL_DEBUG("terminating");
887                         ic->ic_send_running = false;
888                         ICL_CONN_UNLOCK(ic);
889                         kthread_exit();
890                         return;
891                 }
892                 if (TAILQ_EMPTY(&ic->ic_to_send))
893                         cv_wait(&ic->ic_send_cv, &ic->ic_lock);
894                 icl_conn_send_pdus(ic);
895                 ICL_CONN_UNLOCK(ic);
896         }
897 }
898
899 static int
900 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
901 {
902         struct icl_conn *ic;
903
904         ic = arg;
905         cv_signal(&ic->ic_send_cv);
906         return (SU_OK);
907 }
908
909 int
910 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
911 {
912         struct mbuf *mb, *newmb;
913         size_t copylen, off = 0;
914
915         KASSERT(len > 0, ("len == 0"));
916
917         newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
918         if (newmb == NULL) {
919                 ICL_WARN("failed to allocate mbuf for %zd bytes", len);
920                 return (ENOMEM);
921         }
922
923         for (mb = newmb; mb != NULL; mb = mb->m_next) {
924                 copylen = min(M_TRAILINGSPACE(mb), len - off);
925                 memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
926                 mb->m_len = copylen;
927                 off += copylen;
928         }
929         KASSERT(off == len, ("%s: off != len", __func__));
930
931         if (request->ip_data_mbuf == NULL) {
932                 request->ip_data_mbuf = newmb;
933                 request->ip_data_len = len;
934         } else {
935                 m_cat(request->ip_data_mbuf, newmb);
936                 request->ip_data_len += len;
937         }
938
939         return (0);
940 }
941
942 void
943 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
944 {
945
946         m_copydata(ip->ip_data_mbuf, off, len, addr);
947 }
948
949 void
950 icl_pdu_queue(struct icl_pdu *ip)
951 {
952         struct icl_conn *ic;
953
954         ic = ip->ip_conn;
955
956         ICL_CONN_LOCK(ic);
957         if (ic->ic_disconnecting || ic->ic_socket == NULL) {
958                 ICL_DEBUG("icl_pdu_queue on closed connection");
959                 ICL_CONN_UNLOCK(ic);
960                 return;
961         }
962         TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
963         ICL_CONN_UNLOCK(ic);
964         cv_signal(&ic->ic_send_cv);
965 }
966
967 struct icl_conn *
968 icl_conn_new(void)
969 {
970         struct icl_conn *ic;
971
972         refcount_acquire(&icl_ncons);
973
974         ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
975
976         TAILQ_INIT(&ic->ic_to_send);
977         mtx_init(&ic->ic_lock, "icl_lock", NULL, MTX_DEF);
978         cv_init(&ic->ic_send_cv, "icl_tx");
979         cv_init(&ic->ic_receive_cv, "icl_rx");
980         refcount_init(&ic->ic_outstanding_pdus, 0);
981         ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
982
983         return (ic);
984 }
985
986 void
987 icl_conn_free(struct icl_conn *ic)
988 {
989
990         mtx_destroy(&ic->ic_lock);
991         cv_destroy(&ic->ic_send_cv);
992         cv_destroy(&ic->ic_receive_cv);
993         uma_zfree(icl_conn_zone, ic);
994         refcount_release(&icl_ncons);
995 }
996
997 static int
998 icl_conn_start(struct icl_conn *ic)
999 {
1000         size_t bufsize;
1001         struct sockopt opt;
1002         int error, one = 1;
1003
1004         ICL_CONN_LOCK(ic);
1005
1006         /*
1007          * XXX: Ugly hack.
1008          */
1009         if (ic->ic_socket == NULL) {
1010                 ICL_CONN_UNLOCK(ic);
1011                 return (EINVAL);
1012         }
1013
1014         ic->ic_receive_state = ICL_CONN_STATE_BHS;
1015         ic->ic_receive_len = sizeof(struct iscsi_bhs);
1016         ic->ic_disconnecting = false;
1017
1018         ICL_CONN_UNLOCK(ic);
1019
1020         /*
1021          * Use max available sockbuf size for sending.  Do it manually
1022          * instead of sbreserve(9) to work around resource limits.
1023          *
1024          * XXX: This kind of sucks.  On one hand, we don't currently support
1025          *      sending a part of data segment; we always do it in one piece,
1026          *      so we have to make sure it can fit in the socket buffer.
1027          *      Once I've implemented partial send, we'll get rid of this
1028          *      and use autoscaling.
1029          */
1030         bufsize = (sizeof(struct iscsi_bhs) +
1031             ic->ic_max_data_segment_length) * 8;
1032         error = soreserve(ic->ic_socket, bufsize, bufsize);
1033         if (error != 0) {
1034                 ICL_WARN("soreserve failed with error %d", error);
1035                 icl_conn_close(ic);
1036                 return (error);
1037         }
1038
1039         /*
1040          * Disable Nagle.
1041          */
1042         bzero(&opt, sizeof(opt));
1043         opt.sopt_dir = SOPT_SET;
1044         opt.sopt_level = IPPROTO_TCP;
1045         opt.sopt_name = TCP_NODELAY;
1046         opt.sopt_val = &one;
1047         opt.sopt_valsize = sizeof(one);
1048         error = sosetopt(ic->ic_socket, &opt);
1049         if (error != 0) {
1050                 ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1051                 icl_conn_close(ic);
1052                 return (error);
1053         }
1054
1055         /*
1056          * Start threads.
1057          */
1058         error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1059         if (error != 0) {
1060                 ICL_WARN("kthread_add(9) failed with error %d", error);
1061                 icl_conn_close(ic);
1062                 return (error);
1063         }
1064
1065         error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1066         if (error != 0) {
1067                 ICL_WARN("kthread_add(9) failed with error %d", error);
1068                 icl_conn_close(ic);
1069                 return (error);
1070         }
1071
1072         /*
1073          * Register socket upcall, to get notified about incoming PDUs
1074          * and free space to send outgoing ones.
1075          */
1076         SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1077         soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1078         SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1079         SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1080         soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1081         SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1082
1083         return (0);
1084 }
1085
1086 int
1087 icl_conn_handoff(struct icl_conn *ic, int fd)
1088 {
1089         struct file *fp;
1090         struct socket *so;
1091         cap_rights_t rights;
1092         int error;
1093
1094         /*
1095          * Steal the socket from userland.
1096          */
1097         error = fget(curthread, fd,
1098             cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1099         if (error != 0)
1100                 return (error);
1101         if (fp->f_type != DTYPE_SOCKET) {
1102                 fdrop(fp, curthread);
1103                 return (EINVAL);
1104         }
1105         so = fp->f_data;
1106         if (so->so_type != SOCK_STREAM) {
1107                 fdrop(fp, curthread);
1108                 return (EINVAL);
1109         }
1110
1111         ICL_CONN_LOCK(ic);
1112
1113         if (ic->ic_socket != NULL) {
1114                 ICL_CONN_UNLOCK(ic);
1115                 fdrop(fp, curthread);
1116                 return (EBUSY);
1117         }
1118
1119         ic->ic_socket = fp->f_data;
1120         fp->f_ops = &badfileops;
1121         fp->f_data = NULL;
1122         fdrop(fp, curthread);
1123         ICL_CONN_UNLOCK(ic);
1124
1125         error = icl_conn_start(ic);
1126
1127         return (error);
1128 }
1129
1130 void
1131 icl_conn_shutdown(struct icl_conn *ic)
1132 {
1133
1134         ICL_CONN_LOCK(ic);
1135         if (ic->ic_socket == NULL) {
1136                 ICL_CONN_UNLOCK(ic);
1137                 return;
1138         }
1139         ICL_CONN_UNLOCK(ic);
1140
1141         soshutdown(ic->ic_socket, SHUT_RDWR);
1142 }
1143
1144 void
1145 icl_conn_close(struct icl_conn *ic)
1146 {
1147         struct icl_pdu *pdu;
1148
1149         ICL_CONN_LOCK(ic);
1150         if (ic->ic_socket == NULL) {
1151                 ICL_CONN_UNLOCK(ic);
1152                 return;
1153         }
1154
1155         ic->ic_disconnecting = true;
1156
1157         /*
1158          * Wake up the threads, so they can properly terminate.
1159          */
1160         cv_signal(&ic->ic_receive_cv);
1161         cv_signal(&ic->ic_send_cv);
1162         while (ic->ic_receive_running || ic->ic_send_running) {
1163                 //ICL_DEBUG("waiting for send/receive threads to terminate");
1164                 ICL_CONN_UNLOCK(ic);
1165                 cv_signal(&ic->ic_receive_cv);
1166                 cv_signal(&ic->ic_send_cv);
1167                 pause("icl_close", 1 * hz);
1168                 ICL_CONN_LOCK(ic);
1169         }
1170         //ICL_DEBUG("send/receive threads terminated");
1171
1172         soclose(ic->ic_socket);
1173         ic->ic_socket = NULL;
1174
1175         if (ic->ic_receive_pdu != NULL) {
1176                 //ICL_DEBUG("freeing partially received PDU");
1177                 icl_pdu_free(ic->ic_receive_pdu);
1178                 ic->ic_receive_pdu = NULL;
1179         }
1180
1181         /*
1182          * Remove any outstanding PDUs from the send queue.
1183          */
1184         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1185                 pdu = TAILQ_FIRST(&ic->ic_to_send);
1186                 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1187                 icl_pdu_free(pdu);
1188         }
1189
1190         KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1191             ("destroying session with non-empty send queue"));
1192         /*
1193          * XXX
1194          */
1195 #if 0
1196         KASSERT(ic->ic_outstanding_pdus == 0,
1197             ("destroying session with %d outstanding PDUs",
1198              ic->ic_outstanding_pdus));
1199 #endif
1200         ICL_CONN_UNLOCK(ic);
1201 }
1202
1203 bool
1204 icl_conn_connected(struct icl_conn *ic)
1205 {
1206
1207         ICL_CONN_LOCK(ic);
1208         if (ic->ic_socket == NULL) {
1209                 ICL_CONN_UNLOCK(ic);
1210                 return (false);
1211         }
1212         if (ic->ic_socket->so_error != 0) {
1213                 ICL_CONN_UNLOCK(ic);
1214                 return (false);
1215         }
1216         ICL_CONN_UNLOCK(ic);
1217         return (true);
1218 }
1219
#ifdef ICL_KERNEL_PROXY
/*
 * Attach an in-kernel socket to the connection and start it.
 *
 * Returns EINVAL if 'so' is not a SOCK_STREAM socket, EBUSY if the
 * connection already has a socket, and otherwise whatever
 * icl_conn_start() returns.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	/* Claim the socket slot only if it is still free. */
	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1242
1243 static int
1244 icl_unload(void)
1245 {
1246
1247         if (icl_ncons != 0)
1248                 return (EBUSY);
1249
1250         uma_zdestroy(icl_conn_zone);
1251         uma_zdestroy(icl_pdu_zone);
1252
1253         return (0);
1254 }
1255
1256 static void
1257 icl_load(void)
1258 {
1259
1260         icl_conn_zone = uma_zcreate("icl_conn",
1261             sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1262             UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1263         icl_pdu_zone = uma_zcreate("icl_pdu",
1264             sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1265             UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1266
1267         refcount_init(&icl_ncons, 0);
1268 }
1269
1270 static int
1271 icl_modevent(module_t mod, int what, void *arg)
1272 {
1273
1274         switch (what) {
1275         case MOD_LOAD:
1276                 icl_load();
1277                 return (0);
1278         case MOD_UNLOAD:
1279                 return (icl_unload());
1280         default:
1281                 return (EINVAL);
1282         }
1283 }
1284
1285 moduledata_t icl_data = {
1286         "icl",
1287         icl_modevent,
1288         0
1289 };
1290
1291 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1292 MODULE_VERSION(icl, 1);