]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - sys/dev/iscsi/icl.c
MFC r264110:
[FreeBSD/stable/10.git] / sys / dev / iscsi / icl.c
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 /*
33  * iSCSI Common Layer.  It's used by both the initiator and target to send
34  * and receive iSCSI PDUs.
35  */
36
37 #include <sys/param.h>
38 #include <sys/capability.h>
39 #include <sys/condvar.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/mbuf.h>
46 #include <sys/mutex.h>
47 #include <sys/module.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/sx.h>
53 #include <sys/uio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57
58 #include "icl.h"
59 #include "iscsi_proto.h"
60
61 SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62 static int debug = 1;
63 TUNABLE_INT("kern.icl.debug", &debug);
64 SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN,
65     &debug, 1, "Enable debug messages");
66 static int partial_receive_len = 1 * 1024; /* XXX: More? */
67 TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
69     &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70     "data segment");
71 static int sendspace = 1048576;
72 TUNABLE_INT("kern.icl.sendspace", &sendspace);
73 SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
74     &sendspace, 1048576, "Default send socket buffer size");
75 static int recvspace = 1048576;
76 TUNABLE_INT("kern.icl.recvspace", &recvspace);
77 SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
78     &recvspace, 1048576, "Default receive socket buffer size");
79
80 static uma_zone_t icl_conn_zone;
81 static uma_zone_t icl_pdu_zone;
82
83 static volatile u_int   icl_ncons;
84
/*
 * Logging helpers, gated on the "debug" knob: ICL_WARN prints when
 * debug > 0, ICL_DEBUG when debug > 1.  X must be a string literal; it is
 * pasted between the function-name prefix and the trailing newline.
 *
 * Both are wrapped in do { } while (0) so that each expands to exactly
 * one statement and can be used safely as the body of an unbraced
 * if/else.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1) {					\
			printf("%s: " X "\n",				\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/*
 * Connection lock helpers; X is a struct icl_conn pointer whose ic_lock
 * member points at the mutex to operate on.  The argument is
 * parenthesized so that arbitrary pointer expressions may be passed.
 */
#define	ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define	ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define	ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define	ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
100
101 static void
102 icl_conn_fail(struct icl_conn *ic)
103 {
104         if (ic->ic_socket == NULL)
105                 return;
106
107         /*
108          * XXX
109          */
110         ic->ic_socket->so_error = EDOOFUS;
111         (ic->ic_error)(ic);
112 }
113
114 static struct mbuf *
115 icl_conn_receive(struct icl_conn *ic, size_t len)
116 {
117         struct uio uio;
118         struct socket *so;
119         struct mbuf *m;
120         int error, flags;
121
122         so = ic->ic_socket;
123
124         memset(&uio, 0, sizeof(uio));
125         uio.uio_resid = len;
126
127         flags = MSG_DONTWAIT;
128         error = soreceive(so, NULL, &uio, &m, NULL, &flags);
129         if (error != 0) {
130                 ICL_DEBUG("soreceive error %d", error);
131                 return (NULL);
132         }
133         if (uio.uio_resid != 0) {
134                 m_freem(m);
135                 ICL_DEBUG("short read");
136                 return (NULL);
137         }
138
139         return (m);
140 }
141
142 static struct icl_pdu *
143 icl_pdu_new(struct icl_conn *ic, int flags)
144 {
145         struct icl_pdu *ip;
146
147 #ifdef DIAGNOSTIC
148         refcount_acquire(&ic->ic_outstanding_pdus);
149 #endif
150         ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
151         if (ip == NULL) {
152                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
153 #ifdef DIAGNOSTIC
154                 refcount_release(&ic->ic_outstanding_pdus);
155 #endif
156                 return (NULL);
157         }
158
159         ip->ip_conn = ic;
160
161         return (ip);
162 }
163
164 void
165 icl_pdu_free(struct icl_pdu *ip)
166 {
167         struct icl_conn *ic;
168
169         ic = ip->ip_conn;
170
171         m_freem(ip->ip_bhs_mbuf);
172         m_freem(ip->ip_ahs_mbuf);
173         m_freem(ip->ip_data_mbuf);
174         uma_zfree(icl_pdu_zone, ip);
175 #ifdef DIAGNOSTIC
176         refcount_release(&ic->ic_outstanding_pdus);
177 #endif
178 }
179
180 /*
181  * Allocate icl_pdu with empty BHS to fill up by the caller.
182  */
183 struct icl_pdu *
184 icl_pdu_new_bhs(struct icl_conn *ic, int flags)
185 {
186         struct icl_pdu *ip;
187
188         ip = icl_pdu_new(ic, flags);
189         if (ip == NULL)
190                 return (NULL);
191
192         ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
193             flags, MT_DATA, M_PKTHDR);
194         if (ip->ip_bhs_mbuf == NULL) {
195                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
196                 icl_pdu_free(ip);
197                 return (NULL);
198         }
199         ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
200         memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
201         ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
202
203         return (ip);
204 }
205
206 static int
207 icl_pdu_ahs_length(const struct icl_pdu *request)
208 {
209
210         return (request->ip_bhs->bhs_total_ahs_len * 4);
211 }
212
213 size_t
214 icl_pdu_data_segment_length(const struct icl_pdu *request)
215 {
216         uint32_t len = 0;
217
218         len += request->ip_bhs->bhs_data_segment_len[0];
219         len <<= 8;
220         len += request->ip_bhs->bhs_data_segment_len[1];
221         len <<= 8;
222         len += request->ip_bhs->bhs_data_segment_len[2];
223
224         return (len);
225 }
226
227 static void
228 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
229 {
230
231         response->ip_bhs->bhs_data_segment_len[2] = len;
232         response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
233         response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
234 }
235
236 static size_t
237 icl_pdu_padding(const struct icl_pdu *ip)
238 {
239
240         if ((ip->ip_data_len % 4) != 0)
241                 return (4 - (ip->ip_data_len % 4));
242
243         return (0);
244 }
245
246 static size_t
247 icl_pdu_size(const struct icl_pdu *response)
248 {
249         size_t len;
250
251         KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
252
253         len = sizeof(struct iscsi_bhs) + response->ip_data_len +
254             icl_pdu_padding(response);
255         if (response->ip_conn->ic_header_crc32c)
256                 len += ISCSI_HEADER_DIGEST_SIZE;
257         if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
258                 len += ISCSI_DATA_DIGEST_SIZE;
259
260         return (len);
261 }
262
263 static int
264 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
265 {
266         struct mbuf *m;
267
268         m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
269         if (m == NULL) {
270                 ICL_DEBUG("failed to receive BHS");
271                 return (-1);
272         }
273
274         request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
275         if (request->ip_bhs_mbuf == NULL) {
276                 ICL_WARN("m_pullup failed");
277                 return (-1);
278         }
279         request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
280
281         /*
282          * XXX: For architectures with strict alignment requirements
283          *      we may need to allocate ip_bhs and copy the data into it.
284          *      For some reason, though, not doing this doesn't seem
285          *      to cause problems; tested on sparc64.
286          */
287
288         *availablep -= sizeof(struct iscsi_bhs);
289         return (0);
290 }
291
292 static int
293 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
294 {
295
296         request->ip_ahs_len = icl_pdu_ahs_length(request);
297         if (request->ip_ahs_len == 0)
298                 return (0);
299
300         request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
301             request->ip_ahs_len);
302         if (request->ip_ahs_mbuf == NULL) {
303                 ICL_DEBUG("failed to receive AHS");
304                 return (-1);
305         }
306
307         *availablep -= request->ip_ahs_len;
308         return (0);
309 }
310
311 static uint32_t
312 icl_mbuf_to_crc32c(const struct mbuf *m0)
313 {
314         uint32_t digest = 0xffffffff;
315         const struct mbuf *m;
316
317         for (m = m0; m != NULL; m = m->m_next)
318                 digest = calculate_crc32c(digest,
319                     mtod(m, const void *), m->m_len);
320
321         digest = digest ^ 0xffffffff;
322
323         return (digest);
324 }
325
326 static int
327 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
328 {
329         struct mbuf *m;
330         uint32_t received_digest, valid_digest;
331
332         if (request->ip_conn->ic_header_crc32c == false)
333                 return (0);
334
335         m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
336         if (m == NULL) {
337                 ICL_DEBUG("failed to receive header digest");
338                 return (-1);
339         }
340
341         CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
342         m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
343         m_freem(m);
344
345         *availablep -= ISCSI_HEADER_DIGEST_SIZE;
346
347         /*
348          * XXX: Handle AHS.
349          */
350         valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
351         if (received_digest != valid_digest) {
352                 ICL_WARN("header digest check failed; got 0x%x, "
353                     "should be 0x%x", received_digest, valid_digest);
354                 return (-1);
355         }
356
357         return (0);
358 }
359
360 /*
361  * Return the number of bytes that should be waiting in the receive socket
362  * before icl_pdu_receive_data_segment() gets called.
363  */
364 static size_t
365 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
366 {
367         size_t len;
368
369         len = icl_pdu_data_segment_length(request);
370         if (len == 0)
371                 return (0);
372
373         /*
374          * Account for the parts of data segment already read from
375          * the socket buffer.
376          */
377         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
378         len -= request->ip_data_len;
379
380         /*
381          * Don't always wait for the full data segment to be delivered
382          * to the socket; this might badly affect performance due to
383          * TCP window scaling.
384          */
385         if (len > partial_receive_len) {
386 #if 0
387                 ICL_DEBUG("need %zd bytes of data, limiting to %zd",
388                     len, partial_receive_len));
389 #endif
390                 len = partial_receive_len;
391
392                 return (len);
393         }
394
395         /*
396          * Account for padding.  Note that due to the way code is written,
397          * the icl_pdu_receive_data_segment() must always receive padding
398          * along with the last part of data segment, because it would be
399          * impossible to tell whether we've already received the full data
400          * segment including padding, or without it.
401          */
402         if ((len % 4) != 0)
403                 len += 4 - (len % 4);
404
405 #if 0
406         ICL_DEBUG("need %zd bytes of data", len));
407 #endif
408
409         return (len);
410 }
411
412 static int
413 icl_pdu_receive_data_segment(struct icl_pdu *request,
414     size_t *availablep, bool *more_neededp)
415 {
416         struct icl_conn *ic;
417         size_t len, padding = 0;
418         struct mbuf *m;
419
420         ic = request->ip_conn;
421
422         *more_neededp = false;
423         ic->ic_receive_len = 0;
424
425         len = icl_pdu_data_segment_length(request);
426         if (len == 0)
427                 return (0);
428
429         if ((len % 4) != 0)
430                 padding = 4 - (len % 4);
431
432         /*
433          * Account for already received parts of data segment.
434          */
435         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
436         len -= request->ip_data_len;
437
438         if (len + padding > *availablep) {
439                 /*
440                  * Not enough data in the socket buffer.  Receive as much
441                  * as we can.  Don't receive padding, since, obviously, it's
442                  * not the end of data segment yet.
443                  */
444 #if 0
445                 ICL_DEBUG("limited from %zd to %zd",
446                     len + padding, *availablep - padding));
447 #endif
448                 len = *availablep - padding;
449                 *more_neededp = true;
450                 padding = 0;
451         }
452
453         /*
454          * Must not try to receive padding without at least one byte
455          * of actual data segment.
456          */
457         if (len > 0) {
458                 m = icl_conn_receive(request->ip_conn, len + padding);
459                 if (m == NULL) {
460                         ICL_DEBUG("failed to receive data segment");
461                         return (-1);
462                 }
463
464                 if (request->ip_data_mbuf == NULL)
465                         request->ip_data_mbuf = m;
466                 else
467                         m_cat(request->ip_data_mbuf, m);
468
469                 request->ip_data_len += len;
470                 *availablep -= len + padding;
471         } else
472                 ICL_DEBUG("len 0");
473
474         if (*more_neededp)
475                 ic->ic_receive_len =
476                     icl_pdu_data_segment_receive_len(request);
477
478         return (0);
479 }
480
481 static int
482 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
483 {
484         struct mbuf *m;
485         uint32_t received_digest, valid_digest;
486
487         if (request->ip_conn->ic_data_crc32c == false)
488                 return (0);
489
490         if (request->ip_data_len == 0)
491                 return (0);
492
493         m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
494         if (m == NULL) {
495                 ICL_DEBUG("failed to receive data digest");
496                 return (-1);
497         }
498
499         CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
500         m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
501         m_freem(m);
502
503         *availablep -= ISCSI_DATA_DIGEST_SIZE;
504
505         /*
506          * Note that ip_data_mbuf also contains padding; since digest
507          * calculation is supposed to include that, we iterate over
508          * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
509          */
510         valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
511         if (received_digest != valid_digest) {
512                 ICL_WARN("data digest check failed; got 0x%x, "
513                     "should be 0x%x", received_digest, valid_digest);
514                 return (-1);
515         }
516
517         return (0);
518 }
519
520 /*
521  * Somewhat contrary to the name, this attempts to receive only one
522  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
523  */
524 static struct icl_pdu *
525 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
526 {
527         struct icl_pdu *request;
528         struct socket *so;
529         size_t len;
530         int error;
531         bool more_needed;
532
533         so = ic->ic_socket;
534
535         if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
536                 KASSERT(ic->ic_receive_pdu == NULL,
537                     ("ic->ic_receive_pdu != NULL"));
538                 request = icl_pdu_new(ic, M_NOWAIT);
539                 if (request == NULL) {
540                         ICL_DEBUG("failed to allocate PDU; "
541                             "dropping connection");
542                         icl_conn_fail(ic);
543                         return (NULL);
544                 }
545                 ic->ic_receive_pdu = request;
546         } else {
547                 KASSERT(ic->ic_receive_pdu != NULL,
548                     ("ic->ic_receive_pdu == NULL"));
549                 request = ic->ic_receive_pdu;
550         }
551
552         if (*availablep < ic->ic_receive_len) {
553 #if 0
554                 ICL_DEBUG("not enough data; need %zd, "
555                     "have %zd", ic->ic_receive_len, *availablep);
556 #endif
557                 return (NULL);
558         }
559
560         switch (ic->ic_receive_state) {
561         case ICL_CONN_STATE_BHS:
562                 //ICL_DEBUG("receiving BHS");
563                 error = icl_pdu_receive_bhs(request, availablep);
564                 if (error != 0) {
565                         ICL_DEBUG("failed to receive BHS; "
566                             "dropping connection");
567                         break;
568                 }
569
570                 /*
571                  * We don't enforce any limit for AHS length;
572                  * its length is stored in 8 bit field.
573                  */
574
575                 len = icl_pdu_data_segment_length(request);
576                 if (len > ic->ic_max_data_segment_length) {
577                         ICL_WARN("received data segment "
578                             "length %zd is larger than negotiated "
579                             "MaxDataSegmentLength %zd; "
580                             "dropping connection",
581                             len, ic->ic_max_data_segment_length);
582                         error = EINVAL;
583                         break;
584                 }
585
586                 ic->ic_receive_state = ICL_CONN_STATE_AHS;
587                 ic->ic_receive_len = icl_pdu_ahs_length(request);
588                 break;
589
590         case ICL_CONN_STATE_AHS:
591                 //ICL_DEBUG("receiving AHS");
592                 error = icl_pdu_receive_ahs(request, availablep);
593                 if (error != 0) {
594                         ICL_DEBUG("failed to receive AHS; "
595                             "dropping connection");
596                         break;
597                 }
598                 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
599                 if (ic->ic_header_crc32c == false)
600                         ic->ic_receive_len = 0;
601                 else
602                         ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
603                 break;
604
605         case ICL_CONN_STATE_HEADER_DIGEST:
606                 //ICL_DEBUG("receiving header digest");
607                 error = icl_pdu_check_header_digest(request, availablep);
608                 if (error != 0) {
609                         ICL_DEBUG("header digest failed; "
610                             "dropping connection");
611                         break;
612                 }
613
614                 ic->ic_receive_state = ICL_CONN_STATE_DATA;
615                 ic->ic_receive_len =
616                     icl_pdu_data_segment_receive_len(request);
617                 break;
618
619         case ICL_CONN_STATE_DATA:
620                 //ICL_DEBUG("receiving data segment");
621                 error = icl_pdu_receive_data_segment(request, availablep,
622                     &more_needed);
623                 if (error != 0) {
624                         ICL_DEBUG("failed to receive data segment;"
625                             "dropping connection");
626                         break;
627                 }
628
629                 if (more_needed)
630                         break;
631
632                 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
633                 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
634                         ic->ic_receive_len = 0;
635                 else
636                         ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
637                 break;
638
639         case ICL_CONN_STATE_DATA_DIGEST:
640                 //ICL_DEBUG("receiving data digest");
641                 error = icl_pdu_check_data_digest(request, availablep);
642                 if (error != 0) {
643                         ICL_DEBUG("data digest failed; "
644                             "dropping connection");
645                         break;
646                 }
647
648                 /*
649                  * We've received complete PDU; reset the receive state machine
650                  * and return the PDU.
651                  */
652                 ic->ic_receive_state = ICL_CONN_STATE_BHS;
653                 ic->ic_receive_len = sizeof(struct iscsi_bhs);
654                 ic->ic_receive_pdu = NULL;
655                 return (request);
656
657         default:
658                 panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
659         }
660
661         if (error != 0) {
662                 icl_pdu_free(request);
663                 icl_conn_fail(ic);
664         }
665
666         return (NULL);
667 }
668
669 static void
670 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
671 {
672         struct icl_pdu *response;
673         struct socket *so;
674
675         so = ic->ic_socket;
676
677         /*
678          * This can never happen; we're careful to only mess with ic->ic_socket
679          * pointer when the send/receive threads are not running.
680          */
681         KASSERT(so != NULL, ("NULL socket"));
682
683         for (;;) {
684                 if (ic->ic_disconnecting)
685                         return;
686
687                 if (so->so_error != 0) {
688                         ICL_DEBUG("connection error %d; "
689                             "dropping connection", so->so_error);
690                         icl_conn_fail(ic);
691                         return;
692                 }
693
694                 /*
695                  * Loop until we have a complete PDU or there is not enough
696                  * data in the socket buffer.
697                  */
698                 if (available < ic->ic_receive_len) {
699 #if 0
700                         ICL_DEBUG("not enough data; have %zd, "
701                             "need %zd", available,
702                             ic->ic_receive_len);
703 #endif
704                         return;
705                 }
706
707                 response = icl_conn_receive_pdu(ic, &available);
708                 if (response == NULL)
709                         continue;
710
711                 if (response->ip_ahs_len > 0) {
712                         ICL_WARN("received PDU with unsupported "
713                             "AHS; opcode 0x%x; dropping connection",
714                             response->ip_bhs->bhs_opcode);
715                         icl_pdu_free(response);
716                         icl_conn_fail(ic);
717                         return;
718                 }
719
720                 (ic->ic_receive)(response);
721         }
722 }
723
724 static void
725 icl_receive_thread(void *arg)
726 {
727         struct icl_conn *ic;
728         size_t available;
729         struct socket *so;
730
731         ic = arg;
732         so = ic->ic_socket;
733
734         ICL_CONN_LOCK(ic);
735         ic->ic_receive_running = true;
736         ICL_CONN_UNLOCK(ic);
737
738         for (;;) {
739                 if (ic->ic_disconnecting) {
740                         //ICL_DEBUG("terminating");
741                         break;
742                 }
743
744                 SOCKBUF_LOCK(&so->so_rcv);
745                 available = so->so_rcv.sb_cc;
746                 if (available < ic->ic_receive_len) {
747                         so->so_rcv.sb_lowat = ic->ic_receive_len;
748                         cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
749                 }
750                 SOCKBUF_UNLOCK(&so->so_rcv);
751
752                 icl_conn_receive_pdus(ic, available);
753         }
754
755         ICL_CONN_LOCK(ic);
756         ic->ic_receive_running = false;
757         ICL_CONN_UNLOCK(ic);
758         kthread_exit();
759 }
760
761 static int
762 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
763 {
764         struct icl_conn *ic;
765
766         ic = arg;
767         cv_signal(&ic->ic_receive_cv);
768         return (SU_OK);
769 }
770
771 static int
772 icl_pdu_send(struct icl_pdu *request)
773 {
774         size_t padding, pdu_len;
775         uint32_t digest, zero = 0;
776         int error, ok;
777         struct socket *so;
778         struct icl_conn *ic;
779
780         ic = request->ip_conn;
781         so = request->ip_conn->ic_socket;
782
783         ICL_CONN_LOCK_ASSERT(ic);
784
785         icl_pdu_set_data_segment_length(request, request->ip_data_len);
786
787         pdu_len = icl_pdu_size(request);
788
789         if (ic->ic_header_crc32c) {
790                 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
791                 ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
792                     (void *)&digest);
793                 if (ok != 1) {
794                         ICL_WARN("failed to append header digest");
795                         return (1);
796                 }
797         }
798
799         if (request->ip_data_len != 0) {
800                 padding = icl_pdu_padding(request);
801                 if (padding > 0) {
802                         ok = m_append(request->ip_data_mbuf, padding,
803                             (void *)&zero);
804                         if (ok != 1) {
805                                 ICL_WARN("failed to append padding");
806                                 return (1);
807                         }
808                 }
809
810                 if (ic->ic_data_crc32c) {
811                         digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
812
813                         ok = m_append(request->ip_data_mbuf, sizeof(digest),
814                             (void *)&digest);
815                         if (ok != 1) {
816                                 ICL_WARN("failed to append header digest");
817                                 return (1);
818                         }
819                 }
820
821                 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
822                 request->ip_data_mbuf = NULL;
823         }
824
825         request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
826
827         error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
828             NULL, MSG_DONTWAIT, curthread);
829         request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
830         if (error != 0) {
831                 ICL_DEBUG("sosend error %d", error);
832                 return (error);
833         }
834
835         return (0);
836 }
837
838 static void
839 icl_conn_send_pdus(struct icl_conn *ic)
840 {
841         struct icl_pdu *request;
842         struct socket *so;
843         size_t available, size;
844         int error;
845
846         ICL_CONN_LOCK_ASSERT(ic);
847
848         so = ic->ic_socket;
849
850         SOCKBUF_LOCK(&so->so_snd);
851         available = sbspace(&so->so_snd);
852         SOCKBUF_UNLOCK(&so->so_snd);
853
854         while (!STAILQ_EMPTY(&ic->ic_to_send)) {
855                 if (ic->ic_disconnecting)
856                         return;
857
858                 request = STAILQ_FIRST(&ic->ic_to_send);
859                 size = icl_pdu_size(request);
860                 if (available < size) {
861                         /*
862                          * Set the low watermark on the socket,
863                          * to avoid waking up until there is enough
864                          * space.
865                          */
866                         SOCKBUF_LOCK(&so->so_snd);
867                         so->so_snd.sb_lowat = size;
868                         SOCKBUF_UNLOCK(&so->so_snd);
869 #if 1
870                         ICL_DEBUG("no space to send; "
871                             "have %zd, need %zd",
872                             available, size);
873 #endif
874                         return;
875                 }
876                 available -= size;
877                 STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
878                 error = icl_pdu_send(request);
879                 if (error != 0) {
880                         ICL_DEBUG("failed to send PDU; "
881                             "dropping connection");
882                         icl_conn_fail(ic);
883                         return;
884                 } 
885                 icl_pdu_free(request);
886         }
887 }
888
889 static void
890 icl_send_thread(void *arg)
891 {
892         struct icl_conn *ic;
893
894         ic = arg;
895
896         ICL_CONN_LOCK(ic);
897         ic->ic_send_running = true;
898
899         for (;;) {
900                 if (ic->ic_disconnecting) {
901                         //ICL_DEBUG("terminating");
902                         break;
903                 }
904                 icl_conn_send_pdus(ic);
905                 cv_wait(&ic->ic_send_cv, ic->ic_lock);
906         }
907
908         ic->ic_send_running = false;
909         ICL_CONN_UNLOCK(ic);
910         kthread_exit();
911 }
912
913 static int
914 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
915 {
916         struct icl_conn *ic;
917
918         ic = arg;
919         cv_signal(&ic->ic_send_cv);
920         return (SU_OK);
921 }
922
923 int
924 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
925 {
926         struct mbuf *mb, *newmb;
927         size_t copylen, off = 0;
928
929         KASSERT(len > 0, ("len == 0"));
930
931         newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
932         if (newmb == NULL) {
933                 ICL_WARN("failed to allocate mbuf for %zd bytes", len);
934                 return (ENOMEM);
935         }
936
937         for (mb = newmb; mb != NULL; mb = mb->m_next) {
938                 copylen = min(M_TRAILINGSPACE(mb), len - off);
939                 memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
940                 mb->m_len = copylen;
941                 off += copylen;
942         }
943         KASSERT(off == len, ("%s: off != len", __func__));
944
945         if (request->ip_data_mbuf == NULL) {
946                 request->ip_data_mbuf = newmb;
947                 request->ip_data_len = len;
948         } else {
949                 m_cat(request->ip_data_mbuf, newmb);
950                 request->ip_data_len += len;
951         }
952
953         return (0);
954 }
955
956 void
957 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
958 {
959
960         m_copydata(ip->ip_data_mbuf, off, len, addr);
961 }
962
963 void
964 icl_pdu_queue(struct icl_pdu *ip)
965 {
966         struct icl_conn *ic;
967
968         ic = ip->ip_conn;
969
970         ICL_CONN_LOCK_ASSERT(ic);
971
972         if (ic->ic_disconnecting || ic->ic_socket == NULL) {
973                 ICL_DEBUG("icl_pdu_queue on closed connection");
974                 icl_pdu_free(ip);
975                 return;
976         }
977         STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
978         cv_signal(&ic->ic_send_cv);
979 }
980
981 struct icl_conn *
982 icl_conn_new(const char *name, struct mtx *lock)
983 {
984         struct icl_conn *ic;
985
986         refcount_acquire(&icl_ncons);
987
988         ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
989
990         STAILQ_INIT(&ic->ic_to_send);
991         ic->ic_lock = lock;
992         cv_init(&ic->ic_send_cv, "icl_tx");
993         cv_init(&ic->ic_receive_cv, "icl_rx");
994 #ifdef DIAGNOSTIC
995         refcount_init(&ic->ic_outstanding_pdus, 0);
996 #endif
997         ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
998         ic->ic_name = name;
999
1000         return (ic);
1001 }
1002
1003 void
1004 icl_conn_free(struct icl_conn *ic)
1005 {
1006
1007         cv_destroy(&ic->ic_send_cv);
1008         cv_destroy(&ic->ic_receive_cv);
1009         uma_zfree(icl_conn_zone, ic);
1010         refcount_release(&icl_ncons);
1011 }
1012
1013 static int
1014 icl_conn_start(struct icl_conn *ic)
1015 {
1016         size_t minspace;
1017         struct sockopt opt;
1018         int error, one = 1;
1019
1020         ICL_CONN_LOCK(ic);
1021
1022         /*
1023          * XXX: Ugly hack.
1024          */
1025         if (ic->ic_socket == NULL) {
1026                 ICL_CONN_UNLOCK(ic);
1027                 return (EINVAL);
1028         }
1029
1030         ic->ic_receive_state = ICL_CONN_STATE_BHS;
1031         ic->ic_receive_len = sizeof(struct iscsi_bhs);
1032         ic->ic_disconnecting = false;
1033
1034         ICL_CONN_UNLOCK(ic);
1035
1036         /*
1037          * For sendspace, this is required because the current code cannot
1038          * send a PDU in pieces; thus, the minimum buffer size is equal
1039          * to the maximum PDU size.  "+4" is to account for possible padding.
1040          *
1041          * What we should actually do here is to use autoscaling, but set
1042          * some minimal buffer size to "minspace".  I don't know a way to do
1043          * that, though.
1044          */
1045         minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
1046             ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
1047         if (sendspace < minspace) {
1048                 ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
1049                     minspace);
1050                 sendspace = minspace;
1051         }
1052         if (recvspace < minspace) {
1053                 ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
1054                     minspace);
1055                 recvspace = minspace;
1056         }
1057
1058         error = soreserve(ic->ic_socket, sendspace, recvspace);
1059         if (error != 0) {
1060                 ICL_WARN("soreserve failed with error %d", error);
1061                 icl_conn_close(ic);
1062                 return (error);
1063         }
1064
1065         /*
1066          * Disable Nagle.
1067          */
1068         bzero(&opt, sizeof(opt));
1069         opt.sopt_dir = SOPT_SET;
1070         opt.sopt_level = IPPROTO_TCP;
1071         opt.sopt_name = TCP_NODELAY;
1072         opt.sopt_val = &one;
1073         opt.sopt_valsize = sizeof(one);
1074         error = sosetopt(ic->ic_socket, &opt);
1075         if (error != 0) {
1076                 ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1077                 icl_conn_close(ic);
1078                 return (error);
1079         }
1080
1081         /*
1082          * Start threads.
1083          */
1084         error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1085             ic->ic_name);
1086         if (error != 0) {
1087                 ICL_WARN("kthread_add(9) failed with error %d", error);
1088                 icl_conn_close(ic);
1089                 return (error);
1090         }
1091
1092         error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1093             ic->ic_name);
1094         if (error != 0) {
1095                 ICL_WARN("kthread_add(9) failed with error %d", error);
1096                 icl_conn_close(ic);
1097                 return (error);
1098         }
1099
1100         /*
1101          * Register socket upcall, to get notified about incoming PDUs
1102          * and free space to send outgoing ones.
1103          */
1104         SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1105         soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1106         SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1107         SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1108         soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1109         SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1110
1111         return (0);
1112 }
1113
1114 int
1115 icl_conn_handoff(struct icl_conn *ic, int fd)
1116 {
1117         struct file *fp;
1118         struct socket *so;
1119         cap_rights_t rights;
1120         int error;
1121
1122         ICL_CONN_LOCK_ASSERT_NOT(ic);
1123
1124         /*
1125          * Steal the socket from userland.
1126          */
1127         error = fget(curthread, fd,
1128             cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1129         if (error != 0)
1130                 return (error);
1131         if (fp->f_type != DTYPE_SOCKET) {
1132                 fdrop(fp, curthread);
1133                 return (EINVAL);
1134         }
1135         so = fp->f_data;
1136         if (so->so_type != SOCK_STREAM) {
1137                 fdrop(fp, curthread);
1138                 return (EINVAL);
1139         }
1140
1141         ICL_CONN_LOCK(ic);
1142
1143         if (ic->ic_socket != NULL) {
1144                 ICL_CONN_UNLOCK(ic);
1145                 fdrop(fp, curthread);
1146                 return (EBUSY);
1147         }
1148
1149         ic->ic_socket = fp->f_data;
1150         fp->f_ops = &badfileops;
1151         fp->f_data = NULL;
1152         fdrop(fp, curthread);
1153         ICL_CONN_UNLOCK(ic);
1154
1155         error = icl_conn_start(ic);
1156
1157         return (error);
1158 }
1159
1160 void
1161 icl_conn_shutdown(struct icl_conn *ic)
1162 {
1163         ICL_CONN_LOCK_ASSERT_NOT(ic);
1164
1165         ICL_CONN_LOCK(ic);
1166         if (ic->ic_socket == NULL) {
1167                 ICL_CONN_UNLOCK(ic);
1168                 return;
1169         }
1170         ICL_CONN_UNLOCK(ic);
1171
1172         soshutdown(ic->ic_socket, SHUT_RDWR);
1173 }
1174
1175 void
1176 icl_conn_close(struct icl_conn *ic)
1177 {
1178         struct icl_pdu *pdu;
1179
1180         ICL_CONN_LOCK_ASSERT_NOT(ic);
1181
1182         ICL_CONN_LOCK(ic);
1183         if (ic->ic_socket == NULL) {
1184                 ICL_CONN_UNLOCK(ic);
1185                 return;
1186         }
1187
1188         ic->ic_disconnecting = true;
1189
1190         /*
1191          * Wake up the threads, so they can properly terminate.
1192          */
1193         cv_signal(&ic->ic_receive_cv);
1194         cv_signal(&ic->ic_send_cv);
1195         while (ic->ic_receive_running || ic->ic_send_running) {
1196                 //ICL_DEBUG("waiting for send/receive threads to terminate");
1197                 ICL_CONN_UNLOCK(ic);
1198                 cv_signal(&ic->ic_receive_cv);
1199                 cv_signal(&ic->ic_send_cv);
1200                 pause("icl_close", 1 * hz);
1201                 ICL_CONN_LOCK(ic);
1202         }
1203         //ICL_DEBUG("send/receive threads terminated");
1204
1205         soclose(ic->ic_socket);
1206         ic->ic_socket = NULL;
1207
1208         if (ic->ic_receive_pdu != NULL) {
1209                 //ICL_DEBUG("freeing partially received PDU");
1210                 icl_pdu_free(ic->ic_receive_pdu);
1211                 ic->ic_receive_pdu = NULL;
1212         }
1213
1214         /*
1215          * Remove any outstanding PDUs from the send queue.
1216          */
1217         while (!STAILQ_EMPTY(&ic->ic_to_send)) {
1218                 pdu = STAILQ_FIRST(&ic->ic_to_send);
1219                 STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
1220                 icl_pdu_free(pdu);
1221         }
1222
1223         KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
1224             ("destroying session with non-empty send queue"));
1225 #ifdef DIAGNOSTIC
1226         KASSERT(ic->ic_outstanding_pdus == 0,
1227             ("destroying session with %d outstanding PDUs",
1228              ic->ic_outstanding_pdus));
1229 #endif
1230         ICL_CONN_UNLOCK(ic);
1231 }
1232
1233 bool
1234 icl_conn_connected(struct icl_conn *ic)
1235 {
1236         ICL_CONN_LOCK_ASSERT_NOT(ic);
1237
1238         ICL_CONN_LOCK(ic);
1239         if (ic->ic_socket == NULL) {
1240                 ICL_CONN_UNLOCK(ic);
1241                 return (false);
1242         }
1243         if (ic->ic_socket->so_error != 0) {
1244                 ICL_CONN_UNLOCK(ic);
1245                 return (false);
1246         }
1247         ICL_CONN_UNLOCK(ic);
1248         return (true);
1249 }
1250
#ifdef ICL_KERNEL_PROXY
/*
 * Attach the in-kernel socket "so" to the connection and start it.
 *
 * Returns EINVAL if "so" is not a SOCK_STREAM socket, EBUSY if the
 * connection already has a socket, or whatever icl_conn_start() returns.
 * Must be called without the connection lock held.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
        bool busy;

        ICL_CONN_LOCK_ASSERT_NOT(ic);

        if (so->so_type != SOCK_STREAM)
                return (EINVAL);

        ICL_CONN_LOCK(ic);
        busy = (ic->ic_socket != NULL);
        if (!busy)
                ic->ic_socket = so;
        ICL_CONN_UNLOCK(ic);

        if (busy)
                return (EBUSY);

        return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1275
1276 static int
1277 icl_unload(void)
1278 {
1279
1280         if (icl_ncons != 0)
1281                 return (EBUSY);
1282
1283         uma_zdestroy(icl_conn_zone);
1284         uma_zdestroy(icl_pdu_zone);
1285
1286         return (0);
1287 }
1288
1289 static void
1290 icl_load(void)
1291 {
1292
1293         icl_conn_zone = uma_zcreate("icl_conn",
1294             sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1295             UMA_ALIGN_PTR, 0);
1296         icl_pdu_zone = uma_zcreate("icl_pdu",
1297             sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1298             UMA_ALIGN_PTR, 0);
1299
1300         refcount_init(&icl_ncons, 0);
1301 }
1302
1303 static int
1304 icl_modevent(module_t mod, int what, void *arg)
1305 {
1306
1307         switch (what) {
1308         case MOD_LOAD:
1309                 icl_load();
1310                 return (0);
1311         case MOD_UNLOAD:
1312                 return (icl_unload());
1313         default:
1314                 return (EINVAL);
1315         }
1316 }
1317
1318 moduledata_t icl_data = {
1319         "icl",
1320         icl_modevent,
1321         0
1322 };
1323
1324 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1325 MODULE_VERSION(icl, 1);