]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - sys/dev/iscsi/icl.c
Copy head (r256279) to stable/10 as part of the 10.0-RELEASE cycle.
[FreeBSD/stable/10.git] / sys / dev / iscsi / icl.c
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 /*
33  * iSCSI Common Layer.  It's used by both the initiator and target to send
34  * and receive iSCSI PDUs.
35  */
36
37 #include <sys/param.h>
38 #include <sys/capability.h>
39 #include <sys/condvar.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/mbuf.h>
46 #include <sys/mutex.h>
47 #include <sys/module.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/sx.h>
53 #include <sys/uio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57
58 #include "icl.h"
59 #include "iscsi_proto.h"
60
61 SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62 static int debug = 1;
63 TUNABLE_INT("kern.icl.debug", &debug);
64 SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65     &debug, 1, "Enable debug messages");
66 static int partial_receive_len = 1 * 1024; /* XXX: More? */
67 TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69     &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70     "data segment");
71
72 static uma_zone_t icl_conn_zone;
73 static uma_zone_t icl_pdu_zone;
74
75 static volatile u_int   icl_ncons;
76
/*
 * Logging helpers.  Both prefix the message with the calling function's
 * name and append a newline.  ICL_DEBUG() prints when debug > 1,
 * ICL_WARN() when debug > 0.
 *
 * The bodies are wrapped in do { } while (0) so that each macro expands
 * to exactly one statement and can be used safely as the unbraced body
 * of an if/else.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1) {					\
			printf("%s: " X "\n",				\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/*
 * Connection lock helpers.  The argument is parenthesized so that any
 * pointer expression, not just a plain identifier, may be passed.
 */
#define	ICL_CONN_LOCK(X)		mtx_lock(&(X)->ic_lock)
#define	ICL_CONN_UNLOCK(X)		mtx_unlock(&(X)->ic_lock)
#define	ICL_CONN_LOCK_ASSERT(X)		mtx_assert(&(X)->ic_lock, MA_OWNED)
91
92 static void
93 icl_conn_fail(struct icl_conn *ic)
94 {
95         if (ic->ic_socket == NULL)
96                 return;
97
98         /*
99          * XXX
100          */
101         ic->ic_socket->so_error = EDOOFUS;
102         (ic->ic_error)(ic);
103 }
104
105 static struct mbuf *
106 icl_conn_receive(struct icl_conn *ic, size_t len)
107 {
108         struct uio uio;
109         struct socket *so;
110         struct mbuf *m;
111         int error, flags;
112
113         so = ic->ic_socket;
114
115         memset(&uio, 0, sizeof(uio));
116         uio.uio_resid = len;
117
118         flags = MSG_DONTWAIT;
119         error = soreceive(so, NULL, &uio, &m, NULL, &flags);
120         if (error != 0) {
121                 ICL_DEBUG("soreceive error %d", error);
122                 return (NULL);
123         }
124         if (uio.uio_resid != 0) {
125                 m_freem(m);
126                 ICL_DEBUG("short read");
127                 return (NULL);
128         }
129
130         return (m);
131 }
132
133 static struct icl_pdu *
134 icl_pdu_new(struct icl_conn *ic, int flags)
135 {
136         struct icl_pdu *ip;
137
138         refcount_acquire(&ic->ic_outstanding_pdus);
139         ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
140         if (ip == NULL) {
141                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
142                 refcount_release(&ic->ic_outstanding_pdus);
143                 return (NULL);
144         }
145
146         ip->ip_conn = ic;
147
148         return (ip);
149 }
150
151 void
152 icl_pdu_free(struct icl_pdu *ip)
153 {
154         struct icl_conn *ic;
155
156         ic = ip->ip_conn;
157
158         m_freem(ip->ip_bhs_mbuf);
159         m_freem(ip->ip_ahs_mbuf);
160         m_freem(ip->ip_data_mbuf);
161         uma_zfree(icl_pdu_zone, ip);
162         refcount_release(&ic->ic_outstanding_pdus);
163 }
164
165 /*
166  * Allocate icl_pdu with empty BHS to fill up by the caller.
167  */
168 struct icl_pdu *
169 icl_pdu_new_bhs(struct icl_conn *ic, int flags)
170 {
171         struct icl_pdu *ip;
172
173         ip = icl_pdu_new(ic, flags);
174         if (ip == NULL)
175                 return (NULL);
176
177         ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
178             flags, MT_DATA, M_PKTHDR);
179         if (ip->ip_bhs_mbuf == NULL) {
180                 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
181                 icl_pdu_free(ip);
182                 return (NULL);
183         }
184         ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
185         memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
186         ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
187
188         return (ip);
189 }
190
191 static int
192 icl_pdu_ahs_length(const struct icl_pdu *request)
193 {
194
195         return (request->ip_bhs->bhs_total_ahs_len * 4);
196 }
197
198 size_t
199 icl_pdu_data_segment_length(const struct icl_pdu *request)
200 {
201         uint32_t len = 0;
202
203         len += request->ip_bhs->bhs_data_segment_len[0];
204         len <<= 8;
205         len += request->ip_bhs->bhs_data_segment_len[1];
206         len <<= 8;
207         len += request->ip_bhs->bhs_data_segment_len[2];
208
209         return (len);
210 }
211
212 static void
213 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
214 {
215
216         response->ip_bhs->bhs_data_segment_len[2] = len;
217         response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
218         response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
219 }
220
221 static size_t
222 icl_pdu_padding(const struct icl_pdu *ip)
223 {
224
225         if ((ip->ip_data_len % 4) != 0)
226                 return (4 - (ip->ip_data_len % 4));
227
228         return (0);
229 }
230
231 static size_t
232 icl_pdu_size(const struct icl_pdu *response)
233 {
234         size_t len;
235
236         KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
237
238         len = sizeof(struct iscsi_bhs) + response->ip_data_len +
239             icl_pdu_padding(response);
240         if (response->ip_conn->ic_header_crc32c)
241                 len += ISCSI_HEADER_DIGEST_SIZE;
242         if (response->ip_conn->ic_data_crc32c)
243                 len += ISCSI_DATA_DIGEST_SIZE;
244
245         return (len);
246 }
247
248 static int
249 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
250 {
251         struct mbuf *m;
252
253         m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
254         if (m == NULL) {
255                 ICL_DEBUG("failed to receive BHS");
256                 return (-1);
257         }
258
259         request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
260         if (request->ip_bhs_mbuf == NULL) {
261                 ICL_WARN("m_pullup failed");
262                 return (-1);
263         }
264         request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
265
266         /*
267          * XXX: For architectures with strict alignment requirements
268          *      we may need to allocate ip_bhs and copy the data into it.
269          *      For some reason, though, not doing this doesn't seem
270          *      to cause problems; tested on sparc64.
271          */
272
273         *availablep -= sizeof(struct iscsi_bhs);
274         return (0);
275 }
276
277 static int
278 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
279 {
280
281         request->ip_ahs_len = icl_pdu_ahs_length(request);
282         if (request->ip_ahs_len == 0)
283                 return (0);
284
285         request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
286             request->ip_ahs_len);
287         if (request->ip_ahs_mbuf == NULL) {
288                 ICL_DEBUG("failed to receive AHS");
289                 return (-1);
290         }
291
292         *availablep -= request->ip_ahs_len;
293         return (0);
294 }
295
296 static uint32_t
297 icl_mbuf_to_crc32c(const struct mbuf *m0)
298 {
299         uint32_t digest = 0xffffffff;
300         const struct mbuf *m;
301
302         for (m = m0; m != NULL; m = m->m_next)
303                 digest = calculate_crc32c(digest,
304                     mtod(m, const void *), m->m_len);
305
306         digest = digest ^ 0xffffffff;
307
308         return (digest);
309 }
310
311 static int
312 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
313 {
314         struct mbuf *m;
315         uint32_t received_digest, valid_digest;
316
317         if (request->ip_conn->ic_header_crc32c == false)
318                 return (0);
319
320         m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
321         if (m == NULL) {
322                 ICL_DEBUG("failed to receive header digest");
323                 return (-1);
324         }
325
326         CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
327         memcpy(&received_digest, mtod(m, void *), ISCSI_HEADER_DIGEST_SIZE);
328         m_freem(m);
329
330         *availablep -= ISCSI_HEADER_DIGEST_SIZE;
331
332         /*
333          * XXX: Handle AHS.
334          */
335         valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
336         if (received_digest != valid_digest) {
337                 ICL_WARN("header digest check failed; got 0x%x, "
338                     "should be 0x%x", received_digest, valid_digest);
339                 return (-1);
340         }
341
342         return (0);
343 }
344
345 /*
346  * Return the number of bytes that should be waiting in the receive socket
347  * before icl_pdu_receive_data_segment() gets called.
348  */
349 static size_t
350 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
351 {
352         size_t len;
353
354         len = icl_pdu_data_segment_length(request);
355         if (len == 0)
356                 return (0);
357
358         /*
359          * Account for the parts of data segment already read from
360          * the socket buffer.
361          */
362         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
363         len -= request->ip_data_len;
364
365         /*
366          * Don't always wait for the full data segment to be delivered
367          * to the socket; this might badly affect performance due to
368          * TCP window scaling.
369          */
370         if (len > partial_receive_len) {
371 #if 0
372                 ICL_DEBUG("need %zd bytes of data, limiting to %zd",
373                     len, partial_receive_len));
374 #endif
375                 len = partial_receive_len;
376
377                 return (len);
378         }
379
380         /*
381          * Account for padding.  Note that due to the way code is written,
382          * the icl_pdu_receive_data_segment() must always receive padding
383          * along with the last part of data segment, because it would be
384          * impossible to tell whether we've already received the full data
385          * segment including padding, or without it.
386          */
387         if ((len % 4) != 0)
388                 len += 4 - (len % 4);
389
390 #if 0
391         ICL_DEBUG("need %zd bytes of data", len));
392 #endif
393
394         return (len);
395 }
396
397 static int
398 icl_pdu_receive_data_segment(struct icl_pdu *request,
399     size_t *availablep, bool *more_neededp)
400 {
401         struct icl_conn *ic;
402         size_t len, padding = 0;
403         struct mbuf *m;
404
405         ic = request->ip_conn;
406
407         *more_neededp = false;
408         ic->ic_receive_len = 0;
409
410         len = icl_pdu_data_segment_length(request);
411         if (len == 0)
412                 return (0);
413
414         if ((len % 4) != 0)
415                 padding = 4 - (len % 4);
416
417         /*
418          * Account for already received parts of data segment.
419          */
420         KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
421         len -= request->ip_data_len;
422
423         if (len + padding > *availablep) {
424                 /*
425                  * Not enough data in the socket buffer.  Receive as much
426                  * as we can.  Don't receive padding, since, obviously, it's
427                  * not the end of data segment yet.
428                  */
429 #if 0
430                 ICL_DEBUG("limited from %zd to %zd",
431                     len + padding, *availablep - padding));
432 #endif
433                 len = *availablep - padding;
434                 *more_neededp = true;
435                 padding = 0;
436         }
437
438         /*
439          * Must not try to receive padding without at least one byte
440          * of actual data segment.
441          */
442         if (len > 0) {
443                 m = icl_conn_receive(request->ip_conn, len + padding);
444                 if (m == NULL) {
445                         ICL_DEBUG("failed to receive data segment");
446                         return (-1);
447                 }
448
449                 if (request->ip_data_mbuf == NULL)
450                         request->ip_data_mbuf = m;
451                 else
452                         m_cat(request->ip_data_mbuf, m);
453
454                 request->ip_data_len += len;
455                 *availablep -= len + padding;
456         } else
457                 ICL_DEBUG("len 0");
458
459         if (*more_neededp)
460                 ic->ic_receive_len =
461                     icl_pdu_data_segment_receive_len(request);
462
463         return (0);
464 }
465
466 static int
467 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
468 {
469         struct mbuf *m;
470         uint32_t received_digest, valid_digest;
471
472         if (request->ip_conn->ic_data_crc32c == false)
473                 return (0);
474
475         if (request->ip_data_len == 0)
476                 return (0);
477
478         m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
479         if (m == NULL) {
480                 ICL_DEBUG("failed to receive data digest");
481                 return (-1);
482         }
483
484         CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
485         memcpy(&received_digest, mtod(m, void *), ISCSI_DATA_DIGEST_SIZE);
486         m_freem(m);
487
488         *availablep -= ISCSI_DATA_DIGEST_SIZE;
489
490         /*
491          * Note that ip_data_mbuf also contains padding; since digest
492          * calculation is supposed to include that, we iterate over
493          * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
494          */
495         valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
496         if (received_digest != valid_digest) {
497                 ICL_WARN("data digest check failed; got 0x%x, "
498                     "should be 0x%x", received_digest, valid_digest);
499                 return (-1);
500         }
501
502         return (0);
503 }
504
505 /*
506  * Somewhat contrary to the name, this attempts to receive only one
507  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
508  */
509 static struct icl_pdu *
510 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
511 {
512         struct icl_pdu *request;
513         struct socket *so;
514         size_t len;
515         int error;
516         bool more_needed;
517
518         so = ic->ic_socket;
519
520         if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
521                 KASSERT(ic->ic_receive_pdu == NULL,
522                     ("ic->ic_receive_pdu != NULL"));
523                 request = icl_pdu_new(ic, M_NOWAIT);
524                 if (request == NULL) {
525                         ICL_DEBUG("failed to allocate PDU; "
526                             "dropping connection");
527                         icl_conn_fail(ic);
528                         return (NULL);
529                 }
530                 ic->ic_receive_pdu = request;
531         } else {
532                 KASSERT(ic->ic_receive_pdu != NULL,
533                     ("ic->ic_receive_pdu == NULL"));
534                 request = ic->ic_receive_pdu;
535         }
536
537         if (*availablep < ic->ic_receive_len) {
538 #if 0
539                 ICL_DEBUG("not enough data; need %zd, "
540                     "have %zd", ic->ic_receive_len, *availablep);
541 #endif
542                 return (NULL);
543         }
544
545         switch (ic->ic_receive_state) {
546         case ICL_CONN_STATE_BHS:
547                 //ICL_DEBUG("receiving BHS");
548                 error = icl_pdu_receive_bhs(request, availablep);
549                 if (error != 0) {
550                         ICL_DEBUG("failed to receive BHS; "
551                             "dropping connection");
552                         break;
553                 }
554
555                 /*
556                  * We don't enforce any limit for AHS length;
557                  * its length is stored in 8 bit field.
558                  */
559
560                 len = icl_pdu_data_segment_length(request);
561                 if (len > ic->ic_max_data_segment_length) {
562                         ICL_WARN("received data segment "
563                             "length %zd is larger than negotiated "
564                             "MaxDataSegmentLength %zd; "
565                             "dropping connection",
566                             len, ic->ic_max_data_segment_length);
567                         error = EINVAL;
568                         break;
569                 }
570
571                 ic->ic_receive_state = ICL_CONN_STATE_AHS;
572                 ic->ic_receive_len = icl_pdu_ahs_length(request);
573                 break;
574
575         case ICL_CONN_STATE_AHS:
576                 //ICL_DEBUG("receiving AHS");
577                 error = icl_pdu_receive_ahs(request, availablep);
578                 if (error != 0) {
579                         ICL_DEBUG("failed to receive AHS; "
580                             "dropping connection");
581                         break;
582                 }
583                 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
584                 if (ic->ic_header_crc32c == false)
585                         ic->ic_receive_len = 0;
586                 else
587                         ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
588                 break;
589
590         case ICL_CONN_STATE_HEADER_DIGEST:
591                 //ICL_DEBUG("receiving header digest");
592                 error = icl_pdu_check_header_digest(request, availablep);
593                 if (error != 0) {
594                         ICL_DEBUG("header digest failed; "
595                             "dropping connection");
596                         break;
597                 }
598
599                 ic->ic_receive_state = ICL_CONN_STATE_DATA;
600                 ic->ic_receive_len =
601                     icl_pdu_data_segment_receive_len(request);
602                 break;
603
604         case ICL_CONN_STATE_DATA:
605                 //ICL_DEBUG("receiving data segment");
606                 error = icl_pdu_receive_data_segment(request, availablep,
607                     &more_needed);
608                 if (error != 0) {
609                         ICL_DEBUG("failed to receive data segment;"
610                             "dropping connection");
611                         break;
612                 }
613
614                 if (more_needed)
615                         break;
616
617                 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
618                 if (ic->ic_data_crc32c == false)
619                         ic->ic_receive_len = 0;
620                 else
621                         ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
622                 break;
623
624         case ICL_CONN_STATE_DATA_DIGEST:
625                 //ICL_DEBUG("receiving data digest");
626                 error = icl_pdu_check_data_digest(request, availablep);
627                 if (error != 0) {
628                         ICL_DEBUG("data digest failed; "
629                             "dropping connection");
630                         break;
631                 }
632
633                 /*
634                  * We've received complete PDU; reset the receive state machine
635                  * and return the PDU.
636                  */
637                 ic->ic_receive_state = ICL_CONN_STATE_BHS;
638                 ic->ic_receive_len = sizeof(struct iscsi_bhs);
639                 ic->ic_receive_pdu = NULL;
640                 return (request);
641
642         default:
643                 panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
644         }
645
646         if (error != 0) {
647                 icl_pdu_free(request);
648                 icl_conn_fail(ic);
649         }
650
651         return (NULL);
652 }
653
654 static void
655 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
656 {
657         struct icl_pdu *response;
658         struct socket *so;
659
660         so = ic->ic_socket;
661
662         /*
663          * This can never happen; we're careful to only mess with ic->ic_socket
664          * pointer when the send/receive threads are not running.
665          */
666         KASSERT(so != NULL, ("NULL socket"));
667
668         for (;;) {
669                 if (ic->ic_disconnecting)
670                         return;
671
672                 if (so->so_error != 0) {
673                         ICL_DEBUG("connection error %d; "
674                             "dropping connection", so->so_error);
675                         icl_conn_fail(ic);
676                         return;
677                 }
678
679                 /*
680                  * Loop until we have a complete PDU or there is not enough
681                  * data in the socket buffer.
682                  */
683                 if (available < ic->ic_receive_len) {
684 #if 0
685                         ICL_DEBUG("not enough data; have %zd, "
686                             "need %zd", available,
687                             ic->ic_receive_len);
688 #endif
689                         return;
690                 }
691
692                 response = icl_conn_receive_pdu(ic, &available);
693                 if (response == NULL)
694                         continue;
695
696                 if (response->ip_ahs_len > 0) {
697                         ICL_WARN("received PDU with unsupported "
698                             "AHS; opcode 0x%x; dropping connection",
699                             response->ip_bhs->bhs_opcode);
700                         icl_pdu_free(response);
701                         icl_conn_fail(ic);
702                         return;
703                 }
704
705                 (ic->ic_receive)(response);
706         }
707 }
708
709 static void
710 icl_receive_thread(void *arg)
711 {
712         struct icl_conn *ic;
713         size_t available;
714         struct socket *so;
715
716         ic = arg;
717         so = ic->ic_socket;
718
719         ICL_CONN_LOCK(ic);
720         ic->ic_receive_running = true;
721         ICL_CONN_UNLOCK(ic);
722
723         for (;;) {
724                 if (ic->ic_disconnecting) {
725                         //ICL_DEBUG("terminating");
726                         ICL_CONN_LOCK(ic);
727                         ic->ic_receive_running = false;
728                         ICL_CONN_UNLOCK(ic);
729                         kthread_exit();
730                         return;
731                 }
732
733                 SOCKBUF_LOCK(&so->so_rcv);
734                 available = so->so_rcv.sb_cc;
735                 if (available < ic->ic_receive_len) {
736                         so->so_rcv.sb_lowat = ic->ic_receive_len;
737                         cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
738                 }
739                 SOCKBUF_UNLOCK(&so->so_rcv);
740
741                 icl_conn_receive_pdus(ic, available);
742         }
743 }
744
745 static int
746 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
747 {
748         struct icl_conn *ic;
749
750         ic = arg;
751         cv_signal(&ic->ic_receive_cv);
752         return (SU_OK);
753 }
754
755 static int
756 icl_pdu_send(struct icl_pdu *request)
757 {
758         size_t padding, pdu_len;
759         uint32_t digest, zero = 0;
760         int error, ok;
761         struct socket *so;
762         struct icl_conn *ic;
763
764         ic = request->ip_conn;
765         so = request->ip_conn->ic_socket;
766
767         ICL_CONN_LOCK_ASSERT(ic);
768
769         icl_pdu_set_data_segment_length(request, request->ip_data_len);
770
771         pdu_len = icl_pdu_size(request);
772
773         if (ic->ic_header_crc32c) {
774                 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
775                 ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
776                     (void *)&digest);
777                 if (ok != 1) {
778                         ICL_WARN("failed to append header digest");
779                         return (1);
780                 }
781         }
782
783         if (request->ip_data_len != 0) {
784                 padding = icl_pdu_padding(request);
785                 if (padding > 0) {
786                         ok = m_append(request->ip_data_mbuf, padding,
787                             (void *)&zero);
788                         if (ok != 1) {
789                                 ICL_WARN("failed to append padding");
790                                 return (1);
791                         }
792                 }
793
794                 if (ic->ic_data_crc32c) {
795                         digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
796
797                         ok = m_append(request->ip_data_mbuf, sizeof(digest),
798                             (void *)&digest);
799                         if (ok != 1) {
800                                 ICL_WARN("failed to append header digest");
801                                 return (1);
802                         }
803                 }
804
805                 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
806                 request->ip_data_mbuf = NULL;
807         }
808
809         request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
810
811         error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
812             NULL, MSG_DONTWAIT, curthread);
813         request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
814         if (error != 0) {
815                 ICL_DEBUG("sosend error %d", error);
816                 return (error);
817         }
818
819         return (0);
820 }
821
822 static void
823 icl_conn_send_pdus(struct icl_conn *ic)
824 {
825         struct icl_pdu *request;
826         struct socket *so;
827         size_t available, size;
828         int error;
829
830         ICL_CONN_LOCK_ASSERT(ic);
831
832         so = ic->ic_socket;
833
834         SOCKBUF_LOCK(&so->so_snd);
835         available = sbspace(&so->so_snd);
836         SOCKBUF_UNLOCK(&so->so_snd);
837
838         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
839                 if (ic->ic_disconnecting)
840                         return;
841
842                 request = TAILQ_FIRST(&ic->ic_to_send);
843                 size = icl_pdu_size(request);
844                 if (available < size) {
845                         /*
846                          * Set the low watermark on the socket,
847                          * to avoid waking up until there is enough
848                          * space.
849                          */
850                         SOCKBUF_LOCK(&so->so_snd);
851                         so->so_snd.sb_lowat = size;
852                         SOCKBUF_UNLOCK(&so->so_snd);
853 #if 1
854                         ICL_DEBUG("no space to send; "
855                             "have %zd, need %zd",
856                             available, size);
857 #endif
858                         return;
859                 }
860                 available -= size;
861                 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
862                 error = icl_pdu_send(request);
863                 if (error != 0) {
864                         ICL_DEBUG("failed to send PDU; "
865                             "dropping connection");
866                         icl_conn_fail(ic);
867                         return;
868                 } 
869                 icl_pdu_free(request);
870         }
871 }
872
873 static void
874 icl_send_thread(void *arg)
875 {
876         struct icl_conn *ic;
877
878         ic = arg;
879
880         ICL_CONN_LOCK(ic);
881         ic->ic_send_running = true;
882         ICL_CONN_UNLOCK(ic);
883
884         for (;;) {
885                 ICL_CONN_LOCK(ic);
886                 if (ic->ic_disconnecting) {
887                         //ICL_DEBUG("terminating");
888                         ic->ic_send_running = false;
889                         ICL_CONN_UNLOCK(ic);
890                         kthread_exit();
891                         return;
892                 }
893                 if (TAILQ_EMPTY(&ic->ic_to_send))
894                         cv_wait(&ic->ic_send_cv, &ic->ic_lock);
895                 icl_conn_send_pdus(ic);
896                 ICL_CONN_UNLOCK(ic);
897         }
898 }
899
900 static int
901 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
902 {
903         struct icl_conn *ic;
904
905         ic = arg;
906         cv_signal(&ic->ic_send_cv);
907         return (SU_OK);
908 }
909
910 int
911 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
912 {
913         struct mbuf *mb, *newmb;
914         size_t copylen, off = 0;
915
916         KASSERT(len > 0, ("len == 0"));
917
918         newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
919         if (newmb == NULL) {
920                 ICL_WARN("failed to allocate mbuf for %zd bytes", len);
921                 return (ENOMEM);
922         }
923
924         for (mb = newmb; mb != NULL; mb = mb->m_next) {
925                 copylen = min(M_TRAILINGSPACE(mb), len - off);
926                 memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
927                 mb->m_len = copylen;
928                 off += copylen;
929         }
930         KASSERT(off == len, ("%s: off != len", __func__));
931
932         if (request->ip_data_mbuf == NULL) {
933                 request->ip_data_mbuf = newmb;
934                 request->ip_data_len = len;
935         } else {
936                 m_cat(request->ip_data_mbuf, newmb);
937                 request->ip_data_len += len;
938         }
939
940         return (0);
941 }
942
943 void
944 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
945 {
946
947         m_copydata(ip->ip_data_mbuf, off, len, addr);
948 }
949
950 void
951 icl_pdu_queue(struct icl_pdu *ip)
952 {
953         struct icl_conn *ic;
954
955         ic = ip->ip_conn;
956
957         ICL_CONN_LOCK(ic);
958         if (ic->ic_disconnecting || ic->ic_socket == NULL) {
959                 ICL_DEBUG("icl_pdu_queue on closed connection");
960                 ICL_CONN_UNLOCK(ic);
961                 icl_pdu_free(ip);
962                 return;
963         }
964         TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
965         ICL_CONN_UNLOCK(ic);
966         cv_signal(&ic->ic_send_cv);
967 }
968
969 struct icl_conn *
970 icl_conn_new(void)
971 {
972         struct icl_conn *ic;
973
974         refcount_acquire(&icl_ncons);
975
976         ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
977
978         TAILQ_INIT(&ic->ic_to_send);
979         mtx_init(&ic->ic_lock, "icl_lock", NULL, MTX_DEF);
980         cv_init(&ic->ic_send_cv, "icl_tx");
981         cv_init(&ic->ic_receive_cv, "icl_rx");
982         refcount_init(&ic->ic_outstanding_pdus, 0);
983         ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
984
985         return (ic);
986 }
987
988 void
989 icl_conn_free(struct icl_conn *ic)
990 {
991
992         mtx_destroy(&ic->ic_lock);
993         cv_destroy(&ic->ic_send_cv);
994         cv_destroy(&ic->ic_receive_cv);
995         uma_zfree(icl_conn_zone, ic);
996         refcount_release(&icl_ncons);
997 }
998
999 static int
1000 icl_conn_start(struct icl_conn *ic)
1001 {
1002         size_t bufsize;
1003         struct sockopt opt;
1004         int error, one = 1;
1005
1006         ICL_CONN_LOCK(ic);
1007
1008         /*
1009          * XXX: Ugly hack.
1010          */
1011         if (ic->ic_socket == NULL) {
1012                 ICL_CONN_UNLOCK(ic);
1013                 return (EINVAL);
1014         }
1015
1016         ic->ic_receive_state = ICL_CONN_STATE_BHS;
1017         ic->ic_receive_len = sizeof(struct iscsi_bhs);
1018         ic->ic_disconnecting = false;
1019
1020         ICL_CONN_UNLOCK(ic);
1021
1022         /*
1023          * Use max available sockbuf size for sending.  Do it manually
1024          * instead of sbreserve(9) to work around resource limits.
1025          *
1026          * XXX: This kind of sucks.  On one hand, we don't currently support
1027          *      sending a part of data segment; we always do it in one piece,
1028          *      so we have to make sure it can fit in the socket buffer.
1029          *      Once I've implemented partial send, we'll get rid of this
1030          *      and use autoscaling.
1031          */
1032         bufsize = (sizeof(struct iscsi_bhs) +
1033             ic->ic_max_data_segment_length) * 8;
1034         error = soreserve(ic->ic_socket, bufsize, bufsize);
1035         if (error != 0) {
1036                 ICL_WARN("soreserve failed with error %d", error);
1037                 icl_conn_close(ic);
1038                 return (error);
1039         }
1040
1041         /*
1042          * Disable Nagle.
1043          */
1044         bzero(&opt, sizeof(opt));
1045         opt.sopt_dir = SOPT_SET;
1046         opt.sopt_level = IPPROTO_TCP;
1047         opt.sopt_name = TCP_NODELAY;
1048         opt.sopt_val = &one;
1049         opt.sopt_valsize = sizeof(one);
1050         error = sosetopt(ic->ic_socket, &opt);
1051         if (error != 0) {
1052                 ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1053                 icl_conn_close(ic);
1054                 return (error);
1055         }
1056
1057         /*
1058          * Start threads.
1059          */
1060         error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1061         if (error != 0) {
1062                 ICL_WARN("kthread_add(9) failed with error %d", error);
1063                 icl_conn_close(ic);
1064                 return (error);
1065         }
1066
1067         error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1068         if (error != 0) {
1069                 ICL_WARN("kthread_add(9) failed with error %d", error);
1070                 icl_conn_close(ic);
1071                 return (error);
1072         }
1073
1074         /*
1075          * Register socket upcall, to get notified about incoming PDUs
1076          * and free space to send outgoing ones.
1077          */
1078         SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1079         soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1080         SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1081         SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1082         soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1083         SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1084
1085         return (0);
1086 }
1087
1088 int
1089 icl_conn_handoff(struct icl_conn *ic, int fd)
1090 {
1091         struct file *fp;
1092         struct socket *so;
1093         cap_rights_t rights;
1094         int error;
1095
1096         /*
1097          * Steal the socket from userland.
1098          */
1099         error = fget(curthread, fd,
1100             cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1101         if (error != 0)
1102                 return (error);
1103         if (fp->f_type != DTYPE_SOCKET) {
1104                 fdrop(fp, curthread);
1105                 return (EINVAL);
1106         }
1107         so = fp->f_data;
1108         if (so->so_type != SOCK_STREAM) {
1109                 fdrop(fp, curthread);
1110                 return (EINVAL);
1111         }
1112
1113         ICL_CONN_LOCK(ic);
1114
1115         if (ic->ic_socket != NULL) {
1116                 ICL_CONN_UNLOCK(ic);
1117                 fdrop(fp, curthread);
1118                 return (EBUSY);
1119         }
1120
1121         ic->ic_socket = fp->f_data;
1122         fp->f_ops = &badfileops;
1123         fp->f_data = NULL;
1124         fdrop(fp, curthread);
1125         ICL_CONN_UNLOCK(ic);
1126
1127         error = icl_conn_start(ic);
1128
1129         return (error);
1130 }
1131
1132 void
1133 icl_conn_shutdown(struct icl_conn *ic)
1134 {
1135
1136         ICL_CONN_LOCK(ic);
1137         if (ic->ic_socket == NULL) {
1138                 ICL_CONN_UNLOCK(ic);
1139                 return;
1140         }
1141         ICL_CONN_UNLOCK(ic);
1142
1143         soshutdown(ic->ic_socket, SHUT_RDWR);
1144 }
1145
1146 void
1147 icl_conn_close(struct icl_conn *ic)
1148 {
1149         struct icl_pdu *pdu;
1150
1151         ICL_CONN_LOCK(ic);
1152         if (ic->ic_socket == NULL) {
1153                 ICL_CONN_UNLOCK(ic);
1154                 return;
1155         }
1156
1157         ic->ic_disconnecting = true;
1158
1159         /*
1160          * Wake up the threads, so they can properly terminate.
1161          */
1162         cv_signal(&ic->ic_receive_cv);
1163         cv_signal(&ic->ic_send_cv);
1164         while (ic->ic_receive_running || ic->ic_send_running) {
1165                 //ICL_DEBUG("waiting for send/receive threads to terminate");
1166                 ICL_CONN_UNLOCK(ic);
1167                 cv_signal(&ic->ic_receive_cv);
1168                 cv_signal(&ic->ic_send_cv);
1169                 pause("icl_close", 1 * hz);
1170                 ICL_CONN_LOCK(ic);
1171         }
1172         //ICL_DEBUG("send/receive threads terminated");
1173
1174         soclose(ic->ic_socket);
1175         ic->ic_socket = NULL;
1176
1177         if (ic->ic_receive_pdu != NULL) {
1178                 //ICL_DEBUG("freeing partially received PDU");
1179                 icl_pdu_free(ic->ic_receive_pdu);
1180                 ic->ic_receive_pdu = NULL;
1181         }
1182
1183         /*
1184          * Remove any outstanding PDUs from the send queue.
1185          */
1186         while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1187                 pdu = TAILQ_FIRST(&ic->ic_to_send);
1188                 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1189                 icl_pdu_free(pdu);
1190         }
1191
1192         KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1193             ("destroying session with non-empty send queue"));
1194         /*
1195          * XXX
1196          */
1197 #if 0
1198         KASSERT(ic->ic_outstanding_pdus == 0,
1199             ("destroying session with %d outstanding PDUs",
1200              ic->ic_outstanding_pdus));
1201 #endif
1202         ICL_CONN_UNLOCK(ic);
1203 }
1204
1205 bool
1206 icl_conn_connected(struct icl_conn *ic)
1207 {
1208
1209         ICL_CONN_LOCK(ic);
1210         if (ic->ic_socket == NULL) {
1211                 ICL_CONN_UNLOCK(ic);
1212                 return (false);
1213         }
1214         if (ic->ic_socket->so_error != 0) {
1215                 ICL_CONN_UNLOCK(ic);
1216                 return (false);
1217         }
1218         ICL_CONN_UNLOCK(ic);
1219         return (true);
1220 }
1221
#ifdef ICL_KERNEL_PROXY
/*
 * Attach an in-kernel socket to the connection and start it.
 *
 * Returns EINVAL if `so' is not a SOCK_STREAM socket, EBUSY if the
 * connection already has a socket, and otherwise the result of
 * icl_conn_start().
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
        bool busy;

        if (so->so_type != SOCK_STREAM)
                return (EINVAL);

        ICL_CONN_LOCK(ic);
        busy = (ic->ic_socket != NULL);
        if (!busy)
                ic->ic_socket = so;
        ICL_CONN_UNLOCK(ic);

        if (busy)
                return (EBUSY);

        return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1244
1245 static int
1246 icl_unload(void)
1247 {
1248
1249         if (icl_ncons != 0)
1250                 return (EBUSY);
1251
1252         uma_zdestroy(icl_conn_zone);
1253         uma_zdestroy(icl_pdu_zone);
1254
1255         return (0);
1256 }
1257
1258 static void
1259 icl_load(void)
1260 {
1261
1262         icl_conn_zone = uma_zcreate("icl_conn",
1263             sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1264             UMA_ALIGN_PTR, 0);
1265         icl_pdu_zone = uma_zcreate("icl_pdu",
1266             sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1267             UMA_ALIGN_PTR, 0);
1268
1269         refcount_init(&icl_ncons, 0);
1270 }
1271
1272 static int
1273 icl_modevent(module_t mod, int what, void *arg)
1274 {
1275
1276         switch (what) {
1277         case MOD_LOAD:
1278                 icl_load();
1279                 return (0);
1280         case MOD_UNLOAD:
1281                 return (icl_unload());
1282         default:
1283                 return (EINVAL);
1284         }
1285 }
1286
1287 moduledata_t icl_data = {
1288         "icl",
1289         icl_modevent,
1290         0
1291 };
1292
1293 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1294 MODULE_VERSION(icl, 1);