/*
 * sys/cam/ctl/ctl_backend_block.c
 * MFC r363979: Add CTL support for REPORT IDENTIFYING INFORMATION command.
 */
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2003 Silicon Graphics International Corp.
5  * Copyright (c) 2009-2011 Spectra Logic Corporation
6  * Copyright (c) 2012 The FreeBSD Foundation
7  * Copyright (c) 2014-2015 Alexander Motin <mav@FreeBSD.org>
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Edward Tomasz Napierala
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions, and the following disclaimer,
18  *    without modification.
19  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
20  *    substantially similar to the "NO WARRANTY" disclaimer below
21  *    ("Disclaimer") and any redistribution must be conditioned upon
22  *    including a substantially similar Disclaimer requirement for further
23  *    binary redistribution.
24  *
25  * NO WARRANTY
26  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
29  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
34  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
35  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGES.
37  *
38  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
39  */
40 /*
41  * CAM Target Layer driver backend for block devices.
42  *
43  * Author: Ken Merry <ken@FreeBSD.org>
44  */
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kernel.h>
51 #include <sys/types.h>
52 #include <sys/kthread.h>
53 #include <sys/bio.h>
54 #include <sys/fcntl.h>
55 #include <sys/limits.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/condvar.h>
59 #include <sys/malloc.h>
60 #include <sys/conf.h>
61 #include <sys/ioccom.h>
62 #include <sys/queue.h>
63 #include <sys/sbuf.h>
64 #include <sys/endian.h>
65 #include <sys/uio.h>
66 #include <sys/buf.h>
67 #include <sys/taskqueue.h>
68 #include <sys/vnode.h>
69 #include <sys/namei.h>
70 #include <sys/mount.h>
71 #include <sys/disk.h>
72 #include <sys/fcntl.h>
73 #include <sys/filedesc.h>
74 #include <sys/filio.h>
75 #include <sys/proc.h>
76 #include <sys/pcpu.h>
77 #include <sys/module.h>
78 #include <sys/sdt.h>
79 #include <sys/devicestat.h>
80 #include <sys/sysctl.h>
81 #include <sys/nv.h>
82 #include <sys/dnv.h>
83 #include <sys/sx.h>
84
85 #include <geom/geom.h>
86
87 #include <cam/cam.h>
88 #include <cam/scsi/scsi_all.h>
89 #include <cam/scsi/scsi_da.h>
90 #include <cam/ctl/ctl_io.h>
91 #include <cam/ctl/ctl.h>
92 #include <cam/ctl/ctl_backend.h>
93 #include <cam/ctl/ctl_ioctl.h>
94 #include <cam/ctl/ctl_ha.h>
95 #include <cam/ctl/ctl_scsi_all.h>
96 #include <cam/ctl/ctl_private.h>
97 #include <cam/ctl/ctl_error.h>
98
99 /*
100  * The idea here is that we'll allocate enough S/G space to hold a 1MB
101  * I/O.  If we get an I/O larger than that, we'll split it.
102  */
103 #define CTLBLK_HALF_IO_SIZE     (512 * 1024)
104 #define CTLBLK_MAX_IO_SIZE      (CTLBLK_HALF_IO_SIZE * 2)
105 #define CTLBLK_MAX_SEG          MAXPHYS
106 #define CTLBLK_HALF_SEGS        MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
107 #define CTLBLK_MAX_SEGS         (CTLBLK_HALF_SEGS * 2)
108
109 #ifdef CTLBLK_DEBUG
110 #define DPRINTF(fmt, args...) \
111     printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
112 #else
113 #define DPRINTF(fmt, args...) do {} while(0)
114 #endif
115
116 #define PRIV(io)        \
117     ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
118 #define ARGS(io)        \
119     ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])
120
121 SDT_PROVIDER_DEFINE(cbb);
122
123 typedef enum {
124         CTL_BE_BLOCK_LUN_UNCONFIGURED   = 0x01,
125         CTL_BE_BLOCK_LUN_WAITING        = 0x04,
126 } ctl_be_block_lun_flags;
127
128 typedef enum {
129         CTL_BE_BLOCK_NONE,
130         CTL_BE_BLOCK_DEV,
131         CTL_BE_BLOCK_FILE
132 } ctl_be_block_type;
133
134 struct ctl_be_block_filedata {
135         struct ucred *cred;
136 };
137
138 union ctl_be_block_bedata {
139         struct ctl_be_block_filedata file;
140 };
141
142 struct ctl_be_block_io;
143 struct ctl_be_block_lun;
144
145 typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
146                                struct ctl_be_block_io *beio);
147 typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
148                                   const char *attrname);
149
/*
 * Backend LUN structure.  There is a 1:1 mapping between a block device
 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
 */
struct ctl_be_block_lun {
	struct ctl_be_lun cbe_lun;		/* Must be first element. */
	struct ctl_lun_create_params params;	/* Creation-time parameters. */
	char *dev_path;				/* Path of backing file/device. */
	ctl_be_block_type dev_type;		/* NONE, DEV or FILE backing. */
	struct vnode *vn;			/* Vnode of the backing store. */
	union ctl_be_block_bedata backend;	/* Backing-type-specific state. */
	cbb_dispatch_t dispatch;		/* Data (read/write) I/O handler. */
	cbb_dispatch_t lun_flush;		/* Cache-flush handler. */
	cbb_dispatch_t unmap;			/* UNMAP/delete handler. */
	cbb_dispatch_t get_lba_status;		/* GET LBA STATUS handler. */
	cbb_getattr_t getattr;			/* Attribute query handler. */
	uint64_t size_blocks;			/* LUN size in blocks. */
	uint64_t size_bytes;			/* LUN size in bytes. */
	struct ctl_be_block_softc *softc;	/* Back pointer to module softc. */
	struct devstat *disk_stats;		/* devstat(9) statistics handle. */
	ctl_be_block_lun_flags flags;		/* UNCONFIGURED/WAITING state. */
	SLIST_ENTRY(ctl_be_block_lun) links;	/* Entry in softc's lun_list. */
	struct taskqueue *io_taskqueue;		/* Per-LUN worker task queue. */
	struct task io_task;			/* Task that drains the queues. */
	int num_threads;			/* Worker threads for this LUN. */
	STAILQ_HEAD(, ctl_io_hdr) input_queue;		/* New data I/Os. */
	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;	/* Config reads. */
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;	/* Config writes. */
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;	/* Writes w/ DMA done. */
	struct mtx_padalign io_lock;	/* Protects devstat / bio counters. */
	struct mtx_padalign queue_lock;	/* Protects the four queues above. */
};
182
183 /*
184  * Overall softc structure for the block backend module.
185  */
186 struct ctl_be_block_softc {
187         struct sx                        modify_lock;
188         struct mtx                       lock;
189         int                              num_luns;
190         SLIST_HEAD(, ctl_be_block_lun)   lun_list;
191         uma_zone_t                       beio_zone;
192         uma_zone_t                       buf_zone;
193 };
194
195 static struct ctl_be_block_softc backend_block_softc;
196
/*
 * Per-I/O information.  One of these tracks each CTL data or config I/O
 * while the backend processes it.
 */
struct ctl_be_block_io {
	union ctl_io			*io;	/* Associated CTL I/O. */
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS]; /* Data bufs. */
	struct iovec			xiovecs[CTLBLK_MAX_SEGS]; /* iovec view. */
	int				bio_cmd; /* BIO_READ/WRITE/FLUSH/DELETE. */
	int				two_sglists; /* Compare: two half lists. */
	int				num_segs; /* Valid sg_segs entries. */
	int				num_bios_sent; /* bios dispatched. */
	int				num_bios_done; /* bios completed. */
	int				send_complete; /* All bios dispatched. */
	int				first_error; /* Lowest-offset errno seen. */
	uint64_t			first_error_offset; /* Its byte offset. */
	struct bintime			ds_t0;	/* devstat transaction start. */
	devstat_tag_type		ds_tag_type;
	devstat_trans_flags		ds_trans_type;
	uint64_t			io_len;	/* Transfer length in bytes. */
	uint64_t			io_offset; /* Byte offset on media. */
	int				io_arg;	/* Extra arg (e.g. flush mode). */
	struct ctl_be_block_softc	*softc;
	struct ctl_be_block_lun		*lun;
	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
};
222
223 extern struct ctl_softc *control_softc;
224
225 static int cbb_num_threads = 14;
226 SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
227             "CAM Target Layer Block Backend");
228 SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN,
229            &cbb_num_threads, 0, "Number of threads per backing file");
230
231 static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
232 static void ctl_free_beio(struct ctl_be_block_io *beio);
233 static void ctl_complete_beio(struct ctl_be_block_io *beio);
234 static int ctl_be_block_move_done(union ctl_io *io);
235 static void ctl_be_block_biodone(struct bio *bio);
236 static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
237                                     struct ctl_be_block_io *beio);
238 static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
239                                        struct ctl_be_block_io *beio);
240 static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
241                                   struct ctl_be_block_io *beio);
242 static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
243                                          const char *attrname);
244 static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
245                                    struct ctl_be_block_io *beio);
246 static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
247                                    struct ctl_be_block_io *beio);
248 static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
249                                       struct ctl_be_block_io *beio);
250 static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
251                                          const char *attrname);
252 static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
253                                     union ctl_io *io);
254 static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
255                                     union ctl_io *io);
256 static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
257                                   union ctl_io *io);
258 static void ctl_be_block_worker(void *context, int pending);
259 static int ctl_be_block_submit(union ctl_io *io);
260 static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
261                                    int flag, struct thread *td);
262 static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
263                                   struct ctl_lun_req *req);
264 static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
265                                  struct ctl_lun_req *req);
266 static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
267 static int ctl_be_block_open(struct ctl_be_block_lun *be_lun,
268                              struct ctl_lun_req *req);
269 static int ctl_be_block_create(struct ctl_be_block_softc *softc,
270                                struct ctl_lun_req *req);
271 static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
272                            struct ctl_lun_req *req);
273 static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
274                            struct ctl_lun_req *req);
275 static void ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun);
276 static int ctl_be_block_config_write(union ctl_io *io);
277 static int ctl_be_block_config_read(union ctl_io *io);
278 static int ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb);
279 static uint64_t ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname);
280 static int ctl_be_block_init(void);
281 static int ctl_be_block_shutdown(void);
282
283 static struct ctl_backend_driver ctl_be_block_driver = 
284 {
285         .name = "block",
286         .flags = CTL_BE_FLAG_HAS_CONFIG,
287         .init = ctl_be_block_init,
288         .shutdown = ctl_be_block_shutdown,
289         .data_submit = ctl_be_block_submit,
290         .data_move_done = ctl_be_block_move_done,
291         .config_read = ctl_be_block_config_read,
292         .config_write = ctl_be_block_config_write,
293         .ioctl = ctl_be_block_ioctl,
294         .lun_info = ctl_be_block_lun_info,
295         .lun_attr = ctl_be_block_lun_attr
296 };
297
298 MALLOC_DEFINE(M_CTLBLK, "ctlblock", "Memory used for CTL block backend");
299 CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
300
301 static struct ctl_be_block_io *
302 ctl_alloc_beio(struct ctl_be_block_softc *softc)
303 {
304         struct ctl_be_block_io *beio;
305
306         beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO);
307         beio->softc = softc;
308         return (beio);
309 }
310
311 static void
312 ctl_free_beio(struct ctl_be_block_io *beio)
313 {
314         struct ctl_be_block_softc *softc = beio->softc;
315         int i;
316
317         for (i = 0; i < beio->num_segs; i++) {
318                 uma_zfree(softc->buf_zone, beio->sg_segs[i].addr);
319
320                 /* For compare we had two equal S/G lists. */
321                 if (beio->two_sglists) {
322                         uma_zfree(softc->buf_zone,
323                             beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
324                 }
325         }
326
327         uma_zfree(softc->beio_zone, beio);
328 }
329
330 static void
331 ctl_complete_beio(struct ctl_be_block_io *beio)
332 {
333         union ctl_io *io = beio->io;
334
335         if (beio->beio_cont != NULL) {
336                 beio->beio_cont(beio);
337         } else {
338                 ctl_free_beio(beio);
339                 ctl_data_submit_done(io);
340         }
341 }
342
/*
 * Compare two byte buffers.  Returns the index of the first differing
 * byte, or 'size' if the buffers are identical over the whole range.
 */
static size_t
cmp(uint8_t *a, uint8_t *b, size_t size)
{
	size_t pos = 0;

	while (pos < size && a[pos] == b[pos])
		pos++;
	return (pos);
}
354
/*
 * Complete a compare-type request (CTL_LLF_COMPARE): the two data
 * copies sit in the lower and upper halves of the beio S/G list.
 * On a mismatch, report MISCOMPARE sense with the byte offset of the
 * first differing byte in the INFORMATION field; otherwise succeed.
 */
static void
ctl_be_block_compare(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	uint64_t off, res;
	int i;
	uint8_t info[8];

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	off = 0;
	for (i = 0; i < beio->num_segs; i++) {
		/* Second copy of segment i lives CTLBLK_HALF_SEGS later. */
		res = cmp(beio->sg_segs[i].addr,
		    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
		    beio->sg_segs[i].len);
		off += res;
		if (res < beio->sg_segs[i].len)
			break;
	}
	if (i < beio->num_segs) {
		/* Miscompare: 'off' is the offset of the first mismatch. */
		scsi_u64to8b(off, info);
		ctl_set_sense(&io->scsiio, /*current_error*/ 1,
		    /*sense_key*/ SSD_KEY_MISCOMPARE,
		    /*asc*/ 0x1D, /*ascq*/ 0x00,
		    /*type*/ SSD_ELEM_INFO,
		    /*size*/ sizeof(info), /*data*/ &info,
		    /*type*/ SSD_ELEM_NONE);
	} else
		ctl_set_success(&io->scsiio);
}
384
/*
 * Called by CTL when a datamove (data transfer to/from the initiator)
 * completes.  Sets final status for reads and for writes that failed;
 * a successful write is re-queued onto the LUN's task queue, because
 * this routine may run in the frontend's interrupt context while the
 * backing-store I/O needs a sleepable thread.
 */
static int
ctl_be_block_move_done(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	struct ctl_lba_len_flags *lbalen;
#ifdef CTL_TIME_IO
	struct bintime cur_bt;
#endif

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	be_lun = beio->lun;

	DPRINTF("entered\n");

#ifdef CTL_TIME_IO
	/* Account the elapsed DMA time against this I/O. */
	getbinuptime(&cur_bt);
	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
#endif
	io->io_hdr.num_dmas++;
	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;

	/*
	 * We set status at this point for read commands, and write
	 * commands with errors.
	 */
	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
		;
	} else if ((io->io_hdr.port_status != 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		/* The frontend port reported a transfer error. */
		ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
		    /*retry_count*/ io->io_hdr.port_status);
	} else if (io->scsiio.kern_data_resid != 0 &&
	    (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		/* The initiator supplied less write data than expected. */
		ctl_set_invalid_field_ciu(&io->scsiio);
	} else if ((io->io_hdr.port_status == 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		lbalen = ARGS(beio->io);
		if (lbalen->flags & CTL_LLF_READ) {
			ctl_set_success(&io->scsiio);
		} else if (lbalen->flags & CTL_LLF_COMPARE) {
			/* We have two data blocks ready for comparison. */
			ctl_be_block_compare(io);
		}
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}
460
/*
 * Completion handler for struct bio's issued to a device-backed LUN.
 * Records per-bio errors (keeping the one at the lowest offset), and
 * once the last outstanding bio has completed, sets SCSI status and
 * either finishes the beio or starts the datamove to the initiator.
 */
static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;
	int error;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->io_lock);
	/* Remember the error closest to the start of the request. */
	if (error != 0 &&
	    (beio->first_error == 0 ||
	     bio->bio_offset < beio->first_error_offset)) {
		beio->first_error = error;
		beio->first_error_offset = bio->bio_offset;
	}

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->io_lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	error = beio->first_error;
	if (error != 0) {
		if (error == EOPNOTSUPP) {
			ctl_set_invalid_opcode(&io->scsiio);
		} else if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there is a better error here? */
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xbad2);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, a flush, a delete or verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)
	 || (beio->bio_cmd == BIO_DELETE)
	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
558
/*
 * Flush (synchronize) a file-backed LUN by calling VOP_FSYNC() on the
 * backing vnode.  beio->io_arg selects MNT_NOWAIT (asynchronous) vs.
 * MNT_WAIT (synchronous) behavior.
 */
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct mount *mountpoint;
	int error, lock_flags;

	DPRINTF("entered\n");

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	/* Use a shared vnode lock on filesystems that permit it. */
	if (MNT_SHARED_WRITES(mountpoint) ||
	    ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;
	vn_lock(be_lun->vn, lock_flags | LK_RETRY);
	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
	    curthread);
	VOP_UNLOCK(be_lun->vn, 0);

	vn_finished_write(mountpoint);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there is a better error here? */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}
605
606 SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t");
607 SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t");
608 SDT_PROBE_DEFINE1(cbb, , read, file_done,"uint64_t");
609 SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t");
610
/*
 * Read from or write to a file-backed LUN via VOP_READ()/VOP_WRITE(),
 * building a kernel uio over the beio S/G list.  DPO maps to IO_DIRECT
 * and FUA (for writes) to IO_SYNC; see the inline notes on how UFS and
 * ZFS interpret those flags.
 */
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	size_t s;
	int error, flags, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, file_start);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE0(cbb, , write, file_start);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* Mirror the S/G list into the iovec array backing the uio. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 */
		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);

		VOP_UNLOCK(be_lun->vn, 0);
		SDT_PROBE0(cbb, , read, file_done);
		if (error == 0 && xuio.uio_resid > 0) {
			/*
			 * If we read less than requested (EOF), then
			 * we should clean the rest of the buffer.
			 */
			s = beio->io_len - xuio.uio_resid;
			for (i = 0; i < beio->num_segs; i++) {
				if (s >= beio->sg_segs[i].len) {
					s -= beio->sg_segs[i].len;
					continue;
				}
				bzero((uint8_t *)beio->sg_segs[i].addr + s,
				    beio->sg_segs[i].len - s);
				s = 0;
			}
		}
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		/* Use a shared vnode lock on filesystems that permit it. */
		if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;
		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into cache.
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn, 0);

		vn_finished_write(mountpoint);
		SDT_PROBE0(cbb, , write, file_done);
	}

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
775
/*
 * Implement GET LBA STATUS for a file-backed LUN.  Uses the
 * FIOSEEKHOLE/FIOSEEKDATA vnode ioctls to determine whether the range
 * starting at the requested LBA is mapped (status 0) or deallocated
 * (status 1), and how far that state extends.
 */
static void
ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, status;

	DPRINTF("entered\n");

	/* Convert the starting LBA to a byte offset in the backing file. */
	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
	    0, curthread->td_ucred, curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
		    0, curthread->td_ucred, curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	VOP_UNLOCK(be_lun->vn, 0);

	/* Fill in a single LBA status descriptor, capping the length. */
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
814
815 static uint64_t
816 ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
817 {
818         struct vattr            vattr;
819         struct statfs           statfs;
820         uint64_t                val;
821         int                     error;
822
823         val = UINT64_MAX;
824         if (be_lun->vn == NULL)
825                 return (val);
826         vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
827         if (strcmp(attrname, "blocksused") == 0) {
828                 error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
829                 if (error == 0)
830                         val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
831         }
832         if (strcmp(attrname, "blocksavail") == 0 &&
833             (be_lun->vn->v_iflag & VI_DOOMED) == 0) {
834                 error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
835                 if (error == 0)
836                         val = statfs.f_bavail * statfs.f_bsize /
837                             be_lun->cbe_lun.blocksize;
838         }
839         VOP_UNLOCK(be_lun->vn, 0);
840         return (val);
841 }
842
/*
 * Dispatch a READ/WRITE/VERIFY to a ZVOL backend by calling the character
 * device's d_read/d_write entry points directly with a kernel-space uio
 * built from the beio's scatter/gather list.  Completes the I/O on error,
 * otherwise hands read data back via ctl_datamove().
 */
static void
ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct uio xuio;
	struct iovec *xiovec;
	int error, flags, i, ref;

	DPRINTF("entered\n");

	io = beio->io;
	/* Translate SCSI DPO/FUA hints into VFS I/O flags. */
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, file_start);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE0(cbb, , write, file_start);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* Mirror the beio S/G segments into the iovec array. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	/* Start devstat accounting under the LUN I/O lock. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/* Take a threaded reference on the cdev for the duration of the call. */
	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		if (beio->bio_cmd == BIO_READ)
			error = csw->d_read(dev, &xuio, flags);
		else
			error = csw->d_write(dev, &xuio, flags);
		dev_relthread(dev, ref);
	} else
		error = ENXIO;	/* device disappeared under us */

	if (beio->bio_cmd == BIO_READ)
		SDT_PROBE0(cbb, , read, file_done);
	else
		SDT_PROBE0(cbb, , write, file_done);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
946
/*
 * Implement GET LBA STATUS for a ZVOL-backed LUN: same hole/data probing
 * as the file variant, but through the character device's d_ioctl entry
 * point instead of VOP_IOCTL().
 */
static void
ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, ref, status;

	DPRINTF("entered\n");

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL) {
		/* Device is gone; report "unknown" for the rest of the LUN. */
		status = 0;	/* unknown up to the end */
		off = be_lun->size_bytes;
		goto done;
	}
	/* roff remembers the requested byte offset; off is advanced by ioctl. */
	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
	    curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
		    curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	dev_relthread(dev, ref);

done:
	/* Encode the descriptor; clamp the block count to 32 bits. */
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
993
/*
 * Send a single BIO_FLUSH to the backing device.  Completion (including
 * the ENXIO fallback when the device has gone away) is handled by
 * ctl_be_block_biodone().
 */
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	struct bio *bio;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	DPRINTF("entered\n");

	/* This can't fail, it's a blocking allocation. */
	bio = g_alloc_bio();

	bio->bio_cmd	    = BIO_FLUSH;
	bio->bio_offset	    = 0;
	bio->bio_data	    = 0;
	bio->bio_done	    = ctl_be_block_biodone;
	bio->bio_caller1    = beio;
	bio->bio_pblkno	    = 0;

	/*
	 * We don't need to acquire the LUN lock here, because we are only
	 * sending one bio, and so there is no other context to synchronize
	 * with.
	 */
	beio->num_bios_sent = 1;
	beio->send_complete = 1;

	/* Start devstat accounting under the LUN I/O lock. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		bio->bio_dev = dev;
		csw->d_strategy(bio);
		dev_relthread(dev, ref);
	} else {
		/* Device went away; fail the bio ourselves. */
		bio->bio_error = ENXIO;
		ctl_be_block_biodone(bio);
	}
}
1038
1039 static void
1040 ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
1041                        struct ctl_be_block_io *beio,
1042                        uint64_t off, uint64_t len, int last)
1043 {
1044         struct bio *bio;
1045         uint64_t maxlen;
1046         struct cdevsw *csw;
1047         struct cdev *dev;
1048         int ref;
1049
1050         csw = devvn_refthread(be_lun->vn, &dev, &ref);
1051         maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
1052         while (len > 0) {
1053                 bio = g_alloc_bio();
1054                 bio->bio_cmd        = BIO_DELETE;
1055                 bio->bio_dev        = dev;
1056                 bio->bio_offset     = off;
1057                 bio->bio_length     = MIN(len, maxlen);
1058                 bio->bio_data       = 0;
1059                 bio->bio_done       = ctl_be_block_biodone;
1060                 bio->bio_caller1    = beio;
1061                 bio->bio_pblkno     = off / be_lun->cbe_lun.blocksize;
1062
1063                 off += bio->bio_length;
1064                 len -= bio->bio_length;
1065
1066                 mtx_lock(&be_lun->io_lock);
1067                 beio->num_bios_sent++;
1068                 if (last && len == 0)
1069                         beio->send_complete = 1;
1070                 mtx_unlock(&be_lun->io_lock);
1071
1072                 if (csw) {
1073                         csw->d_strategy(bio);
1074                 } else {
1075                         bio->bio_error = ENXIO;
1076                         ctl_be_block_biodone(bio);
1077                 }
1078         }
1079         if (csw)
1080                 dev_relthread(dev, ref);
1081 }
1082
/*
 * Dispatch an unmap-style request to the backing device.  An io_offset
 * of -1 means the request carries a list of SCSI UNMAP descriptors (set
 * up by ctl_be_block_cw_dispatch_unmap()); each descriptor is converted
 * to a byte range and handed to ctl_be_block_unmap_dev_range().
 * Otherwise a single pre-computed range (e.g. from WRITE SAME with
 * UNMAP) is released.
 */
static void
ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct ctl_ptr_len_flags *ptrlen;
	struct scsi_unmap_desc *buf, *end;
	uint64_t len;

	io = beio->io;

	DPRINTF("entered\n");

	/* Start devstat accounting under the LUN I/O lock. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->io_offset == -1) {
		/* Walk the UNMAP descriptor list, accumulating io_len. */
		beio->io_len = 0;
		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
		end = buf + ptrlen->len / sizeof(*buf);
		for (; buf < end; buf++) {
			len = (uint64_t)scsi_4btoul(buf->length) *
			    be_lun->cbe_lun.blocksize;
			beio->io_len += len;
			/* "last" only on the final descriptor. */
			ctl_be_block_unmap_dev_range(be_lun, beio,
			    scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
			    len, (end - buf < 2) ? TRUE : FALSE);
		}
	} else
		ctl_be_block_unmap_dev_range(be_lun, beio,
		    beio->io_offset, beio->io_len, TRUE);
}
1118
/*
 * Dispatch a READ/WRITE to a raw device backend: build one or more bios
 * per scatter/gather segment (each limited to the device's maximum I/O
 * size), queue them all locally, then fire them at the device.  The
 * send_complete flag is set before dispatch so ctl_be_block_biodone()
 * can complete the beio when the last bio returns.
 */
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
			  struct ctl_be_block_io *beio)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct bio *bio;
	struct cdevsw *csw;
	struct cdev *dev;
	off_t cur_offset;
	int i, max_iosize, ref;

	DPRINTF("entered\n");
	csw = devvn_refthread(be_lun->vn, &dev, &ref);

	/*
	 * We have to limit our I/O size to the maximum supported by the
	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
	 * set it properly, use DFLTPHYS.
	 */
	if (csw) {
		max_iosize = dev->si_iosize_max;
		if (max_iosize < PAGE_SIZE)
			max_iosize = DFLTPHYS;
	} else
		max_iosize = DFLTPHYS;

	cur_offset = beio->io_offset;
	for (i = 0; i < beio->num_segs; i++) {
		size_t cur_size;
		uint8_t *cur_ptr;

		cur_size = beio->sg_segs[i].len;
		cur_ptr = beio->sg_segs[i].addr;

		/* Split each segment into max_iosize-sized bios. */
		while (cur_size > 0) {
			/* This can't fail, it's a blocking allocation. */
			bio = g_alloc_bio();

			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

			bio->bio_cmd = beio->bio_cmd;
			bio->bio_dev = dev;
			bio->bio_caller1 = beio;
			bio->bio_length = min(cur_size, max_iosize);
			bio->bio_offset = cur_offset;
			bio->bio_data = cur_ptr;
			bio->bio_done = ctl_be_block_biodone;
			bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;

			cur_offset += bio->bio_length;
			cur_ptr += bio->bio_length;
			cur_size -= bio->bio_length;

			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
			beio->num_bios_sent++;
		}
	}
	/* All bios are counted; start accounting and mark submission done. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	beio->send_complete = 1;
	mtx_unlock(&be_lun->io_lock);

	/*
	 * Fire off all allocated requests!
	 */
	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, bio, bio_queue);
		if (csw)
			csw->d_strategy(bio);
		else {
			/* Device went away; fail the bio ourselves. */
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1197
1198 static uint64_t
1199 ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
1200 {
1201         struct diocgattr_arg    arg;
1202         struct cdevsw *csw;
1203         struct cdev *dev;
1204         int error, ref;
1205
1206         csw = devvn_refthread(be_lun->vn, &dev, &ref);
1207         if (csw == NULL)
1208                 return (UINT64_MAX);
1209         strlcpy(arg.name, attrname, sizeof(arg.name));
1210         arg.len = sizeof(arg.value.off);
1211         if (csw->d_ioctl) {
1212                 error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
1213                     curthread);
1214         } else
1215                 error = ENODEV;
1216         dev_relthread(dev, ref);
1217         if (error != 0)
1218                 return (UINT64_MAX);
1219         return (arg.value.off);
1220 }
1221
1222 static void
1223 ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
1224                             union ctl_io *io)
1225 {
1226         struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1227         struct ctl_be_block_io *beio;
1228         struct ctl_lba_len_flags *lbalen;
1229
1230         DPRINTF("entered\n");
1231         beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1232         lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1233
1234         beio->io_len = lbalen->len * cbe_lun->blocksize;
1235         beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1236         beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
1237         beio->bio_cmd = BIO_FLUSH;
1238         beio->ds_trans_type = DEVSTAT_NO_DATA;
1239         DPRINTF("SYNC\n");
1240         be_lun->lun_flush(be_lun, beio);
1241 }
1242
1243 static void
1244 ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
1245 {
1246         union ctl_io *io;
1247
1248         io = beio->io;
1249         ctl_free_beio(beio);
1250         if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1251             ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1252              (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1253                 ctl_config_write_done(io);
1254                 return;
1255         }
1256
1257         ctl_be_block_config_write(io);
1258 }
1259
/*
 * Dispatch WRITE SAME(10/16): validate the flags, route UNMAP/ANCHOR
 * variants to the backend's unmap method, and otherwise materialize the
 * repeated block pattern into S/G buffers (aligned to the physical block
 * geometry where possible) and write it out, re-running via
 * ctl_be_block_cw_done_ws() when the range doesn't fit in one pass.
 */
static void
ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_block_softc *softc = be_lun->softc;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_lba_len_flags *lbalen;
	uint64_t len_left, lba;
	uint32_t pb, pbo, adj;
	int i, seglen;
	uint8_t *buf, *end;

	DPRINTF("entered\n");

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	lbalen = ARGS(beio->io);

	/* Reject unsupported flags, and UNMAP/ANCHOR without unmap support. */
	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
		ctl_free_beio(beio);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 1,
				      /*command*/ 1,
				      /*field*/ 1,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_config_write_done(io);
		return;
	}

	/* UNMAP/ANCHOR variant: release the range instead of writing it. */
	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
		beio->bio_cmd = BIO_DELETE;
		beio->ds_trans_type = DEVSTAT_FREE;

		be_lun->unmap(be_lun, beio);
		return;
	}

	beio->bio_cmd = BIO_WRITE;
	beio->ds_trans_type = DEVSTAT_WRITE;

	DPRINTF("WRITE SAME at LBA %jx len %u\n",
	       (uintmax_t)lbalen->lba, lbalen->len);

	/* Physical block size and offset of the first physical boundary. */
	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
	if (be_lun->cbe_lun.pblockoff > 0)
		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
	else
		pbo = 0;
	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {

		/*
		 * Setup the S/G entry for this chunk.
		 */
		seglen = MIN(CTLBLK_MAX_SEG, len_left);
		if (pb > cbe_lun->blocksize) {
			/* Trim the segment to end on a physical boundary. */
			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
			    seglen - pbo) % pb;
			if (seglen > adj)
				seglen -= adj;
			else
				seglen -= seglen % cbe_lun->blocksize;
		} else
			seglen -= seglen % cbe_lun->blocksize;
		beio->sg_segs[i].len = seglen;
		beio->sg_segs[i].addr = uma_zalloc(softc->buf_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		beio->num_segs++;
		len_left -= seglen;

		/* Fill the segment block-by-block with the pattern. */
		buf = beio->sg_segs[i].addr;
		end = buf + seglen;
		for (; buf < end; buf += cbe_lun->blocksize) {
			if (lbalen->flags & SWS_NDOB) {
				/* No Data-Out Buffer: write zeroes. */
				memset(buf, 0, cbe_lun->blocksize);
			} else {
				memcpy(buf, io->scsiio.kern_data_ptr,
				    cbe_lun->blocksize);
			}
			if (lbalen->flags & SWS_LBDATA)
				/* Stamp each block with its own LBA. */
				scsi_ulto4b(lbalen->lba + lba, buf);
			lba++;
		}
	}

	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
	beio->io_len = lba * cbe_lun->blocksize;

	/* We can not do all in one run. Correct and schedule rerun. */
	if (len_left > 0) {
		lbalen->lba += lba;
		lbalen->len -= lba;
		beio->beio_cont = ctl_be_block_cw_done_ws;
	}

	be_lun->dispatch(be_lun, beio);
}
1364
1365 static void
1366 ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
1367                             union ctl_io *io)
1368 {
1369         struct ctl_be_block_io *beio;
1370         struct ctl_ptr_len_flags *ptrlen;
1371
1372         DPRINTF("entered\n");
1373
1374         beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1375         ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1376
1377         if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
1378                 ctl_free_beio(beio);
1379                 ctl_set_invalid_field(&io->scsiio,
1380                                       /*sks_valid*/ 0,
1381                                       /*command*/ 1,
1382                                       /*field*/ 0,
1383                                       /*bit_valid*/ 0,
1384                                       /*bit*/ 0);
1385                 ctl_config_write_done(io);
1386                 return;
1387         }
1388
1389         beio->io_len = 0;
1390         beio->io_offset = -1;
1391         beio->bio_cmd = BIO_DELETE;
1392         beio->ds_trans_type = DEVSTAT_FREE;
1393         DPRINTF("UNMAP\n");
1394         be_lun->unmap(be_lun, beio);
1395 }
1396
1397 static void
1398 ctl_be_block_cr_done(struct ctl_be_block_io *beio)
1399 {
1400         union ctl_io *io;
1401
1402         io = beio->io;
1403         ctl_free_beio(beio);
1404         ctl_config_read_done(io);
1405 }
1406
1407 static void
1408 ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
1409                          union ctl_io *io)
1410 {
1411         struct ctl_be_block_io *beio;
1412         struct ctl_be_block_softc *softc;
1413
1414         DPRINTF("entered\n");
1415
1416         softc = be_lun->softc;
1417         beio = ctl_alloc_beio(softc);
1418         beio->io = io;
1419         beio->lun = be_lun;
1420         beio->beio_cont = ctl_be_block_cr_done;
1421         PRIV(io)->ptr = (void *)beio;
1422
1423         switch (io->scsiio.cdb[0]) {
1424         case SERVICE_ACTION_IN:         /* GET LBA STATUS */
1425                 beio->bio_cmd = -1;
1426                 beio->ds_trans_type = DEVSTAT_NO_DATA;
1427                 beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1428                 beio->io_len = 0;
1429                 if (be_lun->get_lba_status)
1430                         be_lun->get_lba_status(be_lun, beio);
1431                 else
1432                         ctl_be_block_cr_done(beio);
1433                 break;
1434         default:
1435                 panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1436                 break;
1437         }
1438 }
1439
1440 static void
1441 ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1442 {
1443         union ctl_io *io;
1444
1445         io = beio->io;
1446         ctl_free_beio(beio);
1447         ctl_config_write_done(io);
1448 }
1449
/*
 * Entry point for config-write commands: allocate a beio, map the SCSI
 * tag type onto devstat tag types, and route the command to the
 * per-opcode dispatch helper.  Unknown opcodes indicate a frontend bug
 * and panic.
 */
static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	beio->beio_cont = ctl_be_block_cw_done;
	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}
	PRIV(io)->ptr = (void *)beio;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		ctl_be_block_cw_dispatch_sync(be_lun, io);
		break;
	case WRITE_SAME_10:
	case WRITE_SAME_16:
		ctl_be_block_cw_dispatch_ws(be_lun, io);
		break;
	case UNMAP:
		ctl_be_block_cw_dispatch_unmap(be_lun, io);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}
1497
/*
 * Static DTrace probes fired at the start and at buffer-allocation
 * completion of READ/WRITE handling in ctl_be_block_dispatch() below.
 */
SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t");
1502
/*
 * Continuation for multi-pass READ/WRITE: release the finished beio and,
 * unless the command was aborted or already failed, clear its status and
 * requeue it on the LUN's input queue so the worker thread processes the
 * next chunk.
 */
static void
ctl_be_block_next(struct ctl_be_block_io *beio)
{
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;

	io = beio->io;
	be_lun = beio->lun;
	ctl_free_beio(beio);
	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
		ctl_data_submit_done(io);
		return;
	}

	/* Reset status so the next pass starts from a clean slate. */
	io->io_hdr.status &= ~CTL_STATUS_MASK;
	io->io_hdr.status |= CTL_STATUS_NONE;

	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
}
1527
/*
 * Main READ/WRITE dispatch: allocate a beio, size this pass (tracking
 * progress in PRIV(io)->len for multi-pass transfers), build the
 * scatter/gather list, and either start the backend read or kick off
 * the DMA from the initiator for writes.  COMPARE commands get a second
 * S/G list so read-back data and initiator data can be compared.
 */
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
			   union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;
	struct ctl_lba_len_flags *lbalen;
	struct ctl_ptr_len_flags *bptrlen;
	uint64_t len_left, lbas;
	int i;

	softc = be_lun->softc;

	DPRINTF("entered\n");

	lbalen = ARGS(io);
	if (lbalen->flags & CTL_LLF_WRITE) {
		SDT_PROBE0(cbb, , write, start);
	} else {
		SDT_PROBE0(cbb, , read, start);
	}

	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	bptrlen = PRIV(io);
	bptrlen->ptr = (void *)beio;

	/* Map the SCSI tag type onto devstat tag types. */
	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}

	if (lbalen->flags & CTL_LLF_WRITE) {
		beio->bio_cmd = BIO_WRITE;
		beio->ds_trans_type = DEVSTAT_WRITE;
	} else {
		beio->bio_cmd = BIO_READ;
		beio->ds_trans_type = DEVSTAT_READ;
	}

	DPRINTF("%s at LBA %jx len %u @%ju\n",
	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
	/* COMPARE needs two S/G lists, so each pass covers half the size. */
	if (lbalen->flags & CTL_LLF_COMPARE) {
		beio->two_sglists = 1;
		lbas = CTLBLK_HALF_IO_SIZE;
	} else {
		lbas = CTLBLK_MAX_IO_SIZE;
	}
	/* This pass: what remains, clamped to the per-pass maximum. */
	lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
	beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
	beio->io_len = lbas * cbe_lun->blocksize;
	bptrlen->len += lbas;

	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
		    i, CTLBLK_MAX_SEGS));

		/*
		 * Setup the S/G entry for this chunk.
		 */
		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
		beio->sg_segs[i].addr = uma_zalloc(softc->buf_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		/* Set up second segment for compare operation. */
		if (beio->two_sglists) {
			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
			    beio->sg_segs[i].len;
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
			    uma_zalloc(softc->buf_zone, M_WAITOK);
		}

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}
	/* If more data remains, schedule another pass after this one. */
	if (bptrlen->len < lbalen->len)
		beio->beio_cont = ctl_be_block_next;
	io->scsiio.be_move_done = ctl_be_block_move_done;
	/* For compare we have separate S/G lists for read and datamove. */
	if (beio->two_sglists)
		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
	else
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
	io->scsiio.kern_data_len = beio->io_len;
	io->scsiio.kern_sg_entries = beio->num_segs;
	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, alloc_done);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE0(cbb, , write, alloc_done);
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
1646
/*
 * Per-LUN taskqueue worker.  Drains the four per-LUN queues in priority
 * order (datamove, config write, config read, input), dispatching each
 * I/O with queue_lock dropped, and loops until all queues are empty.
 */
static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	union ctl_io *io;
	struct ctl_be_block_io *beio;

	DPRINTF("entered\n");
	/*
	 * Fetch and process I/Os from all queues.  If we detect LUN
	 * CTL_LUN_FLAG_NO_MEDIA status here -- it is result of a race,
	 * so make response maximally opaque to not confuse initiator.
	 */
	for (;;) {
		mtx_lock(&be_lun->queue_lock);
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			DPRINTF("datamove queue\n");
			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			/* Dispatch outside of queue_lock. */
			mtx_unlock(&be_lun->queue_lock);
			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				/* Media went away; answer BUSY. */
				ctl_set_busy(&io->scsiio);
				ctl_complete_beio(beio);
				/*
				 * NOTE(review): this early return leaves any
				 * remaining queued I/Os until the task is
				 * enqueued again.
				 */
				return;
			}
			be_lun->dispatch(be_lun, beio);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {
			DPRINTF("config write queue\n");
			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_config_write_done(io);
				return;
			}
			ctl_be_block_cw_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
		if (io != NULL) {
			DPRINTF("config read queue\n");
			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_config_read_done(io);
				return;
			}
			ctl_be_block_cr_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");
			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_data_submit_done(io);
				return;
			}
			ctl_be_block_dispatch(be_lun, io);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		mtx_unlock(&be_lun->queue_lock);
		break;
	}
}
1729
1730 /*
1731  * Entry point from CTL to the backend for I/O.  We queue everything to a
1732  * work thread, so this just puts the I/O on a queue and wakes up the
1733  * thread.
1734  */
1735 static int
1736 ctl_be_block_submit(union ctl_io *io)
1737 {
1738         struct ctl_be_block_lun *be_lun;
1739
1740         DPRINTF("entered\n");
1741
1742         be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
1743
1744         /*
1745          * Make sure we only get SCSI I/O.
1746          */
1747         KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
1748                 "%#x) encountered", io->io_hdr.io_type));
1749
1750         PRIV(io)->len = 0;
1751
1752         mtx_lock(&be_lun->queue_lock);
1753         STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1754         mtx_unlock(&be_lun->queue_lock);
1755         taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1756
1757         return (CTL_RETVAL_COMPLETE);
1758 }
1759
1760 static int
1761 ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
1762                         int flag, struct thread *td)
1763 {
1764         struct ctl_be_block_softc *softc = &backend_block_softc;
1765         int error;
1766
1767         error = 0;
1768         switch (cmd) {
1769         case CTL_LUN_REQ: {
1770                 struct ctl_lun_req *lun_req;
1771
1772                 lun_req = (struct ctl_lun_req *)addr;
1773
1774                 switch (lun_req->reqtype) {
1775                 case CTL_LUNREQ_CREATE:
1776                         error = ctl_be_block_create(softc, lun_req);
1777                         break;
1778                 case CTL_LUNREQ_RM:
1779                         error = ctl_be_block_rm(softc, lun_req);
1780                         break;
1781                 case CTL_LUNREQ_MODIFY:
1782                         error = ctl_be_block_modify(softc, lun_req);
1783                         break;
1784                 default:
1785                         lun_req->status = CTL_LUN_ERROR;
1786                         snprintf(lun_req->error_str, sizeof(lun_req->error_str),
1787                                  "invalid LUN request type %d",
1788                                  lun_req->reqtype);
1789                         break;
1790                 }
1791                 break;
1792         }
1793         default:
1794                 error = ENOTTY;
1795                 break;
1796         }
1797
1798         return (error);
1799 }
1800
/*
 * Finish configuring a LUN backed by a regular file: install the
 * file-based dispatch methods, size the LUN from VOP_GETATTR() (or the
 * user-requested size) and derive the logical/physical/UNMAP block
 * geometry.  Called with the backing vnode open and locked.  On error,
 * fills req->error_str and returns non-zero.
 */
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun;
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	const char                   *value;
	struct vattr                  vattr;
	off_t                         ps, pss, po, pos, us, uss, uo, uos;
	int                           error;

	cbe_lun = &be_lun->cbe_lun;
	file_data = &be_lun->backend.file;
	params = &be_lun->params;

	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;
	be_lun->get_lba_status = ctl_be_block_gls_file;
	be_lun->getattr = ctl_be_block_getattr_file;
	/* File-backed LUNs do not support UNMAP. */
	be_lun->unmap = NULL;
	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	/* Hold a reference on the opening thread's credentials for later I/O. */
	file_data->cred = crhold(curthread->td_ucred);
	/* Explicit LUN size overrides the file's actual size. */
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;

	/*
	 * For files we can use any logical block size.  Prefer 512 bytes
	 * for compatibility reasons.  If file's vattr.va_blocksize
	 * (preferred I/O block size) is bigger and multiple to chosen
	 * logical block size -- report it as physical block size.
	 */
	if (params->blocksize_bytes != 0)
		cbe_lun->blocksize = params->blocksize_bytes;
	else if (cbe_lun->lun_type == T_CDROM)
		cbe_lun->blocksize = 2048;
	else
		cbe_lun->blocksize = 512;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
	    0 : (be_lun->size_blocks - 1);

	/* Default physical/UNMAP geometry from the filesystem block size. */
	us = ps = vattr.va_blocksize;
	uo = po = 0;

	/* Let nvlist options override the physical block size/offset. */
	value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
	if (value != NULL)
		ctl_expand_number(value, &ps);
	value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
	if (value != NULL)
		ctl_expand_number(value, &po);
	pss = ps / cbe_lun->blocksize;
	pos = po / cbe_lun->blocksize;
	/*
	 * Only advertise a physical block geometry if it is an exact,
	 * power-of-two multiple of the logical block size and the offset
	 * is an exact multiple no larger than the size.
	 */
	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
		cbe_lun->pblockexp = fls(pss) - 1;
		cbe_lun->pblockoff = (pss - pos) % pss;
	}

	/* Same validation for the UNMAP granularity/offset. */
	value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
	if (value != NULL)
		ctl_expand_number(value, &us);
	value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
	if (value != NULL)
		ctl_expand_number(value, &uo);
	uss = us / cbe_lun->blocksize;
	uos = uo / cbe_lun->blocksize;
	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
		cbe_lun->ublockexp = fls(uss) - 1;
		cbe_lun->ublockoff = (uss - uos) % uss;
	}

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (be_lun->size_bytes < cbe_lun->blocksize) {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "file %s size %ju < block size %u", be_lun->dev_path,
			 (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
	}

	cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
	return (error);
}
1899
/*
 * Finish configuring a LUN backed by a character (disk) device: pick
 * zvol-specific or generic dispatch methods, query the device's sector
 * and media size via its d_ioctl entry, validate any user-requested
 * blocksize/size against them, derive stripe-based physical/UNMAP
 * geometry, and probe UNMAP support.  Called with the backing vnode
 * open.  On error, fills req->error_str and returns non-zero.
 */
static int
ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_lun_create_params *params;
	struct cdevsw                *csw;
	struct cdev                  *dev;
	const char                   *value;
	int                           error, atomic, maxio, ref, unmap, tmp;
	off_t                         ps, pss, po, pos, us, uss, uo, uos, otmp;

	params = &be_lun->params;

	be_lun->dev_type = CTL_BE_BLOCK_DEV;
	/* Take a threaded reference on the cdev; released via dev_relthread(). */
	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL)
		return (ENXIO);
	if (strcmp(csw->d_name, "zvol") == 0) {
		/* ZFS zvols get their own dispatch and LBA-status paths. */
		be_lun->dispatch = ctl_be_block_dispatch_zvol;
		be_lun->get_lba_status = ctl_be_block_gls_zvol;
		atomic = maxio = CTLBLK_MAX_IO_SIZE;
	} else {
		be_lun->dispatch = ctl_be_block_dispatch_dev;
		be_lun->get_lba_status = NULL;
		atomic = 0;
		/* Clamp maximum I/O size to the device limit. */
		maxio = dev->si_iosize_max;
		if (maxio <= 0)
			maxio = DFLTPHYS;
		if (maxio > CTLBLK_MAX_IO_SIZE)
			maxio = CTLBLK_MAX_IO_SIZE;
	}
	be_lun->lun_flush = ctl_be_block_flush_dev;
	be_lun->getattr = ctl_be_block_getattr_dev;
	be_lun->unmap = ctl_be_block_unmap_dev;

	if (!csw->d_ioctl) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "no d_ioctl for device %s!", be_lun->dev_path);
		return (ENODEV);
	}

	/* tmp = device logical sector size. */
	error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
			       curthread);
	if (error) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned for DIOCGSECTORSIZE ioctl "
			 "on %s!", error, be_lun->dev_path);
		return (error);
	}

	/*
	 * If the user has asked for a blocksize that is greater than the
	 * backing device's blocksize, we can do it only if the blocksize
	 * the user is asking for is an even multiple of the underlying 
	 * device's blocksize.
	 */
	if ((params->blocksize_bytes != 0) &&
	    (params->blocksize_bytes >= tmp)) {
		if (params->blocksize_bytes % tmp == 0) {
			cbe_lun->blocksize = params->blocksize_bytes;
		} else {
			dev_relthread(dev, ref);
			snprintf(req->error_str, sizeof(req->error_str),
				 "requested blocksize %u is not an even "
				 "multiple of backing device blocksize %u",
				 params->blocksize_bytes, tmp);
			return (EINVAL);
		}
	} else if (params->blocksize_bytes != 0) {
		/* A blocksize smaller than the device sector is impossible. */
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "requested blocksize %u < backing device "
			 "blocksize %u", params->blocksize_bytes, tmp);
		return (EINVAL);
	} else if (cbe_lun->lun_type == T_CDROM)
		cbe_lun->blocksize = MAX(tmp, 2048);
	else
		cbe_lun->blocksize = tmp;

	/* otmp = device media size in bytes. */
	error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
			     curthread);
	if (error) {
		dev_relthread(dev, ref);
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned for DIOCGMEDIASIZE "
			 " ioctl on %s!", error,
			 be_lun->dev_path);
		return (error);
	}

	if (params->lun_size_bytes != 0) {
		/* A user-requested size may shrink but never exceed media. */
		if (params->lun_size_bytes > otmp) {
			dev_relthread(dev, ref);
			snprintf(req->error_str, sizeof(req->error_str),
				 "requested LUN size %ju > backing device "
				 "size %ju",
				 (uintmax_t)params->lun_size_bytes,
				 (uintmax_t)otmp);
			return (EINVAL);
		}

		be_lun->size_bytes = params->lun_size_bytes;
	} else
		be_lun->size_bytes = otmp;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
	    0 : (be_lun->size_blocks - 1);

	/* Default physical geometry from the device stripe size/offset. */
	error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
	    curthread);
	if (error)
		ps = po = 0;
	else {
		error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
		    FREAD, curthread);
		if (error)
			po = 0;
	}
	us = ps;
	uo = po;

	/* Let nvlist options override the physical block size/offset. */
	value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
	if (value != NULL)
		ctl_expand_number(value, &ps);
	value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
	if (value != NULL)
		ctl_expand_number(value, &po);
	pss = ps / cbe_lun->blocksize;
	pos = po / cbe_lun->blocksize;
	/*
	 * Only advertise a physical block geometry if it is an exact,
	 * power-of-two multiple of the logical block size and the offset
	 * is an exact multiple no larger than the size.
	 */
	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
		cbe_lun->pblockexp = fls(pss) - 1;
		cbe_lun->pblockoff = (pss - pos) % pss;
	}

	/* Same validation for the UNMAP granularity/offset. */
	value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
	if (value != NULL)
		ctl_expand_number(value, &us);
	value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
	if (value != NULL)
		ctl_expand_number(value, &uo);
	uss = us / cbe_lun->blocksize;
	uos = uo / cbe_lun->blocksize;
	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
		cbe_lun->ublockexp = fls(uss) - 1;
		cbe_lun->ublockoff = (uss - uos) % uss;
	}

	cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
	cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;

	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
		/* Zvols always support deletion (UNMAP). */
		unmap = 1;
	} else {
		struct diocgattr_arg	arg;

		/* Ask GEOM whether the provider supports BIO_DELETE. */
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
		    curthread);
		unmap = (error == 0) ? arg.value.i : 0;
	}
	/* The "unmap" option overrides the probed capability. */
	value = dnvlist_get_string(cbe_lun->options, "unmap", NULL);
	if (value != NULL)
		unmap = (strcmp(value, "on") == 0);
	if (unmap)
		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
	else
		cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	dev_relthread(dev, ref);
	return (0);
}
2076
2077 static int
2078 ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2079 {
2080         struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2081         int flags;
2082
2083         if (be_lun->vn) {
2084                 flags = FREAD;
2085                 if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0)
2086                         flags |= FWRITE;
2087                 (void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2088                 be_lun->vn = NULL;
2089
2090                 switch (be_lun->dev_type) {
2091                 case CTL_BE_BLOCK_DEV:
2092                         break;
2093                 case CTL_BE_BLOCK_FILE:
2094                         if (be_lun->backend.file.cred != NULL) {
2095                                 crfree(be_lun->backend.file.cred);
2096                                 be_lun->backend.file.cred = NULL;
2097                         }
2098                         break;
2099                 case CTL_BE_BLOCK_NONE:
2100                         break;
2101                 default:
2102                         panic("Unexpected backend type %d", be_lun->dev_type);
2103                         break;
2104                 }
2105                 be_lun->dev_type = CTL_BE_BLOCK_NONE;
2106         }
2107         return (0);
2108 }
2109
/*
 * Open the backing store named by the "file" option and hand off to the
 * device- or file-specific setup routine.  Retries read-only when a
 * writable open fails with EROFS/EACCES, and retries with a "/dev/"
 * prefix when a relative path fails to open.  Also configures the
 * command serialization policy (serseq).  On error, fills
 * req->error_str and returns non-zero.
 */
static int
ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct nameidata nd;
	const char	*value;
	int		 error, flags;

	error = 0;
	/* Path lookup is impossible before root is mounted. */
	if (rootvnode == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "Root filesystem is not mounted");
		return (1);
	}
	pwd_ensure_dirs();

	value = dnvlist_get_string(cbe_lun->options, "file", NULL);
	if (value == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "no file argument specified");
		return (1);
	}
	/* Replace any previous path with a private copy. */
	free(be_lun->dev_path, M_CTLBLK);
	be_lun->dev_path = strdup(value, M_CTLBLK);

	/* Open read-write unless "readonly" is on; non-disks default RO. */
	flags = FREAD;
	value = dnvlist_get_string(cbe_lun->options, "readonly", NULL);
	if (value != NULL) {
		if (strcmp(value, "on") != 0)
			flags |= FWRITE;
	} else if (cbe_lun->lun_type == T_DIRECT)
		flags |= FWRITE;

again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
		/* Writable open refused; fall back to read-only. */
		flags &= ~FWRITE;
		goto again;
	}
	if (error) {
		/*
		 * This is the only reasonable guess we can make as far as
		 * path if the user doesn't give us a fully qualified path.
		 * If they want to specify a file, they need to specify the
		 * full path.
		 */
		if (be_lun->dev_path[0] != '/') {
			char *dev_name;

			/* Retry once with a "/dev/" prefix prepended. */
			asprintf(&dev_name, M_CTLBLK, "/dev/%s",
				be_lun->dev_path);
			free(be_lun->dev_path, M_CTLBLK);
			be_lun->dev_path = dev_name;
			goto again;
		}
		snprintf(req->error_str, sizeof(req->error_str),
		    "error opening %s: %d", be_lun->dev_path, error);
		return (error);
	}
	/* Record the access mode we actually ended up with. */
	if (flags & FWRITE)
		cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
	else
		cbe_lun->flags |= CTL_LUN_FLAG_READONLY;

	NDFREE(&nd, NDF_ONLY_PNBUF);
	be_lun->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or plain file", be_lun->dev_path);
	}
	VOP_UNLOCK(be_lun->vn, 0);

	if (error != 0)
		ctl_be_block_close(be_lun);
	/* Default serialization: full for raw devices, read-only otherwise. */
	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	/* The "serseq" option overrides the default policy. */
	value = dnvlist_get_string(cbe_lun->options, "serseq", NULL);
	if (value != NULL && strcmp(value, "on") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
	else if (value != NULL && strcmp(value, "read") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	else if (value != NULL && strcmp(value, "off") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	return (0);
}
2204
2205 static int
2206 ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2207 {
2208         struct ctl_be_lun *cbe_lun;
2209         struct ctl_be_block_lun *be_lun;
2210         struct ctl_lun_create_params *params;
2211         char num_thread_str[16];
2212         char tmpstr[32];
2213         const char *value;
2214         int retval, num_threads;
2215         int tmp_num_threads;
2216
2217         params = &req->reqdata.create;
2218         retval = 0;
2219         req->status = CTL_LUN_OK;
2220
2221         be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2222         cbe_lun = &be_lun->cbe_lun;
2223         be_lun->params = req->reqdata.create;
2224         be_lun->softc = softc;
2225         STAILQ_INIT(&be_lun->input_queue);
2226         STAILQ_INIT(&be_lun->config_read_queue);
2227         STAILQ_INIT(&be_lun->config_write_queue);
2228         STAILQ_INIT(&be_lun->datamove_queue);
2229         mtx_init(&be_lun->io_lock, "ctlblock io", NULL, MTX_DEF);
2230         mtx_init(&be_lun->queue_lock, "ctlblock queue", NULL, MTX_DEF);
2231         cbe_lun->options = nvlist_clone(req->args_nvl);
2232
2233         if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2234                 cbe_lun->lun_type = params->device_type;
2235         else
2236                 cbe_lun->lun_type = T_DIRECT;
2237         be_lun->flags = 0;
2238         cbe_lun->flags = 0;
2239         value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
2240         if (value != NULL) {
2241                 if (strcmp(value, "primary") == 0)
2242                         cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2243         } else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
2244                 cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2245
2246         if (cbe_lun->lun_type == T_DIRECT ||
2247             cbe_lun->lun_type == T_CDROM) {
2248                 be_lun->size_bytes = params->lun_size_bytes;
2249                 if (params->blocksize_bytes != 0)
2250                         cbe_lun->blocksize = params->blocksize_bytes;
2251                 else if (cbe_lun->lun_type == T_CDROM)
2252                         cbe_lun->blocksize = 2048;
2253                 else
2254                         cbe_lun->blocksize = 512;
2255                 be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2256                 cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2257                     0 : (be_lun->size_blocks - 1);
2258
2259                 if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
2260                     control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
2261                         retval = ctl_be_block_open(be_lun, req);
2262                         if (retval != 0) {
2263                                 retval = 0;
2264                                 req->status = CTL_LUN_WARNING;
2265                         }
2266                 }
2267                 num_threads = cbb_num_threads;
2268         } else {
2269                 num_threads = 1;
2270         }
2271
2272         value = dnvlist_get_string(cbe_lun->options, "num_threads", NULL);
2273         if (value != NULL) {
2274                 tmp_num_threads = strtol(value, NULL, 0);
2275
2276                 /*
2277                  * We don't let the user specify less than one
2278                  * thread, but hope he's clueful enough not to
2279                  * specify 1000 threads.
2280                  */
2281                 if (tmp_num_threads < 1) {
2282                         snprintf(req->error_str, sizeof(req->error_str),
2283                                  "invalid number of threads %s",
2284                                  num_thread_str);
2285                         goto bailout_error;
2286                 }
2287                 num_threads = tmp_num_threads;
2288         }
2289
2290         if (be_lun->vn == NULL)
2291                 cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2292         /* Tell the user the blocksize we ended up using */
2293         params->lun_size_bytes = be_lun->size_bytes;
2294         params->blocksize_bytes = cbe_lun->blocksize;
2295         if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2296                 cbe_lun->req_lun_id = params->req_lun_id;
2297                 cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
2298         } else
2299                 cbe_lun->req_lun_id = 0;
2300
2301         cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
2302         cbe_lun->be = &ctl_be_block_driver;
2303
2304         if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2305                 snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%04d",
2306                          softc->num_luns);
2307                 strncpy((char *)cbe_lun->serial_num, tmpstr,
2308                         MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
2309
2310                 /* Tell the user what we used for a serial number */
2311                 strncpy((char *)params->serial_num, tmpstr,
2312                         MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2313         } else { 
2314                 strncpy((char *)cbe_lun->serial_num, params->serial_num,
2315                         MIN(sizeof(cbe_lun->serial_num),
2316                         sizeof(params->serial_num)));
2317         }
2318         if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2319                 snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%04d", softc->num_luns);
2320                 strncpy((char *)cbe_lun->device_id, tmpstr,
2321                         MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
2322
2323                 /* Tell the user what we used for a device ID */
2324                 strncpy((char *)params->device_id, tmpstr,
2325                         MIN(sizeof(params->device_id), sizeof(tmpstr)));
2326         } else {
2327                 strncpy((char *)cbe_lun->device_id, params->device_id,
2328                         MIN(sizeof(cbe_lun->device_id),
2329                             sizeof(params->device_id)));
2330         }
2331
2332         TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2333
2334         be_lun->io_taskqueue = taskqueue_create("ctlblocktq", M_WAITOK,
2335             taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2336
2337         if (be_lun->io_taskqueue == NULL) {
2338                 snprintf(req->error_str, sizeof(req->error_str),
2339                          "unable to create taskqueue");
2340                 goto bailout_error;
2341         }
2342
2343         /*
2344          * Note that we start the same number of threads by default for
2345          * both the file case and the block device case.  For the file
2346          * case, we need multiple threads to allow concurrency, because the
2347          * vnode interface is designed to be a blocking interface.  For the
2348          * block device case, ZFS zvols at least will block the caller's
2349          * context in many instances, and so we need multiple threads to
2350          * overcome that problem.  Other block devices don't need as many
2351          * threads, but they shouldn't cause too many problems.
2352          *
2353          * If the user wants to just have a single thread for a block
2354          * device, he can specify that when the LUN is created, or change
2355          * the tunable/sysctl to alter the default number of threads.
2356          */
2357         retval = taskqueue_start_threads_in_proc(&be_lun->io_taskqueue,
2358                                          /*num threads*/num_threads,
2359                                          /*priority*/PUSER,
2360                                          /*proc*/control_softc->ctl_proc,
2361                                          /*thread name*/"block");
2362
2363         if (retval != 0)
2364                 goto bailout_error;
2365
2366         be_lun->num_threads = num_threads;
2367
2368         retval = ctl_add_lun(&be_lun->cbe_lun);
2369         if (retval != 0) {
2370                 snprintf(req->error_str, sizeof(req->error_str),
2371                          "ctl_add_lun() returned error %d, see dmesg for "
2372                          "details", retval);
2373                 retval = 0;
2374                 goto bailout_error;
2375         }
2376
2377         be_lun->disk_stats = devstat_new_entry("cbb", cbe_lun->lun_id,
2378                                                cbe_lun->blocksize,
2379                                                DEVSTAT_ALL_SUPPORTED,
2380                                                cbe_lun->lun_type
2381                                                | DEVSTAT_TYPE_IF_OTHER,
2382                                                DEVSTAT_PRIORITY_OTHER);
2383
2384         mtx_lock(&softc->lock);
2385         softc->num_luns++;
2386         SLIST_INSERT_HEAD(&softc->lun_list, be_lun, links);
2387         mtx_unlock(&softc->lock);
2388
2389         params->req_lun_id = cbe_lun->lun_id;
2390
2391         return (retval);
2392
2393 bailout_error:
2394         req->status = CTL_LUN_ERROR;
2395
2396         if (be_lun->io_taskqueue != NULL)
2397                 taskqueue_free(be_lun->io_taskqueue);
2398         ctl_be_block_close(be_lun);
2399         if (be_lun->dev_path != NULL)
2400                 free(be_lun->dev_path, M_CTLBLK);
2401         nvlist_destroy(cbe_lun->options);
2402         mtx_destroy(&be_lun->queue_lock);
2403         mtx_destroy(&be_lun->io_lock);
2404         free(be_lun, M_CTLBLK);
2405
2406         return (retval);
2407 }
2408
/*
 * Remove a LUN managed by the block backend, at userland's request.
 *
 * The LUN is first unlinked from the backend's list (so no new lookups
 * can find it), its backing store is closed, and then removal is handed
 * to the CTL core.  Final teardown happens asynchronously in
 * ctl_be_block_lun_shutdown(); this function sleeps until that callback
 * signals completion.  Status is reported via req->status/error_str;
 * the function itself returns 0 except when interrupted (EINTR).
 */
static int
ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_rm_params *params;
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval;

	params = &req->reqdata.rm;

	/*
	 * modify_lock serializes against concurrent create/modify/rm;
	 * softc->lock protects the LUN list itself.  If found, unlink
	 * the LUN and drop the count while still under the mutex.
	 */
	sx_xlock(&softc->modify_lock);
	mtx_lock(&softc->lock);
	SLIST_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->cbe_lun.lun_id == params->lun_id) {
			SLIST_REMOVE(&softc->lun_list, be_lun,
			    ctl_be_block_lun, links);
			softc->num_luns--;
			break;
		}
	}
	mtx_unlock(&softc->lock);
	sx_xunlock(&softc->modify_lock);
	/* be_lun is NULL here iff the loop ran off the end of the list. */
	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "LUN %u is not managed by the block backend",
			 params->lun_id);
		goto bailout_error;
	}
	cbe_lun = &be_lun->cbe_lun;

	/*
	 * If the backing vnode is open, declare the media gone, wait for
	 * all in-flight worker I/O to drain, then close the backing store.
	 */
	if (be_lun->vn != NULL) {
		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
		ctl_lun_no_media(cbe_lun);
		taskqueue_drain_all(be_lun->io_taskqueue);
		ctl_be_block_close(be_lun);
	}

	/*
	 * Tell the shutdown callback that we are waiting: with WAITING
	 * set, ctl_be_block_lun_shutdown() wakes us up instead of freeing
	 * be_lun itself.
	 */
	mtx_lock(&softc->lock);
	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
	mtx_unlock(&softc->lock);

	retval = ctl_remove_lun(cbe_lun);
	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned from ctl_remove_lun() for "
			 "LUN %d", retval, params->lun_id);
		/* Removal never started; stop advertising that we wait. */
		mtx_lock(&softc->lock);
		be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
		mtx_unlock(&softc->lock);
		goto bailout_error;
	}

	/*
	 * Sleep until ctl_be_block_lun_shutdown() sets UNCONFIGURED and
	 * wakes us.  PCATCH lets a signal interrupt the wait.
	 */
	mtx_lock(&softc->lock);
	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblockrm", 0);
		if (retval == EINTR)
			break;
	}
	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
	if (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
		/* Teardown finished; we own be_lun now and may free it. */
		mtx_unlock(&softc->lock);
		free(be_lun, M_CTLBLK);
	} else {
		/*
		 * Interrupted: leave be_lun allocated — with WAITING now
		 * cleared, the shutdown callback will free it when it runs.
		 */
		mtx_unlock(&softc->lock);
		return (EINTR);
	}

	req->status = CTL_LUN_OK;
	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;
	return (0);
}
2483
/*
 * Modify an existing block-backend LUN: optionally change its requested
 * size, replace its option nvlist, re-evaluate the HA primary/secondary
 * role, and reopen (or close) the backing store accordingly.  The result
 * is reported through req->status; the return value is always 0.
 */
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_modify_params *params;
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	const char *value;
	uint64_t oldsize;
	int error, wasprim;

	params = &req->reqdata.modify;

	/* Serialize against create/rm/other modify; modify_lock is held
	 * across the whole operation, softc->lock only for the lookup. */
	sx_xlock(&softc->modify_lock);
	mtx_lock(&softc->lock);
	SLIST_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->cbe_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);
	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "LUN %u is not managed by the block backend",
			 params->lun_id);
		goto bailout_error;
	}
	cbe_lun = &be_lun->cbe_lun;

	/* A zero size means "keep the current requested size". */
	if (params->lun_size_bytes != 0)
		be_lun->params.lun_size_bytes = params->lun_size_bytes;

	/* Replace the LUN's options wholesale with the caller's nvlist. */
	if (req->args_nvl != NULL) {
		nvlist_destroy(cbe_lun->options);
		cbe_lun->options = nvlist_clone(req->args_nvl);
	}

	/*
	 * Recompute the HA role: an explicit "ha_role" option wins,
	 * otherwise fall back to whether this shelf is the active one.
	 * Notify the CTL core only on an actual role transition.
	 */
	wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
	value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
	if (value != NULL) {
		if (strcmp(value, "primary") == 0)
			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
		else
			cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
	else
		cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
		if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
			ctl_lun_primary(cbe_lun);
		else
			ctl_lun_secondary(cbe_lun);
	}

	oldsize = be_lun->size_blocks;
	if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
	    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
		/*
		 * Primary role (or serialize-only HA): (re)open the backing
		 * store so its current geometry is picked up.  The reopen
		 * path depends on whether a vnode is already held and on
		 * whether it is a disk device or a regular file.
		 */
		if (be_lun->vn == NULL)
			error = ctl_be_block_open(be_lun, req);
		else if (vn_isdisk(be_lun->vn, &error))
			error = ctl_be_block_open_dev(be_lun, req);
		else if (be_lun->vn->v_type == VREG) {
			vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
			error = ctl_be_block_open_file(be_lun, req);
			VOP_UNLOCK(be_lun->vn, 0);
		} else
			error = EINVAL;
		/* Reconcile the NO_MEDIA flag with the open result. */
		if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) &&
		    be_lun->vn != NULL) {
			cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
			ctl_lun_has_media(cbe_lun);
		} else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 &&
		    be_lun->vn == NULL) {
			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
			ctl_lun_no_media(cbe_lun);
		}
		cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
	} else {
		/*
		 * Secondary role: stop advertising media, drain worker
		 * I/O, and close the backing store.
		 */
		if (be_lun->vn != NULL) {
			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
			ctl_lun_no_media(cbe_lun);
			taskqueue_drain_all(be_lun->io_taskqueue);
			error = ctl_be_block_close(be_lun);
		} else
			error = 0;
	}
	if (be_lun->size_blocks != oldsize)
		ctl_lun_capacity_changed(cbe_lun);

	/* Tell the user the exact size we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;

	sx_xunlock(&softc->modify_lock);
	/* A nonzero error here is reported as a warning, not a failure. */
	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
	return (0);

bailout_error:
	sx_xunlock(&softc->modify_lock);
	req->status = CTL_LUN_ERROR;
	return (0);
}
2584
/*
 * Final LUN teardown callback, invoked by the CTL core once the LUN has
 * been removed (installed as cbe_lun->lun_shutdown at creation time).
 * Releases the per-LUN resources, then either hands be_lun back to a
 * waiting ctl_be_block_rm() thread or frees it here.
 */
static void
ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun)
{
	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)cbe_lun;
	struct ctl_be_block_softc *softc = be_lun->softc;

	/* Wait out any queued work before destroying the taskqueue. */
	taskqueue_drain_all(be_lun->io_taskqueue);
	taskqueue_free(be_lun->io_taskqueue);
	if (be_lun->disk_stats != NULL)
		devstat_remove_entry(be_lun->disk_stats);
	nvlist_destroy(be_lun->cbe_lun.options);
	free(be_lun->dev_path, M_CTLBLK);
	mtx_destroy(&be_lun->queue_lock);
	mtx_destroy(&be_lun->io_lock);

	/*
	 * Handshake with ctl_be_block_rm(): UNCONFIGURED marks teardown
	 * complete.  If a remover is sleeping on be_lun (WAITING set),
	 * wake it and let it free the structure; otherwise nobody else
	 * holds a reference and we free it ourselves.
	 */
	mtx_lock(&softc->lock);
	be_lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
	if (be_lun->flags & CTL_BE_BLOCK_LUN_WAITING)
		wakeup(be_lun);
	else
		free(be_lun, M_CTLBLK);
	mtx_unlock(&softc->lock);
}
2608
/*
 * Dispatch a configuration-write SCSI command for a block-backend LUN.
 * Cache/UNMAP-style commands are queued to the worker threads; START
 * STOP UNIT and PREVENT ALLOW are completed inline.  Returns 0 for
 * queued work or CTL_RETVAL_COMPLETE for commands finished here.
 */
static int
ctl_be_block_config_write(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval;

	DPRINTF("entered\n");

	cbe_lun = CTL_BACKEND_LUN(io);
	be_lun = (struct ctl_be_block_lun *)cbe_lun;

	retval = 0;
	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
	case WRITE_SAME_10:
	case WRITE_SAME_16:
	case UNMAP:
		/*
		 * The upper level CTL code will filter out any CDBs with
		 * the immediate bit set and return the proper error.
		 *
		 * We don't really need to worry about what LBA range the
		 * user asked to be synced out.  When they issue a sync
		 * cache command, we'll sync out the whole thing.
		 */
		/* Hand the I/O to the worker threads for async handling. */
		mtx_lock(&be_lun->queue_lock);
		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
				   links);
		mtx_unlock(&be_lun->queue_lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
		break;
	case START_STOP_UNIT: {
		struct scsi_start_stop_unit *cdb;
		/*
		 * NOTE(review): req is passed to ctl_be_block_open() without
		 * being initialized — presumably only its error_str buffer
		 * is written there, but confirm against ctl_be_block_open().
		 */
		struct ctl_lun_req req;

		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
		/* Power-condition requests are accepted but ignored. */
		if ((cdb->how & SSS_PC_MASK) != 0) {
			ctl_set_success(&io->scsiio);
			ctl_config_write_done(io);
			break;
		}
		if (cdb->how & SSS_START) {
			/*
			 * START with LOEJ ("load") on a closed LUN: try to
			 * reopen the backing store and update media state
			 * to match the outcome.
			 */
			if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) {
				retval = ctl_be_block_open(be_lun, &req);
				cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
				if (retval == 0) {
					cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
					ctl_lun_has_media(cbe_lun);
				} else {
					cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
					ctl_lun_no_media(cbe_lun);
				}
			}
			ctl_start_lun(cbe_lun);
		} else {
			ctl_stop_lun(cbe_lun);
			/* STOP with LOEJ = eject: mark ejected and close. */
			if (cdb->how & SSS_LOEJ) {
				cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
				cbe_lun->flags |= CTL_LUN_FLAG_EJECTED;
				ctl_lun_ejected(cbe_lun);
				if (be_lun->vn != NULL)
					ctl_be_block_close(be_lun);
			}
		}

		ctl_set_success(&io->scsiio);
		ctl_config_write_done(io);
		break;
	}
	case PREVENT_ALLOW:
		/* Medium-removal prevention is a no-op for this backend. */
		ctl_set_success(&io->scsiio);
		ctl_config_write_done(io);
		break;
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_write_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}
2693
2694 static int
2695 ctl_be_block_config_read(union ctl_io *io)
2696 {
2697         struct ctl_be_block_lun *be_lun;
2698         int retval = 0;
2699
2700         DPRINTF("entered\n");
2701
2702         be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
2703
2704         switch (io->scsiio.cdb[0]) {
2705         case SERVICE_ACTION_IN:
2706                 if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
2707                         mtx_lock(&be_lun->queue_lock);
2708                         STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
2709                             &io->io_hdr, links);
2710                         mtx_unlock(&be_lun->queue_lock);
2711                         taskqueue_enqueue(be_lun->io_taskqueue,
2712                             &be_lun->io_task);
2713                         retval = CTL_RETVAL_QUEUED;
2714                         break;
2715                 }
2716                 ctl_set_invalid_field(&io->scsiio,
2717                                       /*sks_valid*/ 1,
2718                                       /*command*/ 1,
2719                                       /*field*/ 1,
2720                                       /*bit_valid*/ 1,
2721                                       /*bit*/ 4);
2722                 ctl_config_read_done(io);
2723                 retval = CTL_RETVAL_COMPLETE;
2724                 break;
2725         default:
2726                 ctl_set_invalid_opcode(&io->scsiio);
2727                 ctl_config_read_done(io);
2728                 retval = CTL_RETVAL_COMPLETE;
2729                 break;
2730         }
2731
2732         return (retval);
2733 }
2734
2735 static int
2736 ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb)
2737 {
2738         struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun;
2739         int retval;
2740
2741         retval = sbuf_printf(sb, "\t<num_threads>");
2742         if (retval != 0)
2743                 goto bailout;
2744         retval = sbuf_printf(sb, "%d", lun->num_threads);
2745         if (retval != 0)
2746                 goto bailout;
2747         retval = sbuf_printf(sb, "</num_threads>\n");
2748
2749 bailout:
2750         return (retval);
2751 }
2752
2753 static uint64_t
2754 ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname)
2755 {
2756         struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun;
2757
2758         if (lun->getattr == NULL)
2759                 return (UINT64_MAX);
2760         return (lun->getattr(lun, attrname));
2761 }
2762
2763 static int
2764 ctl_be_block_init(void)
2765 {
2766         struct ctl_be_block_softc *softc = &backend_block_softc;
2767
2768         sx_init(&softc->modify_lock, "ctlblock modify");
2769         mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
2770         softc->beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
2771             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2772         softc->buf_zone = uma_zcreate("ctlblock", CTLBLK_MAX_SEG,
2773             NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
2774         SLIST_INIT(&softc->lun_list);
2775         return (0);
2776 }
2777
2778
/*
 * Backend teardown on module unload: remove every remaining LUN via the
 * CTL core, then destroy the backend's zones and locks.  Always returns 0.
 */
static int
ctl_be_block_shutdown(void)
{
	struct ctl_be_block_softc *softc = &backend_block_softc;
	struct ctl_be_block_lun *lun;

	mtx_lock(&softc->lock);
	/* Pop LUNs one at a time; unlink each before releasing the lock. */
	while ((lun = SLIST_FIRST(&softc->lun_list)) != NULL) {
		SLIST_REMOVE_HEAD(&softc->lun_list, links);
		softc->num_luns--;
		/*
		 * Drop our lock here.  Since ctl_remove_lun() can call
		 * back into us, this could potentially lead to a recursive
		 * lock of the same mutex, which would cause a hang.
		 */
		mtx_unlock(&softc->lock);
		ctl_remove_lun(&lun->cbe_lun);
		mtx_lock(&softc->lock);
	}
	mtx_unlock(&softc->lock);
	/* All LUNs gone: release the global resources set up in init. */
	uma_zdestroy(softc->buf_zone);
	uma_zdestroy(softc->beio_zone);
	mtx_destroy(&softc->lock);
	sx_destroy(&softc->modify_lock);
	return (0);
}