/*-
 * Copyright (c) 2003 Silicon Graphics International Corp.
 * Copyright (c) 2009-2011 Spectra Logic Corporation
 * Copyright (c) 2012 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Edward Tomasz Napierala
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
 */
/*
 * CAM Target Layer driver backend for block devices.
 *
 * Author: Ken Merry <ken@FreeBSD.org>
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/kthread.h>
#include <sys/bio.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/disk.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/pcpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/devicestat.h>
#include <sys/sysctl.h>

#include <geom/geom.h>

#include <cam/cam.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_error.h>

/*
 * The idea here is that we'll allocate enough S/G space to hold a 16MB
 * I/O.  If we get an I/O larger than that, we'll reject it.
 */
#define CTLBLK_MAX_IO_SIZE      (16 * 1024 * 1024)
#define CTLBLK_MAX_SEGS         ((CTLBLK_MAX_IO_SIZE / MAXPHYS) + 1)
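/*
 * For example (illustrative, assuming the common MAXPHYS of 128KB):
 * 16MB / 128KB = 128 full-size segments, plus one, giving 129 entries.
 */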

#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif
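/*
 * DPRINTF is compiled out unless CTLBLK_DEBUG is defined at build time
 * (for instance via CFLAGS+=-DCTLBLK_DEBUG; an illustrative suggestion,
 * not a standard kernel option).
 */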

SDT_PROVIDER_DEFINE(cbb);

typedef enum {
        CTL_BE_BLOCK_LUN_UNCONFIGURED   = 0x01,
        CTL_BE_BLOCK_LUN_CONFIG_ERR     = 0x02,
        CTL_BE_BLOCK_LUN_WAITING        = 0x04,
        CTL_BE_BLOCK_LUN_MULTI_THREAD   = 0x08
} ctl_be_block_lun_flags;

typedef enum {
        CTL_BE_BLOCK_NONE,
        CTL_BE_BLOCK_DEV,
        CTL_BE_BLOCK_FILE
} ctl_be_block_type;

struct ctl_be_block_devdata {
        struct cdev *cdev;
        struct cdevsw *csw;
        int dev_ref;
};

struct ctl_be_block_filedata {
        struct ucred *cred;
};

union ctl_be_block_bedata {
        struct ctl_be_block_devdata dev;
        struct ctl_be_block_filedata file;
};

struct ctl_be_block_io;
struct ctl_be_block_lun;

typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
                               struct ctl_be_block_io *beio);

/*
 * Backend LUN structure.  There is a 1:1 mapping between a block device
 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
 */
struct ctl_be_block_lun {
        struct ctl_block_disk *disk;
        char lunname[32];
        char *dev_path;
        ctl_be_block_type dev_type;
        struct vnode *vn;
        union ctl_be_block_bedata backend;
        cbb_dispatch_t dispatch;
        cbb_dispatch_t lun_flush;
        struct mtx lock;
        uma_zone_t lun_zone;
        uint64_t size_blocks;
        uint64_t size_bytes;
        uint32_t blocksize;
        int blocksize_shift;
        struct ctl_be_block_softc *softc;
        struct devstat *disk_stats;
        ctl_be_block_lun_flags flags;
        STAILQ_ENTRY(ctl_be_block_lun) links;
        struct ctl_be_lun ctl_be_lun;
        struct taskqueue *io_taskqueue;
        struct task io_task;
        int num_threads;
        STAILQ_HEAD(, ctl_io_hdr) input_queue;
        STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
        STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
};

/*
 * Overall softc structure for the block backend module.
 */
struct ctl_be_block_softc {
        STAILQ_HEAD(, ctl_be_block_io)   beio_free_queue;
        struct mtx                       lock;
        int                              prealloc_beio;
        int                              num_disks;
        STAILQ_HEAD(, ctl_block_disk)    disk_list;
        int                              num_luns;
        STAILQ_HEAD(, ctl_be_block_lun)  lun_list;
};

static struct ctl_be_block_softc backend_block_softc;

/*
 * Per-I/O information.
 */
struct ctl_be_block_io {
        union ctl_io                    *io;
        struct ctl_sg_entry             sg_segs[CTLBLK_MAX_SEGS];
        struct iovec                    xiovecs[CTLBLK_MAX_SEGS];
        int                             bio_cmd;
        int                             bio_flags;
        int                             num_segs;
        int                             num_bios_sent;
        int                             num_bios_done;
        int                             send_complete;
        int                             num_errors;
        struct bintime                  ds_t0;
        devstat_tag_type                ds_tag_type;
        devstat_trans_flags             ds_trans_type;
        uint64_t                        io_len;
        uint64_t                        io_offset;
        struct ctl_be_block_softc       *softc;
        struct ctl_be_block_lun         *lun;
        STAILQ_ENTRY(ctl_be_block_io)   links;
};

static int cbb_num_threads = 14;
TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
            "CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
           &cbb_num_threads, 0, "Number of threads per backing file");
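/*
 * The thread count can be set at boot via loader.conf
 * (kern.cam.ctl.block.num_threads="14") or changed at runtime with
 * sysctl kern.cam.ctl.block.num_threads; the value above is only the
 * compiled-in default.
 */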

static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
static void ctl_free_beio(struct ctl_be_block_io *beio);
static int ctl_grow_beio(struct ctl_be_block_softc *softc, int count);
#if 0
static void ctl_shrink_beio(struct ctl_be_block_softc *softc);
#endif
static void ctl_complete_beio(struct ctl_be_block_io *beio);
static int ctl_be_block_move_done(union ctl_io *io);
static void ctl_be_block_biodone(struct bio *bio);
static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
                                    struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
                                       struct ctl_be_block_io *beio);
static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
                                   struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
                                      struct ctl_be_block_io *beio);
static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
                                     union ctl_io *io);
static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
                                  union ctl_io *io);
static void ctl_be_block_worker(void *context, int pending);
static int ctl_be_block_submit(union ctl_io *io);
static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
                              int flag, struct thread *td);
static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
                                  struct ctl_lun_req *req);
static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
                                 struct ctl_lun_req *req);
static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
static int ctl_be_block_open(struct ctl_be_block_softc *softc,
                             struct ctl_be_block_lun *be_lun,
                             struct ctl_lun_req *req);
static int ctl_be_block_create(struct ctl_be_block_softc *softc,
                               struct ctl_lun_req *req);
static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
                           struct ctl_lun_req *req);
static int ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
                                    struct ctl_lun_req *req);
static int ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
                                   struct ctl_lun_req *req);
static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
                               struct ctl_lun_req *req);
static void ctl_be_block_lun_shutdown(void *be_lun);
static void ctl_be_block_lun_config_status(void *be_lun,
                                           ctl_lun_config_status status);
static int ctl_be_block_config_write(union ctl_io *io);
static int ctl_be_block_config_read(union ctl_io *io);
static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
int ctl_be_block_init(void);

static struct ctl_backend_driver ctl_be_block_driver =
{
        .name = "block",
        .flags = CTL_BE_FLAG_HAS_CONFIG,
        .init = ctl_be_block_init,
        .data_submit = ctl_be_block_submit,
        .data_move_done = ctl_be_block_move_done,
        .config_read = ctl_be_block_config_read,
        .config_write = ctl_be_block_config_write,
        .ioctl = ctl_be_block_ioctl,
        .lun_info = ctl_be_block_lun_info
};

MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);

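/*
 * Grab a per-I/O structure off the softc's free queue, growing the pool
 * (with a blocking allocation) if the queue is empty.
 */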
static struct ctl_be_block_io *
ctl_alloc_beio(struct ctl_be_block_softc *softc)
{
        struct ctl_be_block_io *beio;
        int count;

        mtx_lock(&softc->lock);

        beio = STAILQ_FIRST(&softc->beio_free_queue);
        if (beio != NULL) {
                STAILQ_REMOVE(&softc->beio_free_queue, beio,
                              ctl_be_block_io, links);
        }
        mtx_unlock(&softc->lock);

        if (beio != NULL) {
                bzero(beio, sizeof(*beio));
                beio->softc = softc;
                return (beio);
        }

        for (;;) {

                count = ctl_grow_beio(softc, /*count*/ 10);

                /*
                 * This shouldn't be possible, since ctl_grow_beio() uses a
                 * blocking malloc.
                 */
                if (count == 0)
                        return (NULL);

                /*
                 * Since we have to drop the lock when we're allocating beio
                 * structures, it's possible someone else can come along and
                 * grab the beio structures we've just allocated.
                 */
                mtx_lock(&softc->lock);
                beio = STAILQ_FIRST(&softc->beio_free_queue);
                if (beio != NULL) {
                        STAILQ_REMOVE(&softc->beio_free_queue, beio,
                                      ctl_be_block_io, links);
                }
                mtx_unlock(&softc->lock);

                if (beio != NULL) {
                        bzero(beio, sizeof(*beio));
                        beio->softc = softc;
                        break;
                }
        }
        return (beio);
}

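/*
 * Release the S/G segment buffers attached to a beio (a NULL segment
 * address indicates a duplicate free, which is counted and reported)
 * and return the structure to the softc's free queue.
 */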
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
        struct ctl_be_block_softc *softc;
        int duplicate_free;
        int i;

        softc = beio->softc;
        duplicate_free = 0;

        for (i = 0; i < beio->num_segs; i++) {
                if (beio->sg_segs[i].addr == NULL)
                        duplicate_free++;

                uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
                beio->sg_segs[i].addr = NULL;
        }

        if (duplicate_free > 0) {
                printf("%s: %d duplicate frees out of %d segments\n", __func__,
                       duplicate_free, beio->num_segs);
        }
        mtx_lock(&softc->lock);
        STAILQ_INSERT_TAIL(&softc->beio_free_queue, beio, links);
        mtx_unlock(&softc->lock);
}

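/*
 * Add count freshly allocated beio structures to the free queue; returns
 * the number actually added.
 */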
static int
ctl_grow_beio(struct ctl_be_block_softc *softc, int count)
{
        int i;

        for (i = 0; i < count; i++) {
                struct ctl_be_block_io *beio;

                /*
                 * A blocking allocation with M_WAITOK cannot fail, and
                 * M_ZERO already clears the structure for us.
                 */
                beio = (struct ctl_be_block_io *)malloc(sizeof(*beio),
                                                        M_CTLBLK,
                                                        M_WAITOK | M_ZERO);
                beio->softc = softc;
                mtx_lock(&softc->lock);
                STAILQ_INSERT_TAIL(&softc->beio_free_queue, beio, links);
                mtx_unlock(&softc->lock);
        }

        return (i);
}

#if 0
static void
ctl_shrink_beio(struct ctl_be_block_softc *softc)
{
        struct ctl_be_block_io *beio, *beio_tmp;

        mtx_lock(&softc->lock);
        STAILQ_FOREACH_SAFE(beio, &softc->beio_free_queue, links, beio_tmp) {
                STAILQ_REMOVE(&softc->beio_free_queue, beio,
                              ctl_be_block_io, links);
                free(beio, M_CTLBLK);
        }
        mtx_unlock(&softc->lock);
}
#endif

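/*
 * Finish an I/O: close out the devstat transaction (counting the bytes
 * only on success), free the beio, and complete the CTL I/O.
 */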
static void
ctl_complete_beio(struct ctl_be_block_io *beio)
{
        union ctl_io *io;
        int io_len;

        io = beio->io;

        if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)
                io_len = beio->io_len;
        else
                io_len = 0;

        devstat_end_transaction(beio->lun->disk_stats,
                                /*bytes*/ io_len,
                                beio->ds_tag_type,
                                beio->ds_trans_type,
                                /*now*/ NULL,
                                /*then*/ &beio->ds_t0);

        ctl_free_beio(beio);
        ctl_done(io);
}

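/*
 * Called by CTL when a DMA to or from the initiator side finishes.  For
 * reads (and failed writes) the command is complete; for successful
 * writes, queue the backend I/O to the worker thread, since this routine
 * is typically called in an interrupt context where we cannot block.
 */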
static int
ctl_be_block_move_done(union ctl_io *io)
{
        struct ctl_be_block_io *beio;
        struct ctl_be_block_lun *be_lun;
#ifdef CTL_TIME_IO
        struct bintime cur_bt;
#endif

        beio = (struct ctl_be_block_io *)
                io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;

        be_lun = beio->lun;

        DPRINTF("entered\n");

#ifdef CTL_TIME_IO
        getbintime(&cur_bt);
        bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
        bintime_add(&io->io_hdr.dma_bt, &cur_bt);
        io->io_hdr.num_dmas++;
#endif

        /*
         * We set status at this point for read commands, and write
         * commands with errors.
         */
        if ((beio->bio_cmd == BIO_READ)
         && (io->io_hdr.port_status == 0)
         && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
         && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE))
                ctl_set_success(&io->scsiio);
        else if ((io->io_hdr.port_status != 0)
              && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
              && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
                /*
                 * For hardware error sense keys, the sense key
                 * specific value is defined to be a retry count,
                 * but we use it to pass back an internal FETD
                 * error code.  XXX KDM  Hopefully the FETD is only
                 * using 16 bits for an error code, since that's
                 * all the space we have in the sks field.
                 */
                ctl_set_internal_failure(&io->scsiio,
                                         /*sks_valid*/ 1,
                                         /*retry_count*/
                                         io->io_hdr.port_status);
        }

        /*
         * If this is a read, or a write with errors, it is done.
         */
        if ((beio->bio_cmd == BIO_READ)
         || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
         || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
                ctl_complete_beio(beio);
                return (0);
        }

        /*
         * At this point, we have a write and the DMA completed
         * successfully.  We now have to queue it to the task queue to
         * execute the backend I/O.  That is because we do blocking
         * memory allocations, and in the file backing case, blocking I/O.
         * This move done routine is generally called in the SIM's
         * interrupt context, and therefore we cannot block.
         */
        mtx_lock(&be_lun->lock);
        /*
         * XXX KDM make sure that links is okay to use at this point.
         * Otherwise, we either need to add another field to ctl_io_hdr,
         * or deal with resource allocation here.
         */
        STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
        mtx_unlock(&be_lun->lock);

        taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

        return (0);
}

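/*
 * Completion callback for bios issued by the device backend.  The last
 * bio to complete (once send_complete is set) either finishes the beio
 * or, for reads, starts the DMA back to the initiator.
 */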
static void
ctl_be_block_biodone(struct bio *bio)
{
        struct ctl_be_block_io *beio;
        struct ctl_be_block_lun *be_lun;
        union ctl_io *io;

        beio = bio->bio_caller1;
        be_lun = beio->lun;
        io = beio->io;

        DPRINTF("entered\n");

        mtx_lock(&be_lun->lock);
        if (bio->bio_error != 0)
                beio->num_errors++;

        beio->num_bios_done++;

        /*
         * XXX KDM will this cause WITNESS to complain?  Holding a lock
         * during the free might cause it to complain.
         */
        g_destroy_bio(bio);

        /*
         * If the send complete bit isn't set, or we aren't the last I/O to
         * complete, then we're done.
         */
        if ((beio->send_complete == 0)
         || (beio->num_bios_done < beio->num_bios_sent)) {
                mtx_unlock(&be_lun->lock);
                return;
        }

        /*
         * At this point, we've verified that we are the last I/O to
         * complete, so it's safe to drop the lock.
         */
        mtx_unlock(&be_lun->lock);

        /*
         * If there are any errors from the backing device, we fail the
         * entire I/O with a medium error.
         */
        if (beio->num_errors > 0) {
                if (beio->bio_cmd == BIO_FLUSH) {
                        /* XXX KDM is there a better error here? */
                        ctl_set_internal_failure(&io->scsiio,
                                                 /*sks_valid*/ 1,
                                                 /*retry_count*/ 0xbad2);
                } else
                        ctl_set_medium_error(&io->scsiio);
                ctl_complete_beio(beio);
                return;
        }

        /*
         * If this is a write or a flush, we're all done.
         * If this is a read, we can now send the data to the user.
         */
        if ((beio->bio_cmd == BIO_WRITE)
         || (beio->bio_cmd == BIO_FLUSH)) {
                ctl_set_success(&io->scsiio);
                ctl_complete_beio(beio);
        } else {
                io->scsiio.be_move_done = ctl_be_block_move_done;
                io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
                io->scsiio.kern_data_len = beio->io_len;
                io->scsiio.kern_total_len = beio->io_len;
                io->scsiio.kern_rel_offset = 0;
                io->scsiio.kern_data_resid = 0;
                io->scsiio.kern_sg_entries = beio->num_segs;
                io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
                getbintime(&io->io_hdr.dma_start_bt);
#endif
                ctl_datamove(io);
        }
}

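/*
 * Implement SYNCHRONIZE CACHE for a file-backed LUN by fsyncing the
 * backing vnode.
 */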
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
                        struct ctl_be_block_io *beio)
{
        union ctl_io *io;
        struct mount *mountpoint;
        int vfs_is_locked, error, lock_flags;

        DPRINTF("entered\n");

        io = beio->io;

        vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);

        (void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

        if (MNT_SHARED_WRITES(mountpoint)
         || ((mountpoint == NULL)
          && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
                lock_flags = LK_SHARED;
        else
                lock_flags = LK_EXCLUSIVE;

        vn_lock(be_lun->vn, lock_flags | LK_RETRY);

        binuptime(&beio->ds_t0);
        devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

        error = VOP_FSYNC(be_lun->vn, MNT_WAIT, curthread);
        VOP_UNLOCK(be_lun->vn, 0);

        vn_finished_write(mountpoint);

        VFS_UNLOCK_GIANT(vfs_is_locked);

        if (error == 0)
                ctl_set_success(&io->scsiio);
        else {
                /* XXX KDM is there a better error here? */
                ctl_set_internal_failure(&io->scsiio,
                                         /*sks_valid*/ 1,
                                         /*retry_count*/ 0xbad1);
        }

        ctl_complete_beio(beio);
}

SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, file_done, file_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, file_done, "uint64_t");

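/*
 * Read or write a file-backed LUN by building a kernel uio from the
 * beio's S/G list and calling VOP_READ()/VOP_WRITE() on the backing
 * vnode.
 */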
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
                           struct ctl_be_block_io *beio)
{
        struct ctl_be_block_filedata *file_data;
        union ctl_io *io;
        struct uio xuio;
        struct iovec *xiovec;
        int vfs_is_locked, flags;
        int error, i;

        DPRINTF("entered\n");

        file_data = &be_lun->backend.file;
        io = beio->io;
        flags = beio->bio_flags;

        if (beio->bio_cmd == BIO_READ) {
                SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
        } else {
                SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
        }

        bzero(&xuio, sizeof(xuio));
        if (beio->bio_cmd == BIO_READ)
                xuio.uio_rw = UIO_READ;
        else
                xuio.uio_rw = UIO_WRITE;

        xuio.uio_offset = beio->io_offset;
        xuio.uio_resid = beio->io_len;
        xuio.uio_segflg = UIO_SYSSPACE;
        xuio.uio_iov = beio->xiovecs;
        xuio.uio_iovcnt = beio->num_segs;
        xuio.uio_td = curthread;

        for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
                xiovec->iov_base = beio->sg_segs[i].addr;
                xiovec->iov_len = beio->sg_segs[i].len;
        }

        vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);
        if (beio->bio_cmd == BIO_READ) {
                vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

                binuptime(&beio->ds_t0);
                devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

                /*
                 * UFS pays attention to IO_DIRECT for reads.  If the
                 * DIRECTIO option is configured into the kernel, it calls
                 * ffs_rawread().  But that only works for single-segment
                 * uios with user space addresses.  In our case, with a
                 * kernel uio, it still reads into the buffer cache, but it
                 * will just try to release the buffer from the cache later
                 * on in ffs_read().
                 *
                 * ZFS does not pay attention to IO_DIRECT for reads.
                 *
                 * UFS does not pay attention to IO_SYNC for reads.
                 *
                 * ZFS pays attention to IO_SYNC (which translates into the
                 * Solaris define FRSYNC for zfs_read()) for reads.  It
                 * attempts to sync the file before reading.
                 *
                 * So, to attempt to provide some barrier semantics in the
                 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
                 */
                error = VOP_READ(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
                                 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);

                VOP_UNLOCK(be_lun->vn, 0);
        } else {
                struct mount *mountpoint;
                int lock_flags;

                (void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

                if (MNT_SHARED_WRITES(mountpoint)
                 || ((mountpoint == NULL)
                  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
                        lock_flags = LK_SHARED;
                else
                        lock_flags = LK_EXCLUSIVE;

                vn_lock(be_lun->vn, lock_flags | LK_RETRY);

                binuptime(&beio->ds_t0);
                devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

                /*
                 * UFS pays attention to IO_DIRECT for writes.  The write
                 * is done asynchronously.  (Normally the write would just
                 * get put into the cache.)
                 *
                 * UFS pays attention to IO_SYNC for writes.  It will
                 * attempt to write the buffer out synchronously if that
                 * flag is set.
                 *
                 * ZFS does not pay attention to IO_DIRECT for writes.
                 *
                 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
                 * for writes.  It will flush the transaction from the
                 * cache before returning.
                 *
                 * So if we've got the BIO_ORDERED flag set, we want
                 * IO_SYNC in either the UFS or ZFS case.
                 */
                error = VOP_WRITE(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
                                  IO_SYNC : 0, file_data->cred);
                VOP_UNLOCK(be_lun->vn, 0);

                vn_finished_write(mountpoint);
        }
        VFS_UNLOCK_GIANT(vfs_is_locked);

        /*
         * If we got an error, set the sense data to "MEDIUM ERROR" and
         * return the I/O to the user.
         */
        if (error != 0) {
                char path_str[32];

                ctl_scsi_path_string(io, path_str, sizeof(path_str));
                /*
                 * XXX KDM ZFS returns ENOSPC when the underlying
                 * filesystem fills up.  What kind of SCSI error should we
                 * return for that?
                 */
                printf("%s%s command returned errno %d\n", path_str,
                       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", error);
                ctl_set_medium_error(&io->scsiio);
                ctl_complete_beio(beio);
                return;
        }

        /*
         * If this is a write, we're all done.
         * If this is a read, we can now send the data to the user.
         */
        if (beio->bio_cmd == BIO_WRITE) {
                ctl_set_success(&io->scsiio);
                SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
                ctl_complete_beio(beio);
        } else {
                SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
                io->scsiio.be_move_done = ctl_be_block_move_done;
                io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
                io->scsiio.kern_data_len = beio->io_len;
                io->scsiio.kern_total_len = beio->io_len;
                io->scsiio.kern_rel_offset = 0;
                io->scsiio.kern_data_resid = 0;
                io->scsiio.kern_sg_entries = beio->num_segs;
                io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
                getbintime(&io->io_hdr.dma_start_bt);
#endif
                ctl_datamove(io);
        }
}

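/*
 * Implement SYNCHRONIZE CACHE for a device-backed LUN by sending a
 * single ordered BIO_FLUSH to the backing device.
 */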
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
                       struct ctl_be_block_io *beio)
{
        struct bio *bio;
        union ctl_io *io;
        struct ctl_be_block_devdata *dev_data;

        dev_data = &be_lun->backend.dev;
        io = beio->io;

        DPRINTF("entered\n");

        /* This can't fail, it's a blocking allocation. */
        bio = g_alloc_bio();

        bio->bio_cmd        = BIO_FLUSH;
        bio->bio_flags     |= BIO_ORDERED;
        bio->bio_dev        = dev_data->cdev;
        bio->bio_offset     = 0;
        bio->bio_data       = NULL;
        bio->bio_done       = ctl_be_block_biodone;
        bio->bio_caller1    = beio;
        bio->bio_pblkno     = 0;

        /*
         * We don't need to acquire the LUN lock here, because we are only
         * sending one bio, and so there is no other context to synchronize
         * with.
         */
        beio->num_bios_sent = 1;
        beio->send_complete = 1;

        binuptime(&beio->ds_t0);
        devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);

        (*dev_data->csw->d_strategy)(bio);
}

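/*
 * Read or write a device-backed LUN: carve the beio's S/G list into
 * bios no larger than the device's maximum I/O size and hand them to
 * the device's strategy routine.
 */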
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
                          struct ctl_be_block_io *beio)
{
        int i;
        struct bio *bio;
        struct ctl_be_block_devdata *dev_data;
        off_t cur_offset;
        int max_iosize;

        DPRINTF("entered\n");

        dev_data = &be_lun->backend.dev;

        /*
         * We have to limit our I/O size to the maximum supported by the
         * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
         * set it properly, use DFLTPHYS.
         */
        max_iosize = dev_data->cdev->si_iosize_max;
        if (max_iosize < PAGE_SIZE)
                max_iosize = DFLTPHYS;

        cur_offset = beio->io_offset;

        /*
         * XXX KDM need to accurately reflect the number of I/Os outstanding
         * to a device.
         */
        binuptime(&beio->ds_t0);
        devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);

        for (i = 0; i < beio->num_segs; i++) {
                size_t cur_size;
                uint8_t *cur_ptr;

                cur_size = beio->sg_segs[i].len;
                cur_ptr = beio->sg_segs[i].addr;

                while (cur_size > 0) {
                        /* This can't fail, it's a blocking allocation. */
                        bio = g_alloc_bio();

                        KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

                        bio->bio_cmd = beio->bio_cmd;
                        bio->bio_flags |= beio->bio_flags;
                        bio->bio_dev = dev_data->cdev;
                        bio->bio_caller1 = beio;
                        bio->bio_length = min(cur_size, max_iosize);
                        bio->bio_offset = cur_offset;
                        bio->bio_data = cur_ptr;
                        bio->bio_done = ctl_be_block_biodone;
                        bio->bio_pblkno = cur_offset / be_lun->blocksize;

                        cur_offset += bio->bio_length;
                        cur_ptr += bio->bio_length;
                        cur_size -= bio->bio_length;

                        /*
                         * Make sure we set the complete bit just before we
                         * issue the last bio so we don't wind up with a
                         * race.
                         *
                         * Use the LUN mutex here instead of a combination
                         * of atomic variables for simplicity.
                         *
                         * XXX KDM we could have a per-IO lock, but that
                         * would cause additional per-IO setup and teardown
                         * overhead.  Hopefully there won't be too much
                         * contention on the LUN lock.
                         */
                        mtx_lock(&be_lun->lock);

                        beio->num_bios_sent++;

                        if ((i == beio->num_segs - 1)
                         && (cur_size == 0))
                                beio->send_complete = 1;

                        mtx_unlock(&be_lun->lock);

                        (*dev_data->csw->d_strategy)(bio);
                }
        }
}

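/*
 * Dispatch a config write; SYNCHRONIZE CACHE is the only CDB expected
 * here, and it is handed to the LUN's flush routine.
 */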
static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
                         union ctl_io *io)
{
        struct ctl_be_block_io *beio;
        struct ctl_be_block_softc *softc;

        DPRINTF("entered\n");

        softc = be_lun->softc;
        beio = ctl_alloc_beio(softc);
        if (beio == NULL) {
                /*
                 * This should not happen.  ctl_alloc_beio() will call
                 * ctl_grow_beio() with a blocking malloc as needed.
                 * A malloc with M_WAITOK should not fail.
                 */
                ctl_set_busy(&io->scsiio);
                ctl_done(io);
                return;
        }

        beio->io = io;
        beio->softc = softc;
        beio->lun = be_lun;
        io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;

        switch (io->scsiio.cdb[0]) {
        case SYNCHRONIZE_CACHE:
        case SYNCHRONIZE_CACHE_16:
                beio->ds_trans_type = DEVSTAT_NO_DATA;
                beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
                beio->io_len = 0;
                be_lun->lun_flush(be_lun, beio);
                break;
        default:
                panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
                break;
        }
}

SDT_PROBE_DEFINE1(cbb, kernel, read, start, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, start, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, alloc_done, "uint64_t");

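/*
 * Main read/write path: validate the I/O size, set up a beio with an
 * S/G list sized for the request, then either dispatch it (reads) or
 * fetch the data from the initiator first (writes).
 */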
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
                      union ctl_io *io)
{
        struct ctl_be_block_io *beio;
        struct ctl_be_block_softc *softc;
        struct ctl_lba_len lbalen;
        uint64_t len_left, io_size_bytes;
        int i;

        softc = be_lun->softc;

        DPRINTF("entered\n");

        if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
                SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
        } else {
                SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
        }

        memcpy(&lbalen, io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes,
               sizeof(lbalen));

        io_size_bytes = lbalen.len * be_lun->blocksize;

        /*
         * XXX KDM this is temporary, until we implement chaining of beio
         * structures and multiple datamove calls to move all the data in
         * or out.
         */
        if (io_size_bytes > CTLBLK_MAX_IO_SIZE) {
                printf("%s: IO length %ju > max io size %u\n", __func__,
                       (uintmax_t)io_size_bytes, CTLBLK_MAX_IO_SIZE);
                ctl_set_invalid_field(&io->scsiio,
                                      /*sks_valid*/ 0,
                                      /*command*/ 1,
                                      /*field*/ 0,
                                      /*bit_valid*/ 0,
                                      /*bit*/ 0);
                ctl_done(io);
                return;
        }

        beio = ctl_alloc_beio(softc);
        if (beio == NULL) {
                /*
                 * This should not happen.  ctl_alloc_beio() will call
                 * ctl_grow_beio() with a blocking malloc as needed.
                 * A malloc with M_WAITOK should not fail.
                 */
                ctl_set_busy(&io->scsiio);
                ctl_done(io);
                return;
        }

        beio->io = io;
        beio->softc = softc;
        beio->lun = be_lun;
        io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;

        /*
         * If the I/O came down with an ordered or head of queue tag, set
         * the BIO_ORDERED attribute.  For head of queue tags, that's
         * pretty much the best we can do.
         *
         * XXX KDM we don't have a great way to easily know about the FUA
         * bit right now (it is decoded in ctl_read_write(), but we don't
         * pass that knowledge to the backend), and in any case we would
         * need to determine how to handle it.
         */
        if ((io->scsiio.tag_type == CTL_TAG_ORDERED)
         || (io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE))
                beio->bio_flags = BIO_ORDERED;

        switch (io->scsiio.tag_type) {
        case CTL_TAG_ORDERED:
                beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
                break;
        case CTL_TAG_HEAD_OF_QUEUE:
                beio->ds_tag_type = DEVSTAT_TAG_HEAD;
                break;
        case CTL_TAG_UNTAGGED:
        case CTL_TAG_SIMPLE:
        case CTL_TAG_ACA:
        default:
                beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
                break;
        }

        /*
         * This path handles read and write only.  The config write path
         * handles flush operations.
         */
        if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
                beio->bio_cmd = BIO_READ;
                beio->ds_trans_type = DEVSTAT_READ;
        } else {
                beio->bio_cmd = BIO_WRITE;
                beio->ds_trans_type = DEVSTAT_WRITE;
        }

        beio->io_len = lbalen.len * be_lun->blocksize;
        beio->io_offset = lbalen.lba * be_lun->blocksize;

        DPRINTF("%s at LBA %jx len %u\n",
                (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
                (uintmax_t)lbalen.lba, lbalen.len);

        for (i = 0, len_left = io_size_bytes; i < CTLBLK_MAX_SEGS &&
             len_left > 0; i++) {

                /*
                 * Setup the S/G entry for this chunk.
                 */
                beio->sg_segs[i].len = min(MAXPHYS, len_left);
                beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
                /*
                 * uma_zalloc() can in theory return NULL even with M_WAITOK
                 * if it can't pull more memory into the zone.
                 */
                if (beio->sg_segs[i].addr == NULL) {
                        ctl_set_busy(&io->scsiio);
                        ctl_complete_beio(beio);
                        return;
                }

                DPRINTF("segment %d addr %p len %zd\n", i,
                        beio->sg_segs[i].addr, beio->sg_segs[i].len);

                beio->num_segs++;
                len_left -= beio->sg_segs[i].len;
        }

        /*
         * For the read case, we need to read the data into our buffers and
         * then we can send it back to the user.  For the write case, we
         * need to get the data from the user first.
         */
        if (beio->bio_cmd == BIO_READ) {
                SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
                be_lun->dispatch(be_lun, beio);
        } else {
                SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
                io->scsiio.be_move_done = ctl_be_block_move_done;
                io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
                io->scsiio.kern_data_len = beio->io_len;
                io->scsiio.kern_total_len = beio->io_len;
                io->scsiio.kern_rel_offset = 0;
                io->scsiio.kern_data_resid = 0;
                io->scsiio.kern_sg_entries = beio->num_segs;
                io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
                getbintime(&io->io_hdr.dma_start_bt);
#endif
                ctl_datamove(io);
        }
}

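/*
 * Worker routine, run from the LUN's task queue.  The queues are drained
 * in priority order: datamove completions first, then config writes,
 * then new I/O from the input queue.
 */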
static void
ctl_be_block_worker(void *context, int pending)
{
        struct ctl_be_block_lun *be_lun;
        struct ctl_be_block_softc *softc;
        union ctl_io *io;

        be_lun = (struct ctl_be_block_lun *)context;
        softc = be_lun->softc;

        DPRINTF("entered\n");

        mtx_lock(&be_lun->lock);
        for (;;) {
                io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
                if (io != NULL) {
                        struct ctl_be_block_io *beio;

                        DPRINTF("datamove queue\n");

                        STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
                                      ctl_io_hdr, links);

                        mtx_unlock(&be_lun->lock);

                        beio = (struct ctl_be_block_io *)
                            io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;

                        be_lun->dispatch(be_lun, beio);

                        mtx_lock(&be_lun->lock);
                        continue;
                }
                io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
                if (io != NULL) {

                        DPRINTF("config write queue\n");

                        STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
                                      ctl_io_hdr, links);

                        mtx_unlock(&be_lun->lock);

                        ctl_be_block_cw_dispatch(be_lun, io);

                        mtx_lock(&be_lun->lock);
                        continue;
                }
                io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
                if (io != NULL) {
                        DPRINTF("input queue\n");

                        STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
                                      ctl_io_hdr, links);
                        mtx_unlock(&be_lun->lock);

                        /*
                         * We must drop the lock, since this routine and
                         * its children may sleep.
                         */
                        ctl_be_block_dispatch(be_lun, io);

                        mtx_lock(&be_lun->lock);
                        continue;
                }

                /*
                 * If we get here, there is no work left in the queues, so
                 * just break out and let the task queue go to sleep.
                 */
                break;
        }
        mtx_unlock(&be_lun->lock);
}

/*
 * Entry point from CTL to the backend for I/O.  We queue everything to a
 * work thread, so this just puts the I/O on a queue and wakes up the
 * thread.
 */
static int
ctl_be_block_submit(union ctl_io *io)
{
        struct ctl_be_block_lun *be_lun;
        struct ctl_be_lun *ctl_be_lun;
        int retval;

        DPRINTF("entered\n");

        retval = CTL_RETVAL_COMPLETE;

        ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
                CTL_PRIV_BACKEND_LUN].ptr;
        be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;

        /*
         * Make sure we only get SCSI I/O.
         */
        KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
                "%#x) encountered", io->io_hdr.io_type));

        mtx_lock(&be_lun->lock);
        /*
         * XXX KDM make sure that links is okay to use at this point.
         * Otherwise, we either need to add another field to ctl_io_hdr,
         * or deal with resource allocation here.
         */
        STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
        mtx_unlock(&be_lun->lock);

        taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

        return (retval);
}

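/*
 * Handle LUN create, remove, and modify requests from userland; the
 * ctladm(8) utility issues these through the CTL character device.
 */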
static int
ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
                   int flag, struct thread *td)
{
        struct ctl_be_block_softc *softc;
        int error;

        softc = &backend_block_softc;

        error = 0;

        switch (cmd) {
        case CTL_LUN_REQ: {
                struct ctl_lun_req *lun_req;

                lun_req = (struct ctl_lun_req *)addr;

                switch (lun_req->reqtype) {
                case CTL_LUNREQ_CREATE:
                        error = ctl_be_block_create(softc, lun_req);
                        break;
                case CTL_LUNREQ_RM:
                        error = ctl_be_block_rm(softc, lun_req);
                        break;
                case CTL_LUNREQ_MODIFY:
                        error = ctl_be_block_modify(softc, lun_req);
                        break;
                default:
                        lun_req->status = CTL_LUN_ERROR;
                        snprintf(lun_req->error_str, sizeof(lun_req->error_str),
                                 "%s: invalid LUN request type %d", __func__,
                                 lun_req->reqtype);
                        break;
                }
                break;
        }
        default:
                error = ENOTTY;
                break;
        }

        return (error);
}

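/*
 * Set up a file-backed LUN: record the dispatch routines, size the LUN
 * from the file (or the user-supplied size), and default to a 512-byte
 * exported sector size.
 */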
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
        struct ctl_be_block_filedata *file_data;
        struct ctl_lun_create_params *params;
        struct vattr                  vattr;
        int                           error;

        error = 0;
        file_data = &be_lun->backend.file;
        params = &req->reqdata.create;

        be_lun->dev_type = CTL_BE_BLOCK_FILE;
        be_lun->dispatch = ctl_be_block_dispatch_file;
        be_lun->lun_flush = ctl_be_block_flush_file;

        error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
        if (error != 0) {
                snprintf(req->error_str, sizeof(req->error_str),
                         "error calling VOP_GETATTR() for file %s",
                         be_lun->dev_path);
                return (error);
        }

        /*
         * Verify that we have the ability to upgrade to exclusive
         * access on this file so we can trap errors at open instead
         * of reporting them during first access.
         */
        if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
                vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
                if (be_lun->vn->v_iflag & VI_DOOMED) {
                        error = EBADF;
                        snprintf(req->error_str, sizeof(req->error_str),
                                 "error locking file %s", be_lun->dev_path);
                        return (error);
                }
        }

        file_data->cred = crhold(curthread->td_ucred);
        if (params->lun_size_bytes != 0)
                be_lun->size_bytes = params->lun_size_bytes;
        else
                be_lun->size_bytes = vattr.va_size;
        /*
         * We set the multi thread flag for file operations because all
         * filesystems (in theory) are capable of allowing multiple readers
         * of a file at once.  So we want to get the maximum possible
         * concurrency.
         */
        be_lun->flags |= CTL_BE_BLOCK_LUN_MULTI_THREAD;

        /*
         * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
         * With ZFS, it is 131072 bytes.  Block sizes that large don't work
         * with disklabel and UFS on FreeBSD at least.  Large block sizes
         * may not work with other OSes as well.  So just export a sector
         * size of 512 bytes, which should work with any OS or
         * application.  Since our backing is a file, any block size will
         * work fine for the backing store.
         */
#if 0
        be_lun->blocksize = vattr.va_blocksize;
#endif
        if (params->blocksize_bytes != 0)
                be_lun->blocksize = params->blocksize_bytes;
        else
                be_lun->blocksize = 512;

        /*
         * Sanity check.  The media size has to be at least one
         * sector long.
         */
        if (be_lun->size_bytes < be_lun->blocksize) {
                error = EINVAL;
                snprintf(req->error_str, sizeof(req->error_str),
                         "file %s size %ju < block size %u", be_lun->dev_path,
                         (uintmax_t)be_lun->size_bytes, be_lun->blocksize);
        }
        return (error);
}

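/*
 * Set up a device-backed LUN: take a reference on the cdev, query the
 * sector size and media size via d_ioctl(), and validate any
 * user-requested block size against the device's.
 */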
static int
ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
        struct ctl_lun_create_params *params;
        struct vattr                  vattr;
        struct cdev                  *dev;
        struct cdevsw                *devsw;
        int                           error;

        params = &req->reqdata.create;

        be_lun->dev_type = CTL_BE_BLOCK_DEV;
        be_lun->dispatch = ctl_be_block_dispatch_dev;
        be_lun->lun_flush = ctl_be_block_flush_dev;
        be_lun->backend.dev.cdev = be_lun->vn->v_rdev;
        be_lun->backend.dev.csw = dev_refthread(be_lun->backend.dev.cdev,
                                                &be_lun->backend.dev.dev_ref);
        if (be_lun->backend.dev.csw == NULL)
                panic("Unable to retrieve device switch");

        error = VOP_GETATTR(be_lun->vn, &vattr, NOCRED);
        if (error) {
                snprintf(req->error_str, sizeof(req->error_str),
                         "%s: error getting vnode attributes for device %s",
                         __func__, be_lun->dev_path);
                return (error);
        }

        dev = be_lun->vn->v_rdev;
        devsw = dev->si_devsw;
        if (!devsw->d_ioctl) {
                snprintf(req->error_str, sizeof(req->error_str),
                         "%s: no d_ioctl for device %s!", __func__,
                         be_lun->dev_path);
                return (ENODEV);
        }

        error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
                               (caddr_t)&be_lun->blocksize, FREAD,
                               curthread);
        if (error) {
                snprintf(req->error_str, sizeof(req->error_str),
                         "%s: error %d returned for DIOCGSECTORSIZE ioctl "
                         "on %s!", __func__, error, be_lun->dev_path);
                return (error);
        }

        /*
         * If the user has asked for a blocksize that is greater than the
         * backing device's blocksize, we can do it only if the blocksize
         * the user is asking for is an even multiple of the underlying
         * device's blocksize.
         */
1432         if ((params->blocksize_bytes != 0)
1433          && (params->blocksize_bytes > be_lun->blocksize)) {
1434                 uint32_t bs_multiple, tmp_blocksize;
1435
1436                 bs_multiple = params->blocksize_bytes / be_lun->blocksize;
1437
1438                 tmp_blocksize = bs_multiple * be_lun->blocksize;
1439
1440                 if (tmp_blocksize == params->blocksize_bytes) {
1441                         be_lun->blocksize = params->blocksize_bytes;
1442                 } else {
1443                         snprintf(req->error_str, sizeof(req->error_str),
1444                                  "%s: requested blocksize %u is not an even "
1445                                  "multiple of backing device blocksize %u",
1446                                  __func__, params->blocksize_bytes,
1447                                  be_lun->blocksize);
1448                         return (EINVAL);
1450                 }
1451         } else if ((params->blocksize_bytes != 0)
1452                 && (params->blocksize_bytes != be_lun->blocksize)) {
1453                 snprintf(req->error_str, sizeof(req->error_str),
1454                          "%s: requested blocksize %u < backing device "
1455                          "blocksize %u", __func__, params->blocksize_bytes,
1456                          be_lun->blocksize);
1457                 return (EINVAL);
1458         }
1459
1460         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
1461                                (caddr_t)&be_lun->size_bytes, FREAD,
1462                                curthread);
1463         if (error) {
1464                 snprintf(req->error_str, sizeof(req->error_str),
1465                          "%s: error %d returned for DIOCGMEDIASIZE "
1466                          "ioctl on %s!", __func__, error,
1467                          be_lun->dev_path);
1468                 return (error);
1469         }
1470
1471         if (params->lun_size_bytes != 0) {
1472                 if (params->lun_size_bytes > be_lun->size_bytes) {
1473                         snprintf(req->error_str, sizeof(req->error_str),
1474                                  "%s: requested LUN size %ju > backing device "
1475                                  "size %ju", __func__,
1476                                  (uintmax_t)params->lun_size_bytes,
1477                                  (uintmax_t)be_lun->size_bytes);
1478                         return (EINVAL);
1479                 }
1480
1481                 be_lun->size_bytes = params->lun_size_bytes;
1482         }
1483
1484         return (0);
1485 }
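
/*
 * Illustrative sketch, not driver code: the DIOCGSECTORSIZE and
 * DIOCGMEDIASIZE ioctls used above are also available from userland via
 * <sys/disk.h>.  This standalone program (guarded out of the kernel
 * build) queries a hypothetical device and applies the same
 * even-multiple blocksize test as ctl_be_block_open_dev().
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

int
main(void)
{
        uint32_t req_bs = 4096;         /* hypothetical requested blocksize */
        u_int secsize;
        off_t mediasize;
        int fd;

        fd = open("/dev/ada0", O_RDONLY);       /* hypothetical device */
        if (fd == -1)
                return (1);
        if (ioctl(fd, DIOCGSECTORSIZE, &secsize) == -1 ||
            ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1) {
                close(fd);
                return (1);
        }
        /* Same even-multiple test as in ctl_be_block_open_dev(). */
        if (req_bs > secsize && (req_bs / secsize) * secsize != req_bs)
                fprintf(stderr, "%u is not an even multiple of %u\n",
                        req_bs, secsize);
        printf("%u bytes/sector, %jd bytes of media\n", secsize,
               (intmax_t)mediasize);
        close(fd);
        return (0);
}
#endif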
1486
1487 static int
1488 ctl_be_block_close(struct ctl_be_block_lun *be_lun)
1489 {
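        /*
         * The caller may hold Giant; DROP_GIANT() sheds any recursion
         * before we potentially sleep below, and PICKUP_GIANT() restores
         * it on the way out.  VFS_LOCK_GIANT() then reacquires Giant only
         * for filesystems that still require it.
         */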
1490         DROP_GIANT();
1491         if (be_lun->vn) {
1492                 int flags = FREAD | FWRITE;
1493                 int vfs_is_locked = 0;
1494
1495                 switch (be_lun->dev_type) {
1496                 case CTL_BE_BLOCK_DEV:
1497                         if (be_lun->backend.dev.csw) {
1498                                 dev_relthread(be_lun->backend.dev.cdev,
1499                                               be_lun->backend.dev.dev_ref);
1500                                 be_lun->backend.dev.csw  = NULL;
1501                                 be_lun->backend.dev.cdev = NULL;
1502                         }
1503                         break;
1504                 case CTL_BE_BLOCK_FILE:
1505                         vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);
1506                         break;
1507                 case CTL_BE_BLOCK_NONE:
1508                 default:
1509                         panic("Unexpected backend type.");
1510                         break;
1511                 }
1512
1513                 (void)vn_close(be_lun->vn, flags, NOCRED, curthread);
1514                 be_lun->vn = NULL;
1515
1516                 switch (be_lun->dev_type) {
1517                 case CTL_BE_BLOCK_DEV:
1518                         break;
1519                 case CTL_BE_BLOCK_FILE:
1520                         VFS_UNLOCK_GIANT(vfs_is_locked);
1521                         if (be_lun->backend.file.cred != NULL) {
1522                                 crfree(be_lun->backend.file.cred);
1523                                 be_lun->backend.file.cred = NULL;
1524                         }
1525                         break;
1526                 case CTL_BE_BLOCK_NONE:
1527                 default:
1528                         panic("Unexpected backend type.");
1529                         break;
1530                 }
1531         }
1532         PICKUP_GIANT();
1533
1534         return (0);
1535 }
1536
1537 static int
1538 ctl_be_block_open(struct ctl_be_block_softc *softc,
1539                        struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1540 {
1541         struct nameidata nd;
1542         int              flags;
1543         int              error;
1544         int              vfs_is_locked;
1545
1546         /*
1547          * XXX KDM allow a read-only option?
1548          */
1549         flags = FREAD | FWRITE;
1550         error = 0;
1551
1552         if (rootvnode == NULL) {
1553                 snprintf(req->error_str, sizeof(req->error_str),
1554                          "%s: Root filesystem is not mounted", __func__);
1555                 return (1);
1556         }
1557
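        /*
         * The namei()/vn_open() lookup below needs current, root, and
         * jail directories to resolve the path; a kernel context may not
         * have them set, so point any missing ones at rootvnode.
         */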
1558         if (!curthread->td_proc->p_fd->fd_cdir) {
1559                 curthread->td_proc->p_fd->fd_cdir = rootvnode;
1560                 VREF(rootvnode);
1561         }
1562         if (!curthread->td_proc->p_fd->fd_rdir) {
1563                 curthread->td_proc->p_fd->fd_rdir = rootvnode;
1564                 VREF(rootvnode);
1565         }
1566         if (!curthread->td_proc->p_fd->fd_jdir) {
1567                 curthread->td_proc->p_fd->fd_jdir = rootvnode;
1568                 VREF(rootvnode);
1569         }
1570
1571  again:
1572         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
1573         error = vn_open(&nd, &flags, 0, NULL);
1574         if (error) {
1575                 /*
1576                  * If the user didn't give us a fully qualified path, the
1577                  * only reasonable guess we can make is to prepend "/dev/".
1578                  * If they want to specify a file, they need to specify
1579                  * the full path.
1580                  */
1581                 if (be_lun->dev_path[0] != '/') {
1582                         char *dev_path = "/dev/";
1583                         char *dev_name;
1584
1585                         /* Try adding device path at beginning of name */
1586                         dev_name = malloc(strlen(be_lun->dev_path)
1587                                         + strlen(dev_path) + 1,
1588                                           M_CTLBLK, M_WAITOK);
1589                         if (dev_name) {
1590                                 sprintf(dev_name, "%s%s", dev_path,
1591                                         be_lun->dev_path);
1592                                 free(be_lun->dev_path, M_CTLBLK);
1593                                 be_lun->dev_path = dev_name;
1594                                 goto again;
1595                         }
1596                 }
1597                 snprintf(req->error_str, sizeof(req->error_str),
1598                          "%s: error opening %s", __func__, be_lun->dev_path);
1599                 return (error);
1600         }
1601
1602         vfs_is_locked = NDHASGIANT(&nd);
1603
1604         NDFREE(&nd, NDF_ONLY_PNBUF);
1605
1606         be_lun->vn = nd.ni_vp;
1607
1608         /* We only support disks and files. */
1609         if (vn_isdisk(be_lun->vn, &error)) {
1610                 error = ctl_be_block_open_dev(be_lun, req);
1611         } else if (be_lun->vn->v_type == VREG) {
1612                 error = ctl_be_block_open_file(be_lun, req);
1613         } else {
1614                 error = EINVAL;
1615                 snprintf(req->error_str, sizeof(req->error_str),
1616                          "%s is not a disk or file", be_lun->dev_path);
1617         }
1618         VOP_UNLOCK(be_lun->vn, 0);
1619         VFS_UNLOCK_GIANT(vfs_is_locked);
1620
1621         if (error != 0) {
1622                 ctl_be_block_close(be_lun);
1623                 return (error);
1624         }
1625
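        /*
         * fls(x) - 1 is floor(log2(x)), so the shift and the block count
         * below are exact only for power-of-2 block sizes; the blocksize
         * validation above does not itself enforce that.
         */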
1626         be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
1627         be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
1628
1629         return (0);
1630 }
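
/*
 * Illustrative sketch, not driver code: the fls()-based shift used
 * above, shown standalone (on FreeBSD, userland fls() comes from
 * <strings.h>).  The sizes are hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <strings.h>

int
main(void)
{
        uint32_t blocksize = 512;
        uint64_t size_bytes = 1048576;
        int shift = fls(blocksize) - 1;         /* 9 for 512 */

        printf("shift %d -> %ju blocks\n", shift,
               (uintmax_t)(size_bytes >> shift));
        return (0);
}
#endif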
1631
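/*
 * UMA zone hooks for the per-LUN S/G buffer zone created in
 * ctl_be_block_create().  The ctor runs at allocation time and does
 * nothing; the dtor zeroes each buffer as it is freed, so stale I/O
 * data is never handed back out of the zone.
 */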
1632 static int
1633 ctl_be_block_mem_ctor(void *mem, int size, void *arg, int flags)
1634 {
1635         return (0);
1636 }
1637
1638 static void
1639 ctl_be_block_mem_dtor(void *mem, int size, void *arg)
1640 {
1641         bzero(mem, size);
1642 }
1643
1644 static int
1645 ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
1646 {
1647         struct ctl_be_block_lun *be_lun;
1648         struct ctl_lun_create_params *params;
1649         struct ctl_be_arg *file_arg;
1650         char tmpstr[32];
1651         int retval, num_threads;
1652         int i;
1653
1654         params = &req->reqdata.create;
1655         retval = 0;
1656
1657         num_threads = cbb_num_threads;
1658
1659         file_arg = NULL;
1660
1661         be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
1662
1663         if (be_lun == NULL) {
1664                 snprintf(req->error_str, sizeof(req->error_str),
1665                          "%s: error allocating %zu bytes", __func__,
1666                          sizeof(*be_lun));
1667                 goto bailout_error;
1668         }
1669
1670         be_lun->softc = softc;
1671         STAILQ_INIT(&be_lun->input_queue);
1672         STAILQ_INIT(&be_lun->config_write_queue);
1673         STAILQ_INIT(&be_lun->datamove_queue);
1674         sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
1675         mtx_init(&be_lun->lock, be_lun->lunname, NULL, MTX_DEF);
1676
1677         be_lun->lun_zone = uma_zcreate(be_lun->lunname, MAXPHYS, 
1678             ctl_be_block_mem_ctor, ctl_be_block_mem_dtor, NULL, NULL,
1679             /*align*/ 0, /*flags*/0);
1680
1681         if (be_lun->lun_zone == NULL) {
1682                 snprintf(req->error_str, sizeof(req->error_str),
1683                          "%s: error allocating UMA zone", __func__);
1684                 goto bailout_error;
1685         }
1686
1687         if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
1688                 be_lun->ctl_be_lun.lun_type = params->device_type;
1689         else
1690                 be_lun->ctl_be_lun.lun_type = T_DIRECT;
1691
1692         if (be_lun->ctl_be_lun.lun_type == T_DIRECT) {
1693                 for (i = 0; i < req->num_be_args; i++) {
1694                         if (strcmp(req->kern_be_args[i].name, "file") == 0) {
1695                                 file_arg = &req->kern_be_args[i];
1696                                 break;
1697                         }
1698                 }
1699
1700                 if (file_arg == NULL) {
1701                         snprintf(req->error_str, sizeof(req->error_str),
1702                                  "%s: no file argument specified", __func__);
1703                         goto bailout_error;
1704                 }
1705
1706                 be_lun->dev_path = malloc(file_arg->vallen, M_CTLBLK,
1707                                           M_WAITOK | M_ZERO);
1708                 if (be_lun->dev_path == NULL) {
1709                         snprintf(req->error_str, sizeof(req->error_str),
1710                                  "%s: error allocating %d bytes", __func__,
1711                                  file_arg->vallen);
1712                         goto bailout_error;
1713                 }
1714
1715                 strlcpy(be_lun->dev_path, (char *)file_arg->value,
1716                         file_arg->vallen);
1717
1718                 retval = ctl_be_block_open(softc, be_lun, req);
1719                 if (retval != 0) {
1720                         retval = 0;
1721                         goto bailout_error;
1722                 }
1723
1724                 /*
1725                  * Tell the user the size of the file/device.
1726                  */
1727                 params->lun_size_bytes = be_lun->size_bytes;
1728
1729                 /*
1730                  * The maximum LBA is the size - 1.
1731                  */
1732                 be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
1733         } else {
1734                 /*
1735                  * For processor devices, we don't have any size.
1736                  */
1737                 be_lun->blocksize = 0;
1738                 be_lun->size_blocks = 0;
1739                 be_lun->size_bytes = 0;
1740                 be_lun->ctl_be_lun.maxlba = 0;
1741                 params->lun_size_bytes = 0;
1742
1743                 /*
1744                  * Default to just 1 thread for processor devices.
1745                  */
1746                 num_threads = 1;
1747         }
1748
1749         /*
1750          * XXX This searching loop could be refactored and combined
1751          * with the argument loop above.
1752          */
1753         for (i = 0; i < req->num_be_args; i++) {
1754                 if (strcmp(req->kern_be_args[i].name, "num_threads") == 0) {
1755                         struct ctl_be_arg *thread_arg;
1756                         char num_thread_str[16];
1757                         int tmp_num_threads;
1758
1760                         thread_arg = &req->kern_be_args[i];
1761
1762                         strlcpy(num_thread_str, (char *)thread_arg->value,
1763                                 min(thread_arg->vallen,
1764                                 sizeof(num_thread_str)));
1765
1766                         tmp_num_threads = strtol(num_thread_str, NULL, 0);
1767
1768                         /*
1769                          * We don't let the user specify fewer than one
1770                          * thread, but we trust them not to ask for an
1771                          * absurd number like 1000 threads.
1772                          */
1773                         if (tmp_num_threads < 1) {
1774                                 snprintf(req->error_str, sizeof(req->error_str),
1775                                          "%s: invalid number of threads %s",
1776                                          __func__, num_thread_str);
1777                                 goto bailout_error;
1778                         }
1779
1780                         num_threads = tmp_num_threads;
1781                 }
1782         }
1783
1784         be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
1785         be_lun->ctl_be_lun.flags = CTL_LUN_FLAG_PRIMARY;
1786         be_lun->ctl_be_lun.be_lun = be_lun;
1787         be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
1788         /* Tell the user the blocksize we ended up using */
1789         params->blocksize_bytes = be_lun->blocksize;
1790         if (params->flags & CTL_LUN_FLAG_ID_REQ) {
1791                 be_lun->ctl_be_lun.req_lun_id = params->req_lun_id;
1792                 be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_ID_REQ;
1793         } else
1794                 be_lun->ctl_be_lun.req_lun_id = 0;
1795
1796         be_lun->ctl_be_lun.lun_shutdown = ctl_be_block_lun_shutdown;
1797         be_lun->ctl_be_lun.lun_config_status =
1798                 ctl_be_block_lun_config_status;
1799         be_lun->ctl_be_lun.be = &ctl_be_block_driver;
1800
1801         if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
1802                 snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
1803                          softc->num_luns);
1804                 strncpy((char *)be_lun->ctl_be_lun.serial_num, tmpstr,
1805                         ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
1806                         sizeof(tmpstr)));
1807
1808                 /* Tell the user what we used for a serial number */
1809                 strncpy((char *)params->serial_num, tmpstr,
1810                         ctl_min(sizeof(params->serial_num), sizeof(tmpstr)));
1811         } else { 
1812                 strncpy((char *)be_lun->ctl_be_lun.serial_num,
1813                         params->serial_num,
1814                         ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
1815                         sizeof(params->serial_num)));
1816         }
1817         if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
1818                 snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
1819                 strncpy((char *)be_lun->ctl_be_lun.device_id, tmpstr,
1820                         ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
1821                         sizeof(tmpstr)));
1822
1823                 /* Tell the user what we used for a device ID */
1824                 strncpy((char *)params->device_id, tmpstr,
1825                         ctl_min(sizeof(params->device_id), sizeof(tmpstr)));
1826         } else {
1827                 strncpy((char *)be_lun->ctl_be_lun.device_id,
1828                         params->device_id,
1829                         ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
1830                                 sizeof(params->device_id)));
1831         }
1832
1833         TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
1834
1835         be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
1836             taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
1837
1838         if (be_lun->io_taskqueue == NULL) {
1839                 snprintf(req->error_str, sizeof(req->error_str),
1840                          "%s: Unable to create taskqueue", __func__);
1841                 goto bailout_error;
1842         }
1843
1844         /*
1845          * Note that we start the same number of threads by default for
1846          * both the file case and the block device case.  For the file
1847          * case, we need multiple threads to allow concurrency, because the
1848          * vnode interface is designed to be a blocking interface.  For the
1849          * block device case, ZFS zvols at least will block the caller's
1850          * context in many instances, and so we need multiple threads to
1851          * overcome that problem.  Other block devices don't need as many
1852          * threads, but they shouldn't cause too many problems.
1853          *
1854          * If the user wants to just have a single thread for a block
1855          * device, he can specify that when the LUN is created, or change
1856          * the tunable/sysctl to alter the default number of threads.
1857          */
1858         retval = taskqueue_start_threads(&be_lun->io_taskqueue,
1859                                          /*num threads*/num_threads,
1860                                          /*priority*/PWAIT,
1861                                          /*thread name*/
1862                                          "%s taskq", be_lun->lunname);
1863
1864         if (retval != 0)
1865                 goto bailout_error;
1866
1867         be_lun->num_threads = num_threads;
1868
1869         mtx_lock(&softc->lock);
1870         softc->num_luns++;
1871         STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
1872
1873         mtx_unlock(&softc->lock);
1874
1875         retval = ctl_add_lun(&be_lun->ctl_be_lun);
1876         if (retval != 0) {
1877                 mtx_lock(&softc->lock);
1878                 STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
1879                               links);
1880                 softc->num_luns--;
1881                 mtx_unlock(&softc->lock);
1882                 snprintf(req->error_str, sizeof(req->error_str),
1883                          "%s: ctl_add_lun() returned error %d, see dmesg for "
1884                          "details", __func__, retval);
1885                 retval = 0;
1886                 goto bailout_error;
1887         }
1888
1889         mtx_lock(&softc->lock);
1890
1891         /*
1892          * Tell the config_status routine that we're waiting so it won't
1893          * clean up the LUN in the event of an error.
1894          */
1895         be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
1896
1897         while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
1898                 retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
1899                 if (retval == EINTR)
1900                         break;
1901         }
1902         be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
1903
1904         if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
1905                 snprintf(req->error_str, sizeof(req->error_str),
1906                          "%s: LUN configuration error, see dmesg for details",
1907                          __func__);
1908                 STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
1909                               links);
1910                 softc->num_luns--;
1911                 mtx_unlock(&softc->lock);
1912                 goto bailout_error;
1913         } else {
1914                 params->req_lun_id = be_lun->ctl_be_lun.lun_id;
1915         }
1916
1917         mtx_unlock(&softc->lock);
1918
1919         be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
1920                                                be_lun->blocksize,
1921                                                DEVSTAT_ALL_SUPPORTED,
1922                                                be_lun->ctl_be_lun.lun_type
1923                                                | DEVSTAT_TYPE_IF_OTHER,
1924                                                DEVSTAT_PRIORITY_OTHER);
1925
1926
1927         req->status = CTL_LUN_OK;
1928
1929         return (retval);
1930
1931 bailout_error:
1932         req->status = CTL_LUN_ERROR;
1933
1934         ctl_be_block_close(be_lun);
1935
1936         free(be_lun->dev_path, M_CTLBLK);
1937         free(be_lun, M_CTLBLK);
1938
1939         return (retval);
1940 }
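
/*
 * Illustrative userland sketch, not driver code: one plausible way to
 * reach ctl_be_block_create() directly, via the CTL_LUN_REQ ioctl on
 * /dev/cam/ctl (the same path ctladm(8) uses).  The header set, the
 * backing path, and the exact ctl_lun_req/ctl_be_arg field usage are
 * assumptions based on this file; treat it as a sketch only.
 */
#if 0
#include <sys/ioctl.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        struct ctl_lun_req req;
        struct ctl_be_arg arg;
        static char name[] = "file";
        static char path[] = "/dev/md0";        /* hypothetical backing store */
        int fd;

        memset(&req, 0, sizeof(req));
        strlcpy(req.backend, "block", sizeof(req.backend));
        req.reqtype = CTL_LUNREQ_CREATE;

        /* The "file" argument is required above for T_DIRECT LUNs. */
        arg.name = name;
        arg.namelen = sizeof(name);
        arg.value = path;
        arg.vallen = sizeof(path);      /* includes the NUL, as the strlcpy above expects */
        req.num_be_args = 1;
        req.be_args = &arg;

        fd = open("/dev/cam/ctl", O_RDWR);
        if (fd == -1 || ioctl(fd, CTL_LUN_REQ, &req) == -1)
                return (1);
        if (req.status != CTL_LUN_OK) {
                fprintf(stderr, "%s\n", req.error_str);
                return (1);
        }
        printf("created LUN %u\n", req.reqdata.create.req_lun_id);
        return (0);
}
#endif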
1941
1942 static int
1943 ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
1944 {
1945         struct ctl_lun_rm_params *params;
1946         struct ctl_be_block_lun *be_lun;
1947         int retval;
1948
1949         params = &req->reqdata.rm;
1950
1951         mtx_lock(&softc->lock);
1952
1953         be_lun = NULL;
1954
1955         STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
1956                 if (be_lun->ctl_be_lun.lun_id == params->lun_id)
1957                         break;
1958         }
1959         mtx_unlock(&softc->lock);
1960
1961         if (be_lun == NULL) {
1962                 snprintf(req->error_str, sizeof(req->error_str),
1963                          "%s: LUN %u is not managed by the block backend",
1964                          __func__, params->lun_id);
1965                 goto bailout_error;
1966         }
1967
1968         retval = ctl_disable_lun(&be_lun->ctl_be_lun);
1969
1970         if (retval != 0) {
1971                 snprintf(req->error_str, sizeof(req->error_str),
1972                          "%s: error %d returned from ctl_disable_lun() for "
1973                          "LUN %u", __func__, retval, params->lun_id);
1974                 goto bailout_error;
1976         }
1977
1978         retval = ctl_invalidate_lun(&be_lun->ctl_be_lun);
1979         if (retval != 0) {
1980                 snprintf(req->error_str, sizeof(req->error_str),
1981                          "%s: error %d returned from ctl_invalidate_lun() for "
1982                          "LUN %u", __func__, retval, params->lun_id);
1983                 goto bailout_error;
1984         }
1985
1986         mtx_lock(&softc->lock);
1987
1988         be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
1989
1990         while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
1991                 retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
1992                 if (retval == EINTR)
1993                         break;
1994         }
1995
1996         be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
1997
1998         if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
1999                 snprintf(req->error_str, sizeof(req->error_str),
2000                          "%s: interrupted waiting for LUN to be freed", 
2001                          __func__);
2002                 mtx_unlock(&softc->lock);
2003                 goto bailout_error;
2004         }
2005
2006         STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
2007
2008         softc->num_luns--;
2009         mtx_unlock(&softc->lock);
2010
2011         taskqueue_drain(be_lun->io_taskqueue, &be_lun->io_task);
2012
2013         taskqueue_free(be_lun->io_taskqueue);
2014
2015         ctl_be_block_close(be_lun);
2016
2017         if (be_lun->disk_stats != NULL)
2018                 devstat_remove_entry(be_lun->disk_stats);
2019
2020         uma_zdestroy(be_lun->lun_zone);
2021
2022         free(be_lun->dev_path, M_CTLBLK);
2023
2024         free(be_lun, M_CTLBLK);
2025
2026         req->status = CTL_LUN_OK;
2027
2028         return (0);
2029
2030 bailout_error:
2031
2032         req->status = CTL_LUN_ERROR;
2033
2034         return (0);
2035 }
2036
2037 static int
2038 ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
2039                          struct ctl_lun_req *req)
2040 {
2041         struct vattr vattr;
2042         int error;
2043         struct ctl_lun_modify_params *params;
2044
2045         params = &req->reqdata.modify;
2046
2047         if (params->lun_size_bytes != 0) {
2048                 be_lun->size_bytes = params->lun_size_bytes;
2049         } else  {
2050                 error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
2051                 if (error != 0) {
2052                         snprintf(req->error_str, sizeof(req->error_str),
2053                                  "error calling VOP_GETATTR() for file %s",
2054                                  be_lun->dev_path);
2055                         return (error);
2056                 }
2057
2058                 be_lun->size_bytes = vattr.va_size;
2059         }
2060
2061         return (0);
2062 }
2063
2064 static int
2065 ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
2066                         struct ctl_lun_req *req)
2067 {
2068         struct cdev *dev;
2069         struct cdevsw *devsw;
2070         int error;
2071         struct ctl_lun_modify_params *params;
2072         uint64_t size_bytes;
2073
2074         params = &req->reqdata.modify;
2075
2076         dev = be_lun->vn->v_rdev;
2077         devsw = dev->si_devsw;
2078         if (!devsw->d_ioctl) {
2079                 snprintf(req->error_str, sizeof(req->error_str),
2080                          "%s: no d_ioctl for device %s!", __func__,
2081                          be_lun->dev_path);
2082                 return (ENODEV);
2083         }
2084
2085         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2086                                (caddr_t)&size_bytes, FREAD,
2087                                curthread);
2088         if (error) {
2089                 snprintf(req->error_str, sizeof(req->error_str),
2090                          "%s: error %d returned for DIOCGMEDIASIZE ioctl "
2091                          "on %s!", __func__, error, be_lun->dev_path);
2092                 return (error);
2093         }
2094
2095         if (params->lun_size_bytes != 0) {
2096                 if (params->lun_size_bytes > size_bytes) {
2097                         snprintf(req->error_str, sizeof(req->error_str),
2098                                  "%s: requested LUN size %ju > backing device "
2099                                  "size %ju", __func__,
2100                                  (uintmax_t)params->lun_size_bytes,
2101                                  (uintmax_t)size_bytes);
2102                         return (EINVAL);
2103                 }
2104
2105                 be_lun->size_bytes = params->lun_size_bytes;
2106         } else {
2107                 be_lun->size_bytes = size_bytes;
2108         }
2109
2110         return (0);
2111 }
2112
2113 static int
2114 ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2115 {
2116         struct ctl_lun_modify_params *params;
2117         struct ctl_be_block_lun *be_lun;
2118         int vfs_is_locked, error;
2119
2120         params = &req->reqdata.modify;
2121
2122         mtx_lock(&softc->lock);
2123
2124         be_lun = NULL;
2125
2126         STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2127                 if (be_lun->ctl_be_lun.lun_id == params->lun_id)
2128                         break;
2129         }
2130         mtx_unlock(&softc->lock);
2131
2132         if (be_lun == NULL) {
2133                 snprintf(req->error_str, sizeof(req->error_str),
2134                          "%s: LUN %u is not managed by the block backend",
2135                          __func__, params->lun_id);
2136                 goto bailout_error;
2137         }
2138
2139         if (params->lun_size_bytes != 0) {
2140                 if (params->lun_size_bytes < be_lun->blocksize) {
2141                         snprintf(req->error_str, sizeof(req->error_str),
2142                                 "%s: LUN size %ju < blocksize %u", __func__,
2143                                 (uintmax_t)params->lun_size_bytes, be_lun->blocksize);
2144                         goto bailout_error;
2145                 }
2146         }
2147
2148         vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);
2149         vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
2150
2151         if (be_lun->vn->v_type == VREG)
2152                 error = ctl_be_block_modify_file(be_lun, req);
2153         else
2154                 error = ctl_be_block_modify_dev(be_lun, req);
2155
2156         VOP_UNLOCK(be_lun->vn, 0);
2157         VFS_UNLOCK_GIANT(vfs_is_locked);
2158
2159         if (error != 0)
2160                 goto bailout_error;
2161
2162         be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
2163
2164         /*
2165          * The maximum LBA is the size - 1.
2166          *
2167          * XXX: Note that this field is being updated without locking,
2168          *      which might cause problems on 32-bit architectures.
2169          */
2170         be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
2171         ctl_lun_capacity_changed(&be_lun->ctl_be_lun);
2172
2173         /* Tell the user the exact size we ended up using */
2174         params->lun_size_bytes = be_lun->size_bytes;
2175
2176         req->status = CTL_LUN_OK;
2177
2178         return (0);
2179
2180 bailout_error:
2181         req->status = CTL_LUN_ERROR;
2182
2183         return (0);
2184 }
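
/*
 * Illustrative sketch, not driver code: resizing an existing LUN
 * through the same CTL_LUN_REQ ioctl with CTL_LUNREQ_MODIFY.  Per the
 * code above, lun_size_bytes == 0 asks the backend to re-read the size
 * from the backing file or device.  Field usage is an assumption based
 * on this file.
 */
#if 0
#include <sys/ioctl.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
#include <stdint.h>
#include <string.h>

static int
ctl_block_resize(int ctl_fd, uint32_t lun_id)
{
        struct ctl_lun_req req;

        memset(&req, 0, sizeof(req));
        strlcpy(req.backend, "block", sizeof(req.backend));
        req.reqtype = CTL_LUNREQ_MODIFY;
        req.reqdata.modify.lun_id = lun_id;
        req.reqdata.modify.lun_size_bytes = 0;  /* take size from backing store */
        if (ioctl(ctl_fd, CTL_LUN_REQ, &req) == -1)
                return (-1);
        return (req.status == CTL_LUN_OK ? 0 : -1);
}
#endif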
2185
2186 static void
2187 ctl_be_block_lun_shutdown(void *be_lun)
2188 {
2189         struct ctl_be_block_lun *lun;
2190         struct ctl_be_block_softc *softc;
2191
2192         lun = (struct ctl_be_block_lun *)be_lun;
2193
2194         softc = lun->softc;
2195
2196         mtx_lock(&softc->lock);
2197         lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
2198         if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2199                 wakeup(lun);
2200         mtx_unlock(&softc->lock);
2202 }
2203
2204 static void
2205 ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
2206 {
2207         struct ctl_be_block_lun *lun;
2208         struct ctl_be_block_softc *softc;
2209
2210         lun = (struct ctl_be_block_lun *)be_lun;
2211         softc = lun->softc;
2212
2213         if (status == CTL_LUN_CONFIG_OK) {
2214                 mtx_lock(&softc->lock);
2215                 lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2216                 if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2217                         wakeup(lun);
2218                 mtx_unlock(&softc->lock);
2219
2220                 /*
2221                  * We successfully added the LUN, attempt to enable it.
2222                  */
2223                 if (ctl_enable_lun(&lun->ctl_be_lun) != 0) {
2224                         printf("%s: ctl_enable_lun() failed!\n", __func__);
2225                         if (ctl_invalidate_lun(&lun->ctl_be_lun) != 0) {
2226                                 printf("%s: ctl_invalidate_lun() failed!\n",
2227                                        __func__);
2228                         }
2229                 }
2230
2231                 return;
2232         }
2233
2235         mtx_lock(&softc->lock);
2236         lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2237         lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
2238         wakeup(lun);
2239         mtx_unlock(&softc->lock);
2240 }
2241
2242
2243 static int
2244 ctl_be_block_config_write(union ctl_io *io)
2245 {
2246         struct ctl_be_block_lun *be_lun;
2247         struct ctl_be_lun *ctl_be_lun;
2248         int retval;
2249
2250         retval = 0;
2251
2252         DPRINTF("entered\n");
2253
2254         ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
2255                 CTL_PRIV_BACKEND_LUN].ptr;
2256         be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
2257
2258         switch (io->scsiio.cdb[0]) {
2259         case SYNCHRONIZE_CACHE:
2260         case SYNCHRONIZE_CACHE_16:
2261                 /*
2262                  * The upper level CTL code will filter out any CDBs with
2263                  * the immediate bit set and return the proper error.
2264                  *
2265                  * We don't really need to worry about what LBA range the
2266                  * user asked to be synced out.  When they issue a sync
2267                  * cache command, we'll sync out the whole thing.
2268                  */
2269                 mtx_lock(&be_lun->lock);
2270                 STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
2271                                    links);
2272                 mtx_unlock(&be_lun->lock);
2273                 taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
2274                 break;
2275         case START_STOP_UNIT: {
2276                 struct scsi_start_stop_unit *cdb;
2277
2278                 cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
2279
2280                 if (cdb->how & SSS_START)
2281                         retval = ctl_start_lun(ctl_be_lun);
2282                 else {
2283                         retval = ctl_stop_lun(ctl_be_lun);
2284                         /*
2285                          * XXX KDM Copan-specific offline behavior.
2286                          * Figure out a reasonable way to port this?
2287                          */
2288 #ifdef NEEDTOPORT
2289                         if ((retval == 0)
2290                          && (cdb->byte2 & SSS_ONOFFLINE))
2291                                 retval = ctl_lun_offline(ctl_be_lun);
2292 #endif
2293                 }
2294
2295                 /*
2296                  * In general, the above routines should not fail.  They
2297                  * just set state for the LUN.  So we've got something
2298                  * pretty wrong here if we can't start or stop the LUN.
2299                  */
2300                 if (retval != 0) {
2301                         ctl_set_internal_failure(&io->scsiio,
2302                                                  /*sks_valid*/ 1,
2303                                                  /*retry_count*/ 0xf051);
2304                         retval = CTL_RETVAL_COMPLETE;
2305                 } else {
2306                         ctl_set_success(&io->scsiio);
2307                 }
2308                 ctl_config_write_done(io);
2309                 break;
2310         }
2311         default:
2312                 ctl_set_invalid_opcode(&io->scsiio);
2313                 ctl_config_write_done(io);
2314                 retval = CTL_RETVAL_COMPLETE;
2315                 break;
2316         }
2317
2318         return (retval);
2320 }
2321
2322 static int
2323 ctl_be_block_config_read(union ctl_io *io)
2324 {
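        /*
         * The block backend has no configuration-read commands to
         * service; returning 0 (CTL_RETVAL_COMPLETE) tells CTL there is
         * nothing more to do.
         */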
2325         return (0);
2326 }
2327
2328 static int
2329 ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
2330 {
2331         struct ctl_be_block_lun *lun;
2332         int retval;
2333
2334         lun = (struct ctl_be_block_lun *)be_lun;
2335         retval = 0;
2336
2337         retval = sbuf_printf(sb, "<num_threads>");
2338
2339         if (retval != 0)
2340                 goto bailout;
2341
2342         retval = sbuf_printf(sb, "%d", lun->num_threads);
2343
2344         if (retval != 0)
2345                 goto bailout;
2346
2347         retval = sbuf_printf(sb, "</num_threads>");
2348
2349         /*
2350          * For processor devices, we don't have a path variable.
2351          */
2352         if ((retval != 0)
2353          || (lun->dev_path == NULL))
2354                 goto bailout;
2355
2356         retval = sbuf_printf(sb, "<file>");
2357
2358         if (retval != 0)
2359                 goto bailout;
2360
2361         retval = ctl_sbuf_printf_esc(sb, lun->dev_path);
2362
2363         if (retval != 0)
2364                 goto bailout;
2365
2366         retval = sbuf_printf(sb, "</file>\n");
2367
2368 bailout:
2369
2370         return (retval);
2371 }
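
/*
 * For reference, the XML fragment emitted above looks like this for a
 * block-backed LUN (values hypothetical):
 *
 *   <num_threads>14</num_threads><file>/dev/md0</file>
 *
 * Processor LUNs have no backing path, so they get only the
 * <num_threads> element.
 */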
2372
2373 int
2374 ctl_be_block_init(void)
2375 {
2376         struct ctl_be_block_softc *softc;
2377         int retval;
2378
2379         softc = &backend_block_softc;
2380         retval = 0;
2381
2382         mtx_init(&softc->lock, "ctlblk", NULL, MTX_DEF);
2383         STAILQ_INIT(&softc->beio_free_queue);
2384         STAILQ_INIT(&softc->disk_list);
2385         STAILQ_INIT(&softc->lun_list);
2386         ctl_grow_beio(softc, 200);
2387
2388         return (retval);
2389 }