2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
6 * Copyright 2020 Joyent, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
35 #include <sys/param.h>
36 #ifndef WITHOUT_CAPSICUM
37 #include <sys/capsicum.h>
39 #include <sys/queue.h>
40 #include <sys/errno.h>
42 #include <sys/ioctl.h>
46 #ifndef WITHOUT_CAPSICUM
47 #include <capsicum_helpers.h>
55 #include <pthread_np.h>
60 #include <machine/atomic.h>
61 #include <machine/vmm_snapshot.h>
/*
 * NOTE(review): this listing is truncated -- the embedded original line
 * numbers jump (68 -> 70 -> 71, 89 -> 90 -> 92, ...), so the declarations
 * below are missing lines (struct headers, some fields, closing braces).
 * Comments describe only what is visible.
 */
/* Magic value stored in bc_magic to sanity-check live contexts. */
68 #define BLOCKIF_SIG 0xb109b109
/* Number of worker threads per blockif context. */
70 #define BLOCKIF_NUMTHR 8
/* Request pool size: ring depth plus one in-flight slot per worker. */
71 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
/* Fields of struct blockif_elem (header not visible in this listing). */
89 TAILQ_ENTRY(blockif_elem) be_link;
90 struct blockif_req *be_req;
92 enum blockstat be_status;
/* Fields of struct blockif_ctxt (header not visible in this listing). */
111 pthread_t bc_btid[BLOCKIF_NUMTHR];
112 pthread_mutex_t bc_mtx;
113 pthread_cond_t bc_cond;
114 pthread_cond_t bc_paused_cond;
115 pthread_cond_t bc_work_done_cond;
117 /* Request elements and free/pending/busy queues */
118 TAILQ_HEAD(, blockif_elem) bc_freeq;
119 TAILQ_HEAD(, blockif_elem) bc_pendq;
120 TAILQ_HEAD(, blockif_elem) bc_busyq;
121 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
/* One-time initialization guard for blockif_init (see pthread_once use). */
124 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
/*
 * Per-waiter element used by blockif_cancel/blockif_sigcont_handler to
 * rendezvous with an interrupted worker thread.
 */
126 struct blockif_sig_elem {
127 pthread_mutex_t bse_mtx;
128 pthread_cond_t bse_cond;
130 struct blockif_sig_elem *bse_next;
/* Lock-free singly-linked list head, updated via atomic_cmpset_ptr. */
133 static struct blockif_sig_elem *blockif_bse_head;
/*
 * Take an element off the free queue, bind it to 'breq', and append it to
 * the pending queue.  Requests that overlap an already-queued request at
 * the same offset are marked BST_BLOCK instead of BST_PEND so they are not
 * dispatched concurrently.  Returns nonzero iff the request is immediately
 * runnable (BST_PEND).
 * NOTE(review): listing is truncated (original lines 137-175 are partially
 * missing), so some branches/braces are not visible here.
 */
136 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
139 struct blockif_elem *be, *tbe;
/* Caller guarantees the free queue is non-empty (checked in blockif_request). */
143 be = TAILQ_FIRST(&bc->bc_freeq);
145 assert(be->be_status == BST_FREE);
146 TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
/* Compute the end offset of this request from its iovec lengths. */
153 off = breq->br_offset;
154 for (i = 0; i < breq->br_iovcnt; i++)
155 off += breq->br_iov[i].iov_len;
/* Scan pending and busy queues for a request at the same start offset. */
161 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
162 if (tbe->be_block == breq->br_offset)
166 TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
167 if (tbe->be_block == breq->br_offset)
/* No conflict: runnable now.  Conflict: parked until the blocker completes. */
172 be->be_status = BST_PEND;
174 be->be_status = BST_BLOCK;
175 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
176 return (be->be_status == BST_PEND);
/*
 * Pull the first runnable (BST_PEND) element off the pending queue, mark it
 * BST_BUSY, and move it to the busy queue for worker thread 't'.
 * Non-PEND entries encountered during the scan must be BST_BLOCK.
 * NOTE(review): truncated listing -- the return statements and the
 * assignment of *bep/be_tid are not visible here.
 */
180 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
182 struct blockif_elem *be;
184 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
185 if (be->be_status == BST_PEND)
187 assert(be->be_status == BST_BLOCK);
191 TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
192 be->be_status = BST_BUSY;
194 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
/*
 * Retire a finished (or cancelled) element: unlink it from whichever queue
 * holds it, promote any request that was BST_BLOCKed on the same offset to
 * BST_PEND, and return the element to the free queue.
 */
200 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
202 struct blockif_elem *tbe;
/* DONE/BUSY elements live on the busy queue; others are still pending. */
204 if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
205 TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
207 TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
/* Unblock the first waiter (if any) that targeted this block offset. */
208 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
209 if (tbe->be_req->br_offset == be->be_block)
210 tbe->be_status = BST_PEND;
213 be->be_status = BST_FREE;
215 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
/*
 * Flush the backing store: DIOCGFLUSH ioctl for device backends, fsync(2)
 * otherwise.  NOTE(review): truncated -- the branch selecting between the
 * two (presumably on bc_ischr) and the error returns are not visible.
 */
219 blockif_flush_bc(struct blockif_ctxt *bc)
222 if (ioctl(bc->bc_fd, DIOCGFLUSH))
224 } else if (fsync(bc->bc_fd))
/*
 * Execute one request on the calling worker thread, then mark it BST_DONE
 * and invoke the request's completion callback with the error status.
 * Reads/writes with multiple iovecs go through preadv/pwritev; otherwise
 * the transfer is staged through 'buf' (a MAXPHYS-sized bounce buffer
 * allocated per worker in blockif_thr) in MAXPHYS-sized chunks.
 * NOTE(review): truncated listing -- the switch on the operation type,
 * several loop bodies, and error-path assignments are not visible.
 */
231 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
233 struct blockif_req *br;
235 ssize_t clen, len, off, boff, voff;
/* Read path: vectored read directly into the guest iovecs when possible. */
239 if (br->br_iovcnt <= 1)
245 if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
/* Bounce-buffer read: pread into buf, then scatter into the iovecs. */
254 while (br->br_resid > 0) {
255 len = MIN(br->br_resid, MAXPHYS);
256 if (pread(bc->bc_fd, buf, len, br->br_offset +
263 clen = MIN(len - boff, br->br_iov[i].iov_len -
265 memcpy(br->br_iov[i].iov_base + voff,
267 if (clen < br->br_iov[i].iov_len - voff)
274 } while (boff < len);
/* Write path: vectored write, or gather into buf then pwrite. */
285 if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
294 while (br->br_resid > 0) {
295 len = MIN(br->br_resid, MAXPHYS);
298 clen = MIN(len - boff, br->br_iov[i].iov_len -
301 br->br_iov[i].iov_base + voff, clen);
302 if (clen < br->br_iov[i].iov_len - voff)
309 } while (boff < len);
310 if (pwrite(bc->bc_fd, buf, len, br->br_offset +
/* Flush op delegates to blockif_flush_bc. */
320 err = blockif_flush_bc(bc);
/* Delete (TRIM) op: only for deletable, writable char devices. */
323 if (!bc->bc_candelete)
325 else if (bc->bc_rdonly)
327 else if (bc->bc_ischr) {
328 arg[0] = br->br_offset;
329 arg[1] = br->br_resid;
330 if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
342 be->be_status = BST_DONE;
344 (*br->br_callback)(br, err);
/*
 * Worker thread main loop.  Each of the BLOCKIF_NUMTHR workers owns a
 * private MAXPHYS bounce buffer and, under bc_mtx, repeatedly dequeues and
 * processes requests.  The mutex is dropped around blockif_proc so I/O
 * runs concurrently.  Supports pause (bc_paused), shutdown (checked where
 * the listing is truncated), and idle notification via bc_work_done_cond.
 */
348 blockif_thr(void *arg)
350 struct blockif_ctxt *bc;
351 struct blockif_elem *be;
357 buf = malloc(MAXPHYS);
362 pthread_mutex_lock(&bc->bc_mtx);
366 /* Drain runnable work; do not start new work while paused. */
367 while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
368 pthread_mutex_unlock(&bc->bc_mtx);
369 blockif_proc(bc, be, buf);
370 pthread_mutex_lock(&bc->bc_mtx);
371 blockif_complete(bc, be);
376 /* If none of the workers are busy, notify the main thread */
377 if (bc->bc_work_count == 0)
378 pthread_cond_broadcast(&bc->bc_work_done_cond);
380 /* Check ctxt status here to see if exit requested */
384 /* Make all worker threads wait here if the device is paused */
385 while (bc->bc_paused)
386 pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);
/* Sleep until blockif_request/resume/close signals new work or exit. */
388 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
390 pthread_mutex_unlock(&bc->bc_mtx);
/*
 * SIGCONT handler (registered via mevent below): atomically detach the
 * entire blockif_bse_head list and wake every waiter parked in
 * blockif_cancel.  The CAS loop retries if a new element was pushed
 * concurrently.
 */
399 blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
401 struct blockif_sig_elem *bse;
405 * Process the entire list even if not intended for
409 bse = blockif_bse_head;
412 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
414 (uintptr_t)bse->bse_next));
416 pthread_mutex_lock(&bse->bse_mtx);
417 bse->bse_pending = 0;
418 pthread_cond_signal(&bse->bse_cond);
419 pthread_mutex_unlock(&bse->bse_mtx);
/*
 * Fragment of blockif_init (definition line not visible): route SIGCONT
 * through the mevent loop and ignore its default disposition.
 */
426 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
427 (void) signal(SIGCONT, SIG_IGN);
/*
 * Open a backing file or device described by 'optstr' (pathname plus
 * comma-separated options: nocache, nodelete, sync/direct, ro,
 * sectorsize=logical[/physical]) and return an initialized blockif
 * context with its worker threads started.  Returns NULL on failure
 * (error paths are partially elided in this truncated listing).
 *
 * FIX(review): original line 527 read "ioctl(fd, DIOCGSECTORSIZE, §sz)"
 * -- HTML-entity mojibake where "&sect" in "&sectsz" was converted to
 * the section-sign character.  Restored to "&sectsz", matching the int
 * sectsz declared above and validated against ssopt below.
 */
430 struct blockif_ctxt *
431 blockif_open(const char *optstr, const char *ident)
433 char tname[MAXCOMLEN + 1];
434 char name[MAXPATHLEN];
435 char *nopt, *xopts, *cp;
436 struct blockif_ctxt *bc;
438 struct diocgattr_arg arg;
439 off_t size, psectsz, psectoff;
440 int extra, fd, i, sectsz;
441 int nocache, sync, ro, candelete, geom, ssopt, pssopt;
444 #ifndef WITHOUT_CAPSICUM
446 cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
/* One-time global init (SIGCONT plumbing) on first open. */
449 pthread_once(&blockif_once, blockif_init);
459 * The first element in the optstring is always a pathname.
460 * Optional elements follow
462 nopt = xopts = strdup(optstr);
463 while (xopts != NULL) {
464 cp = strsep(&xopts, ",");
465 if (cp == nopt) /* file or device pathname */
467 else if (!strcmp(cp, "nocache"))
469 else if (!strcmp(cp, "nodelete"))
471 else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
473 else if (!strcmp(cp, "ro"))
475 else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
477 else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
480 EPRINTLN("Invalid device option \"%s\"", cp);
491 fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
493 /* Attempt a r/w fail with a r/o open */
494 fd = open(nopt, O_RDONLY | extra);
499 warn("Could not open backing file: %s", nopt);
503 if (fstat(fd, &sbuf) < 0) {
504 warn("Could not stat backing file %s", nopt);
/* Capsicum sandbox: restrict the fd; drop write rights for r/o opens. */
508 #ifndef WITHOUT_CAPSICUM
509 cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
512 cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
514 if (caph_rights_limit(fd, &rights) == -1)
515 errx(EX_OSERR, "Unable to apply rights for sandbox");
519 * Deal with raw devices
523 psectsz = psectoff = 0;
524 candelete = geom = 0;
525 if (S_ISCHR(sbuf.st_mode)) {
526 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
527 ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
528 perror("Could not fetch dev blk/sector size");
533 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
534 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
/* Query GEOM for TRIM/delete support unless "nodelete" was given. */
535 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
536 arg.len = sizeof(arg.value.i);
537 if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
538 candelete = arg.value.i;
539 if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
542 psectsz = sbuf.st_blksize;
544 #ifndef WITHOUT_CAPSICUM
545 if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
546 errx(EX_OSERR, "Unable to apply rights for sandbox");
/* Validate any user-supplied sectorsize= option. */
550 if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
552 EPRINTLN("Invalid sector size %d/%d",
558 * Some backend drivers (e.g. cd0, ada0) require that the I/O
559 * size be a multiple of the device's sector size.
561 * Validate that the emulated sector size complies with this
564 if (S_ISCHR(sbuf.st_mode)) {
565 if (ssopt < sectsz || (ssopt % sectsz) != 0) {
566 EPRINTLN("Sector size %d incompatible "
567 "with underlying device sector size %d",
/* Allocate and populate the context, then start the worker pool. */
578 bc = calloc(1, sizeof(struct blockif_ctxt));
584 bc->bc_magic = BLOCKIF_SIG;
586 bc->bc_ischr = S_ISCHR(sbuf.st_mode);
587 bc->bc_isgeom = geom;
588 bc->bc_candelete = candelete;
591 bc->bc_sectsz = sectsz;
592 bc->bc_psectsz = psectsz;
593 bc->bc_psectoff = psectoff;
594 pthread_mutex_init(&bc->bc_mtx, NULL);
595 pthread_cond_init(&bc->bc_cond, NULL);
597 bc->bc_work_count = 0;
598 pthread_cond_init(&bc->bc_paused_cond, NULL);
599 pthread_cond_init(&bc->bc_work_done_cond, NULL);
600 TAILQ_INIT(&bc->bc_freeq);
601 TAILQ_INIT(&bc->bc_pendq);
602 TAILQ_INIT(&bc->bc_busyq);
603 for (i = 0; i < BLOCKIF_MAXREQ; i++) {
604 bc->bc_reqs[i].be_status = BST_FREE;
605 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
608 for (i = 0; i < BLOCKIF_NUMTHR; i++) {
609 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
610 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
611 pthread_set_name_np(bc->bc_btid[i], tname);
/*
 * Common entry point for read/write/flush/delete: enqueue 'breq' with
 * operation 'op' and kick a worker if the request is immediately runnable.
 * Fails (error return elided in this truncated listing) when the free
 * queue is empty, i.e. the caller exceeded the queue limit.
 */
623 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
630 pthread_mutex_lock(&bc->bc_mtx);
631 if (!TAILQ_EMPTY(&bc->bc_freeq)) {
633 * Enqueue and inform the block i/o thread
634 * that there is work available
636 if (blockif_enqueue(bc, breq, op))
637 pthread_cond_signal(&bc->bc_cond);
640 * Callers are not allowed to enqueue more than
641 * the specified blockif queue limit. Return an
642 * error to indicate that the queue length has been
647 pthread_mutex_unlock(&bc->bc_mtx);
/* Thin public wrappers: validate the context magic, then submit the
 * request with the matching BOP_* operation via blockif_request. */
653 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
656 assert(bc->bc_magic == BLOCKIF_SIG);
657 return (blockif_request(bc, breq, BOP_READ));
661 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
664 assert(bc->bc_magic == BLOCKIF_SIG);
665 return (blockif_request(bc, breq, BOP_WRITE));
669 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
672 assert(bc->bc_magic == BLOCKIF_SIG);
673 return (blockif_request(bc, breq, BOP_FLUSH));
677 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
680 assert(bc->bc_magic == BLOCKIF_SIG);
681 return (blockif_request(bc, breq, BOP_DELETE));
/*
 * Cancel an outstanding request.  If it is still pending, retire it
 * directly via blockif_complete.  If a worker already picked it up, push a
 * blockif_sig_elem onto the lock-free list, SIGCONT the worker to
 * interrupt its I/O, and wait for the signal handler to acknowledge.
 * NOTE(review): truncated listing -- several returns, the 'found' checks,
 * and bse_pending initialization are not visible here.
 */
685 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
687 struct blockif_elem *be;
689 assert(bc->bc_magic == BLOCKIF_SIG);
691 pthread_mutex_lock(&bc->bc_mtx);
692 /* XXX: not waiting while paused */
695 * Check pending requests.
697 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
698 if (be->be_req == breq)
705 blockif_complete(bc, be);
706 pthread_mutex_unlock(&bc->bc_mtx);
712 * Check in-flight requests.
714 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
715 if (be->be_req == breq)
722 pthread_mutex_unlock(&bc->bc_mtx);
727 * Interrupt the processing thread to force it return
728 * prematurely via its normal callback path.
730 while (be->be_status == BST_BUSY) {
731 struct blockif_sig_elem bse, *old_head;
733 pthread_mutex_init(&bse.bse_mtx, NULL);
734 pthread_cond_init(&bse.bse_cond, NULL);
/* CAS-push this waiter onto the global list consumed by the handler. */
739 old_head = blockif_bse_head;
740 bse.bse_next = old_head;
741 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
745 pthread_kill(be->be_tid, SIGCONT);
/* Block until blockif_sigcont_handler clears bse_pending. */
747 pthread_mutex_lock(&bse.bse_mtx);
748 while (bse.bse_pending)
749 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
750 pthread_mutex_unlock(&bse.bse_mtx);
753 pthread_mutex_unlock(&bc->bc_mtx);
756 * The processing thread has been interrupted. Since it's not
757 * clear if the callback has been invoked yet, return EBUSY.
/*
 * Tear down a context: request thread exit (the flag assignment, presumably
 * bc_closing, falls in a truncated line), wake all workers, and join them.
 * Remaining cleanup (fd close, free) is not visible in this listing.
 */
763 blockif_close(struct blockif_ctxt *bc)
768 assert(bc->bc_magic == BLOCKIF_SIG);
771 * Stop the block i/o thread
773 pthread_mutex_lock(&bc->bc_mtx);
775 pthread_mutex_unlock(&bc->bc_mtx);
776 pthread_cond_broadcast(&bc->bc_cond);
777 for (i = 0; i < BLOCKIF_NUMTHR; i++)
778 pthread_join(bc->bc_btid[i], &jval);
780 /* XXX Cancel queued i/o's ??? */
793 * Return virtual C/H/S values for a given block. Use the algorithm
794 * outlined in the VHD specification to calculate values.
797 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
799 off_t sectors; /* total sectors of the block dev */
800 off_t hcyl; /* cylinders times heads */
801 uint16_t secpt; /* sectors per track */
804 assert(bc->bc_magic == BLOCKIF_SIG);
806 sectors = bc->bc_size / bc->bc_sectsz;
808 /* Clamp the size to the largest possible with CHS */
809 if (sectors > 65535UL*16*255)
810 sectors = 65535UL*16*255;
/* Large disks fix secpt/heads (values elided in this truncated listing). */
812 if (sectors >= 65536UL*16*63) {
815 hcyl = sectors / secpt;
818 hcyl = sectors / secpt;
/* Grow heads until cylinders fit in the 1024-cylinder CHS limit. */
819 heads = (hcyl + 1023) / 1024;
824 if (hcyl >= (heads * 1024) || heads > 16) {
827 hcyl = sectors / secpt;
829 if (hcyl >= (heads * 1024)) {
832 hcyl = sectors / secpt;
/* Simple accessors; each asserts the context magic before reading. */
/* Backing store size in bytes. */
845 blockif_size(struct blockif_ctxt *bc)
848 assert(bc->bc_magic == BLOCKIF_SIG);
849 return (bc->bc_size);
/* Logical sector size in bytes. */
853 blockif_sectsz(struct blockif_ctxt *bc)
856 assert(bc->bc_magic == BLOCKIF_SIG);
857 return (bc->bc_sectsz);
/* Physical sector size and offset (out-parameters). */
861 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
864 assert(bc->bc_magic == BLOCKIF_SIG);
865 *size = bc->bc_psectsz;
866 *off = bc->bc_psectoff;
/* Maximum requests a caller may keep outstanding (one slot reserved). */
870 blockif_queuesz(struct blockif_ctxt *bc)
873 assert(bc->bc_magic == BLOCKIF_SIG);
874 return (BLOCKIF_MAXREQ - 1);
/* Nonzero if the backing store was opened read-only. */
878 blockif_is_ro(struct blockif_ctxt *bc)
881 assert(bc->bc_magic == BLOCKIF_SIG);
882 return (bc->bc_rdonly);
/* Nonzero if the backing store supports delete/TRIM. */
886 blockif_candelete(struct blockif_ctxt *bc)
889 assert(bc->bc_magic == BLOCKIF_SIG);
890 return (bc->bc_candelete);
893 #ifdef BHYVE_SNAPSHOT
/*
 * Quiesce the interface for snapshotting: set the paused flag (assignment
 * elided in this truncated listing), wait for in-flight work to drain,
 * then flush the backing store (best-effort; failure only warns).
 */
895 blockif_pause(struct blockif_ctxt *bc)
898 assert(bc->bc_magic == BLOCKIF_SIG);
900 pthread_mutex_lock(&bc->bc_mtx);
903 /* The interface is paused. Wait for workers to finish their work */
904 while (bc->bc_work_count)
905 pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
906 pthread_mutex_unlock(&bc->bc_mtx);
908 if (blockif_flush_bc(bc))
909 fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
/*
 * Undo blockif_pause: clear the paused flag (elided line) and wake both
 * the pause-parked workers and any idle workers with queued restore work.
 */
914 blockif_resume(struct blockif_ctxt *bc)
917 assert(bc->bc_magic == BLOCKIF_SIG);
919 pthread_mutex_lock(&bc->bc_mtx);
921 /* resume the threads waiting for paused */
922 pthread_cond_broadcast(&bc->bc_paused_cond);
923 /* kick the threads after restore */
924 pthread_cond_broadcast(&bc->bc_cond);
925 pthread_mutex_unlock(&bc->bc_mtx);
/*
 * Snapshot/restore one in-flight request: scalar fields first, then each
 * iovec's length and guest-mapped base address.
 */
929 blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
935 SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
936 SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
937 SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);
940 * XXX: The callback and parameter must be filled by the virtualized
941 * device that uses the interface, during its init; we're not touching
945 /* Snapshot the iovecs. */
946 for (i = 0; i < br->br_iovcnt; i++) {
947 iov = &br->br_iov[i];
949 SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);
951 /* We assume the iov is a guest-mapped address. */
952 SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
953 false, meta, ret, done);
/*
 * Snapshot the context's scalar state.  Requires the interface to be
 * paused (see blockif_pause); refuses otherwise.  Queue contents are not
 * snapshotted here.
 */
961 blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
965 if (bc->bc_paused == 0) {
966 fprintf(stderr, "%s: Snapshot failed: "
967 "interface not paused.\r\n", __func__);
971 pthread_mutex_lock(&bc->bc_mtx);
973 SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
974 SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
975 SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
976 SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
977 SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
978 SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
979 SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
980 SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
981 SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
982 SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);
985 pthread_mutex_unlock(&bc->bc_mtx);