4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23 * All rights reserved.
26 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
30 #pragma ident "%Z%%M% %I% %E% SMI"
33 * ZFS volume emulation driver.
35 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
36 * Volumes are accessed through the symbolic links named:
38 * /dev/zvol/dsk/<pool_name>/<dataset_name>
39 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
41 * These links are created by the ZFS-specific devfsadm link generator.
42 * Volumes are persistent through reboot. No user command needs to be
43 * run before opening and using a device.
46 #include <sys/types.h>
47 #include <sys/param.h>
48 #include <sys/kernel.h>
49 #include <sys/errno.h>
55 #include <sys/cmn_err.h>
60 #include <sys/dsl_prop.h>
62 #include <sys/byteorder.h>
63 #include <sys/sunddi.h>
64 #include <sys/dirent.h>
65 #include <sys/policy.h>
66 #include <sys/fs/zfs.h>
67 #include <sys/zfs_ioctl.h>
69 #include <sys/refcount.h>
70 #include <sys/zfs_znode.h>
71 #include <sys/zfs_rlock.h>
72 #include <geom/geom.h>
74 #include "zfs_namecheck.h"
/*
 * GEOM class through which zvols are exposed as providers.
 * NOTE(review): the class initializer members are elided from this listing.
 */
76 struct g_class zfs_zvol_class = {
81 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
/* ZAP object (object #2) holding per-volume properties such as "size". */
84 #define ZVOL_ZAP_OBJ 2ULL
/* Number of configured zvol minors; used later as a module-busy indicator. */
86 static uint32_t zvol_minors;
89 * The in-core state of each volume.
91 typedef struct zvol_state {
92 char zv_name[MAXPATHLEN]; /* pool/dd name */
93 uint64_t zv_volsize; /* amount of space we advertise */
94 uint64_t zv_volblocksize; /* volume block size */
95 struct g_provider *zv_provider; /* GEOM provider */
96 uint8_t zv_min_bs; /* minimum addressable block shift */
97 uint8_t zv_readonly; /* hard readonly; like write-protect */
98 objset_t *zv_objset; /* objset handle */
99 uint32_t zv_mode; /* DS_MODE_* flags at open time */
100 uint32_t zv_total_opens; /* total open count */
101 zilog_t *zv_zilog; /* ZIL handle */
102 uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
103 znode_t zv_znode; /* for range locking */
/* Pending bios, fed by zvol_start() and drained by zvol_worker(). */
105 struct bio_queue_head zv_queue;
106 struct mtx zv_queue_mtx; /* zv_queue mutex */
/* NOTE(review): struct close and the zv_state field (used by the worker
 * shutdown handshake in zvol_worker/zvol_remove_minor) are elided here. */
110 * zvol maximum transfer in one DMU tx.
/* Caps the per-iteration I/O size in zvol_serve_one(). */
112 int zvol_maxphys = DMU_MAX_ACCESS/2;
/* ZIL get-data callback, registered via zil_open() in zvol_create_minor(). */
114 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
/*
 * Validate a proposed volume size: it must be a multiple of the block size
 * and its last byte must be addressable (<= SPEC_MAXOFFSET_T).
 * NOTE(review): return statements and additional checks are elided from
 * this listing.
 */
117 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
122 if (volsize % blocksize != 0)
126 if (volsize - 1 > SPEC_MAXOFFSET_T)
/*
 * Validate a proposed volume block size against the SPA min/max limits.
 * NOTE(review): the rest of the condition (likely a power-of-two check)
 * and the return statements are elided from this listing.
 */
133 zvol_check_volblocksize(uint64_t volblocksize)
135 if (volblocksize < SPA_MINBLOCKSIZE ||
136 volblocksize > SPA_MAXBLOCKSIZE ||
/*
 * dsl_prop callback: mirror the dataset's "readonly" property into the
 * in-core zv_readonly flag (registered in zvol_create_minor()).
 */
144 zvol_readonly_changed_cb(void *arg, uint64_t newval)
146 zvol_state_t *zv = arg;
148 zv->zv_readonly = (uint8_t)newval;
/*
 * Populate 'nv' with the volume's statistics: volsize from the "size"
 * entry of ZVOL_ZAP_OBJ and volblocksize from the DMU object info.
 * NOTE(review): error-path returns between the calls are elided.
 */
152 zvol_get_stats(objset_t *os, nvlist_t *nv)
155 dmu_object_info_t doi;
159 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
163 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
165 error = dmu_object_info(os, ZVOL_OBJ, &doi);
168 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
169 doi.doi_data_block_size);
/*
 * Find the zvol_state_t for dataset 'name' by scanning every provider of
 * every geom in zfs_zvol_class.  Provider names are "ZVOL_DEV_DIR/<name>";
 * skipping sizeof(ZVOL_DEV_DIR) bytes steps past the prefix and its
 * separator before comparing.  Returns NULL if not found (elided here).
 */
175 static zvol_state_t *
176 zvol_minor_lookup(const char *name)
178 struct g_provider *pp;
183 LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
184 LIST_FOREACH(pp, &gp->provider, provider) {
185 if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
186 return (pp->private);
/*
 * GEOM access method.  Refuses write access when the volume is hard
 * readonly or was opened DS_MODE_READONLY, otherwise accumulates the
 * read/write/exclusive deltas into zv_total_opens.
 * NOTE(review): the lookup of 'zv' from pp and the close-path handling
 * (negative deltas) are elided from this listing.
 */
194 zvol_access(struct g_provider *pp, int acr, int acw, int ace)
202 if (acr <= 0 && acw <= 0 && ace <= 0)
207 ASSERT(zv->zv_objset != NULL);
209 if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
212 zv->zv_total_opens += acr + acw + ace;
218 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
220 * We store data in the log buffers if it's small enough.
221 * Otherwise we will later flush the data out via dmu_sync().
/* Writes at or below this size are logged immediately (WR_NEED_COPY);
 * larger ones are logged indirectly (WR_INDIRECT) and synced later. */
223 ssize_t zvol_immediate_write_sz = 32768;
226 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
228 uint32_t blocksize = zv->zv_volblocksize;
/* Clamp each itx to the remainder of the current volume block. */
232 ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
233 itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
236 len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
237 itx->itx_private = zv;
238 lr = (lr_write_t *)&itx->itx_lr;
239 lr->lr_foid = ZVOL_OBJ;
241 lr->lr_length = nbytes;
/* Offset of this write within its volume block. */
242 lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
243 BP_ZERO(&lr->lr_blkptr);
245 (void) zil_itx_assign(zv->zv_zilog, itx, tx);
/*
 * GEOM start method: queue read/write bios for the worker thread and
 * reject everything else with EOPNOTSUPP.
 * NOTE(review): the case labels of the switch (presumably BIO_READ /
 * BIO_WRITE vs. default) are elided from this listing.
 */
252 zvol_start(struct bio *bp)
256 switch (bp->bio_cmd) {
260 zv = bp->bio_to->private;
262 mtx_lock(&zv->zv_queue_mtx);
263 bioq_insert_tail(&zv->zv_queue, bp);
/* Wake the zvol_worker() thread sleeping on zv_queue. */
264 wakeup_one(&zv->zv_queue);
265 mtx_unlock(&zv->zv_queue_mtx);
270 g_io_deliver(bp, EOPNOTSUPP);
/*
 * Service a single read/write bio against the volume's DMU object.
 * Takes a range lock over [off, off+resid) so the data cannot change
 * while dmu_sync() checksums it, then transfers up to zvol_maxphys
 * bytes per DMU transaction until the request is satisfied or the end
 * of the volume is reached.
 * NOTE(review): declarations of os/addr/rl, the read/write branch
 * selection, tx commit/abort, and the per-iteration advance of
 * off/addr/resid are elided from this listing.
 */
276 zvol_serve_one(zvol_state_t *zv, struct bio *bp)
278 uint64_t off, volsize;
286 off = bp->bio_offset;
287 volsize = zv->zv_volsize;
293 resid = bp->bio_length;
298 * There must be no buffer changes when doing a dmu_sync() because
299 * we can't change the data whilst calculating the checksum.
300 * A better approach than a per zvol rwlock would be to lock ranges.
302 reading = (bp->bio_cmd == BIO_READ);
303 rl = zfs_range_lock(&zv->zv_znode, off, resid,
304 reading ? RL_READER : RL_WRITER);
306 while (resid != 0 && off < volsize) {
308 size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
310 if (size > volsize - off) /* don't write past the end */
311 size = volsize - off;
314 error = dmu_read(os, ZVOL_OBJ, off, size, addr);
316 dmu_tx_t *tx = dmu_tx_create(os);
317 dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
318 error = dmu_tx_assign(tx, TXG_WAIT);
322 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
/* Record the write in the ZIL so it can be replayed after a crash. */
323 zvol_log_write(zv, tx, off, size);
333 zfs_range_unlock(rl);
/* Report partial completion; EINVAL if the request ran off the end. */
335 bp->bio_completed = bp->bio_length - resid;
336 if (bp->bio_completed < bp->bio_length)
337 bp->bio_error = (off > volsize ? EINVAL : error);
/*
 * Per-volume kernel thread: loop taking bios off zv_queue, servicing
 * them via zvol_serve_one(), committing the ZIL after non-read bios
 * (unless zil_disable), and delivering completion to GEOM.  When
 * zv_state is set to 1 by zvol_remove_minor(), the thread acknowledges
 * (wakeup on &zv->zv_state) and exits.
 * NOTE(review): the enclosing for(;;) loop, the empty-queue msleep
 * continuation, and the switch case labels are elided from this listing.
 */
341 zvol_worker(void *arg)
348 mtx_lock(&zv->zv_queue_mtx);
349 bp = bioq_takefirst(&zv->zv_queue);
351 if (zv->zv_state == 1) {
/* Handshake with zvol_remove_minor(), which waits for zv_state == 2. */
353 wakeup(&zv->zv_state);
354 mtx_unlock(&zv->zv_queue_mtx);
/* Queue empty: sleep until zvol_start() enqueues work (PDROP releases
 * the mutex atomically with the sleep). */
357 msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
361 mtx_unlock(&zv->zv_queue_mtx);
362 switch (bp->bio_cmd) {
367 zvol_serve_one(zv, bp);
/* Make synchronous writes durable before completing the bio. */
371 if (bp->bio_cmd != BIO_READ && !zil_disable)
372 zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
374 g_io_deliver(bp, bp->bio_error);
/*
 * dmu_objset_create() callback: initialize a new zvol objset.  Claims
 * the data object (ZVOL_OBJ) with the requested block size, creates the
 * property ZAP object (ZVOL_ZAP_OBJ), and records the volume size in it.
 * NOTE(review): ASSERT/error checks between the claims are elided.
 */
379 zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
381 zfs_create_data_t *zc = arg;
383 uint64_t volblocksize, volsize;
/* volsize is mandatory; volblocksize falls back to the property default. */
385 VERIFY(nvlist_lookup_uint64(zc->zc_props,
386 zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
387 if (nvlist_lookup_uint64(zc->zc_props,
388 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
389 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
392 * These properties must be removed from the list so the generic
393 * property setting step won't apply to them.
395 VERIFY(nvlist_remove_all(zc->zc_props,
396 zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
397 (void) nvlist_remove_all(zc->zc_props,
398 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
400 error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
404 error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
408 error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
413 * Replay a TX_WRITE ZIL transaction that didn't get committed
414 * after a system failure
417 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
419 objset_t *os = zv->zv_objset;
420 char *data = (char *)(lr + 1); /* data follows lr_write_t */
421 uint64_t off = lr->lr_offset;
422 uint64_t len = lr->lr_length;
/* Log records from the other endianness must be swapped before use. */
427 byteswap_uint64_array(lr, sizeof (*lr));
429 tx = dmu_tx_create(os);
430 dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
/* Replay assigns to the specific txg recorded at zil_replay() time. */
431 error = dmu_tx_assign(tx, zv->zv_txg_assign);
435 dmu_write(os, ZVOL_OBJ, off, len, data, tx);
/* NOTE(review): tx commit/abort and the return are elided here. */
/*
 * Catch-all replay handler for transaction types zvols never generate
 * (see zvol_replay_vector below).  Body elided from this listing;
 * presumably returns an error — TODO confirm against full source.
 */
444 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
450 * Callback vectors for replaying records.
451 * Only TX_WRITE is needed for zvol.
453 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
454 zvol_replay_err, /* 0 no such transaction type */
455 zvol_replay_err, /* TX_CREATE */
456 zvol_replay_err, /* TX_MKDIR */
457 zvol_replay_err, /* TX_MKXATTR */
458 zvol_replay_err, /* TX_SYMLINK */
459 zvol_replay_err, /* TX_REMOVE */
460 zvol_replay_err, /* TX_RMDIR */
461 zvol_replay_err, /* TX_LINK */
462 zvol_replay_err, /* TX_RENAME */
463 zvol_replay_write, /* TX_WRITE */
464 zvol_replay_err, /* TX_TRUNCATE */
465 zvol_replay_err, /* TX_SETATTR */
466 zvol_replay_err, /* TX_ACL */
470 * Create a minor node for the specified volume.
473 zvol_create_minor(const char *name, dev_t dev)
475 struct g_provider *pp;
479 dmu_object_info_t doi;
481 int ds_mode = DS_MODE_PRIMARY;
/* Refuse to create a second minor for an already-configured volume;
 * error return elided from this listing. */
487 if ((zv = zvol_minor_lookup(name)) != NULL) {
/* Snapshots ('@' in the name) are opened read-only. */
492 if (strchr(name, '@') != 0)
493 ds_mode |= DS_MODE_READONLY;
495 error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
/* Volume size is stored under "size" in the property ZAP object. */
501 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
505 dmu_objset_close(os);
/* Build the GEOM plumbing: one geom and one provider per volume. */
509 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
510 gp->start = zvol_start;
511 gp->access = zvol_access;
512 pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
513 pp->mediasize = volsize;
514 pp->sectorsize = DEV_BSIZE;
516 zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
517 (void) strcpy(zv->zv_name, name);
518 zv->zv_min_bs = DEV_BSHIFT;
519 zv->zv_provider = pp;
520 zv->zv_volsize = pp->mediasize;
522 zv->zv_mode = ds_mode;
523 zv->zv_zilog = zil_open(os, zvol_get_data);
/* Range-lock state is borrowed from the znode machinery. */
524 mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
525 avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
526 sizeof (rl_t), offsetof(rl_t, r_node));
529 /* get and cache the blocksize */
530 error = dmu_object_info(os, ZVOL_OBJ, &doi);
532 zv->zv_volblocksize = doi.doi_data_block_size;
/* Replay any TX_WRITE records left in the intent log from a crash. */
534 zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
536 /* XXX this should handle the possible i/o error */
537 VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
538 "readonly", zvol_readonly_changed_cb, zv) == 0);
/* Mark the provider usable and start the per-volume worker thread. */
541 g_error_provider(pp, 0);
543 bioq_init(&zv->zv_queue);
544 mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
546 kthread_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name);
557 * Remove minor node for the specified volume.
560 zvol_remove_minor(const char *name)
562 struct g_provider *pp;
/* Error returns for these two guards are elided from this listing. */
569 if ((zv = zvol_minor_lookup(name)) == NULL) {
/* Cannot remove a volume that is still open. */
574 if (zv->zv_total_opens != 0) {
579 VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
580 "readonly", zvol_readonly_changed_cb, zv) == 0);
/* Stop the worker thread: signal it (zv_state = 1 presumably set in an
 * elided line), then wait until it acknowledges by setting zv_state = 2. */
582 mtx_lock(&zv->zv_queue_mtx);
584 wakeup_one(&zv->zv_queue);
585 while (zv->zv_state != 2)
586 msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
587 mtx_unlock(&zv->zv_queue_mtx);
588 mtx_destroy(&zv->zv_queue_mtx);
/* Tear down the GEOM provider and release all per-volume resources. */
590 pp = zv->zv_provider;
592 g_wither_geom(pp->geom, ENXIO);
594 zil_close(zv->zv_zilog);
596 dmu_objset_close(zv->zv_objset);
597 zv->zv_objset = NULL;
598 avl_destroy(&zv->zv_znode.z_range_avl);
599 mutex_destroy(&zv->zv_znode.z_range_lock);
601 kmem_free(zv, sizeof(*zv));
/*
 * Resize an existing volume.  Validates the new size against the block
 * size, refuses on read-only volumes, records the new size in the ZAP,
 * frees any data beyond the new end, and updates the in-core/provider
 * sizes.  Error returns and tx commit/abort are elided from this listing.
 */
612 zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
617 dmu_object_info_t doi;
622 if ((zv = zvol_minor_lookup(name)) == NULL) {
627 if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
628 (error = zvol_check_volsize(volsize,
629 doi.doi_data_block_size)) != 0) {
633 if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
/* One tx covers both the ZAP size update and the tail free. */
638 tx = dmu_tx_create(zv->zv_objset);
639 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
640 dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
641 error = dmu_tx_assign(tx, TXG_WAIT);
647 error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
/* Shrinks discard all data past the new end of the volume. */
650 error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
657 zv->zv_volsize = volsize;
658 zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */
/*
 * Change the block size of an existing volume via
 * dmu_object_set_blocksize().  Fails on read-only volumes; ENOTSUP from
 * the DMU is handled specially (handling elided from this listing).
 */
668 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
677 if ((zv = zvol_minor_lookup(name)) == NULL) {
682 if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
687 tx = dmu_tx_create(zv->zv_objset);
688 dmu_tx_hold_bonus(tx, ZVOL_OBJ);
689 error = dmu_tx_assign(tx, TXG_WAIT);
693 error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
694 volblocksize, 0, tx);
695 if (error == ENOTSUP)
698 /* XXX: Not supported. */
/* BUG(review): 'zc' is not declared anywhere in this function's visible
 * scope — this almost certainly should use the local 'volblocksize'
 * parameter instead of 'zc->zc_volblocksize'.  Confirm against the full
 * source before changing. */
701 zv->zv_provider->sectorsize = zc->zc_volblocksize;
/*
 * Completion callback for the dmu_sync() issued by zvol_get_data():
 * releases the dbuf and range lock, notes the vdev the block landed on
 * for the ZIL, and frees the zgd bookkeeping structure.
 */
712 zvol_get_done(dmu_buf_t *db, void *vzgd)
714 zgd_t *zgd = (zgd_t *)vzgd;
715 rl_t *rl = zgd->zgd_rl;
717 dmu_buf_rele(db, vzgd);
718 zfs_range_unlock(rl);
719 zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
720 kmem_free(zgd, sizeof (zgd_t));
724 * Get data to generate a TX_WRITE intent log record.
727 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
729 zvol_state_t *zv = arg;
730 objset_t *os = zv->zv_objset;
734 uint64_t boff; /* block starting offset */
735 int dlen = lr->lr_length; /* length of user data */
742 * Write records come in two flavors: immediate and indirect.
743 * For small writes it's cheaper to store the data with the
744 * log record (immediate); for large writes it's cheaper to
745 * sync the data and get a pointer to it (indirect) so that
746 * we don't have to write the data twice.
748 if (buf != NULL) /* immediate write */
749 return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
/* Indirect write: sync the block out and point the log record at it. */
751 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
752 zgd->zgd_zilog = zv->zv_zilog;
753 zgd->zgd_bp = &lr->lr_blkptr;
756 * Lock the range of the block to ensure that when the data is
757 * written out and it's checksum is being calculated that no other
758 * thread can change the block.
760 boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
761 rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
765 VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
766 error = dmu_sync(zio, db, &lr->lr_blkptr,
767 lr->lr_common.lrc_txg, zvol_get_done, zgd);
/* On success (branch condition elided here), record the vdev now. */
769 zil_add_vdev(zv->zv_zilog,
770 DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
772 * If we get EINPROGRESS, then we need to wait for a
773 * write IO initiated by dmu_sync() to complete before
774 * we can release this dbuf. We will finish everything
775 * up in the zvol_get_done() callback.
777 if (error == EINPROGRESS)
779 dmu_buf_rele(db, zgd);
780 zfs_range_unlock(rl);
781 kmem_free(zgd, sizeof (zgd_t));
/* NOTE(review): the enclosing function definitions for the three lines
 * below are elided from this listing.  The first reads as a module-busy
 * predicate over zvol_minors; the other two read as init/fini log
 * statements — confirm against the full source. */
788 return (zvol_minors != 0);
794 ZFS_LOG(1, "ZVOL Initialized.");
800 ZFS_LOG(1, "ZVOL Deinitialized.");