4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23 * All rights reserved.
26 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
31 * ZFS volume emulation driver.
33 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
34 * Volumes are accessed through the symbolic links named:
36 * /dev/zvol/dsk/<pool_name>/<dataset_name>
37 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
39 * These links are created by the ZFS-specific devfsadm link generator.
40 * Volumes are persistent through reboot. No user command needs to be
41 * run before opening and using a device.
44 #include <sys/types.h>
45 #include <sys/param.h>
46 #include <sys/kernel.h>
47 #include <sys/errno.h>
53 #include <sys/cmn_err.h>
58 #include <sys/dmu_traverse.h>
59 #include <sys/dnode.h>
60 #include <sys/dsl_dataset.h>
61 #include <sys/dsl_prop.h>
63 #include <sys/byteorder.h>
64 #include <sys/sunddi.h>
65 #include <sys/dirent.h>
66 #include <sys/policy.h>
67 #include <sys/fs/zfs.h>
68 #include <sys/zfs_ioctl.h>
70 #include <sys/refcount.h>
71 #include <sys/zfs_znode.h>
72 #include <sys/zfs_rlock.h>
73 #include <sys/vdev_impl.h>
75 #include <geom/geom.h>
77 #include "zfs_namecheck.h"
79 #define ZVOL_DUMPSIZE "dumpsize"
81 struct g_class zfs_zvol_class = {
86 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
89 * This lock protects the zvol_state structure from being modified
90 * while it's being used, e.g. an open that comes in before a create
91 * finishes. It also protects temporary opens of the dataset so that,
92 * e.g., an open doesn't get a spurious EBUSY.
94 static kmutex_t zvol_state_lock;
95 static uint32_t zvol_minors;
/*
 * One physically contiguous run of on-disk blocks.  A list of these
 * (zv_extents, built by zvol_get_lbas()) describes the LBA layout of a
 * dumpified zvol.  NOTE(review): closing brace / typedef name elided
 * from this view.
 */
97 typedef struct zvol_extent {
99 	dva_t ze_dva; /* dva associated with this extent */
100 	uint64_t ze_nblks; /* number of blocks in extent */
104 * The in-core state of each volume.
/*
 * Per-volume in-core state; one instance per minor, looked up through
 * the GEOM provider's private pointer (pp->private).  Protected by
 * zvol_state_lock.  NOTE(review): some fields (e.g. zv_state, used by
 * zvol_worker()) are elided from this view.
 */
106 typedef struct zvol_state {
107 	char zv_name[MAXPATHLEN]; /* pool/dd name */
108 	uint64_t zv_volsize; /* amount of space we advertise */
109 	uint64_t zv_volblocksize; /* volume block size */
110 	struct g_provider *zv_provider; /* GEOM provider */
111 	uint8_t zv_min_bs; /* minimum addressable block shift */
112 	uint8_t zv_flags; /* readonly; dumpified */
113 	objset_t *zv_objset; /* objset handle */
114 	uint32_t zv_mode; /* DS_MODE_* flags at open time */
115 	uint32_t zv_total_opens; /* total open count */
116 	zilog_t *zv_zilog; /* ZIL handle */
117 	list_t zv_extents; /* List of extents for dump */
118 	uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
119 	znode_t zv_znode; /* for range locking */
121 	struct bio_queue_head zv_queue; /* pending bios for the worker thread */
122 	struct mtx zv_queue_mtx; /* zv_queue mutex */
126 * zvol specific flags
128 #define ZVOL_RDONLY 0x1
129 #define ZVOL_DUMPIFIED 0x2
130 #define ZVOL_EXCL 0x4
133 * zvol maximum transfer in one DMU tx.
135 int zvol_maxphys = DMU_MAX_ACCESS/2;
137 extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
138 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
139 static int zvol_dumpify(zvol_state_t *zv);
140 static int zvol_dump_fini(zvol_state_t *zv);
141 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
/*
 * Propagate the in-core volume size (zv_volsize) to the GEOM provider's
 * mediasize.  Does nothing if the size already matches, and refuses to
 * resize while the provider is open (zv_total_opens > 0), since GEOM
 * does not really support changing the size of an open provider.
 * NOTE(review): 'maj' is unused in the visible lines -- presumably a
 * leftover from the Solaris devfs interface; confirm against callers.
 */
144 zvol_size_changed(zvol_state_t *zv, major_t maj)
146 	struct g_provider *pp;
150 	pp = zv->zv_provider;
153 	if (zv->zv_volsize == pp->mediasize)
156 	 * Changing provider size is not really supported by GEOM, but it
157 	 * should be safe when provider is closed.
159 	if (zv->zv_total_opens > 0)
161 	pp->mediasize = zv->zv_volsize;
/*
 * Sanity-check a proposed volume size: it must be a whole multiple of
 * the block size and must not exceed the maximum representable offset.
 * NOTE(review): return statements are elided from this view; presumably
 * returns 0 on success and an errno on failure -- confirm in full source.
 */
165 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
170 	if (volsize % blocksize != 0)
174 	if (volsize - 1 > SPEC_MAXOFFSET_T)
/*
 * Sanity-check a proposed volume block size against the SPA limits
 * (SPA_MINBLOCKSIZE .. SPA_MAXBLOCKSIZE).  NOTE(review): additional
 * condition(s) and the return statements are elided from this view.
 */
181 zvol_check_volblocksize(uint64_t volblocksize)
183 	if (volblocksize < SPA_MINBLOCKSIZE ||
184 	    volblocksize > SPA_MAXBLOCKSIZE ||
/*
 * dsl_prop_register() callback: mirror the dataset's "readonly"
 * property into the ZVOL_RDONLY bit of zv_flags.  NOTE(review): the
 * conditional on 'newval' between the two assignments is elided.
 */
192 zvol_readonly_changed_cb(void *arg, uint64_t newval)
194 	zvol_state_t *zv = arg;
197 		zv->zv_flags |= ZVOL_RDONLY;
199 		zv->zv_flags &= ~ZVOL_RDONLY;
/*
 * Populate 'nv' with the volume's statistics: volsize from the "size"
 * entry of the ZVOL ZAP object, and volblocksize from the DMU object
 * info of ZVOL_OBJ.  Error-handling lines are elided from this view.
 */
203 zvol_get_stats(objset_t *os, nvlist_t *nv)
206 	dmu_object_info_t doi;
210 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
214 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
216 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
219 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
220 	    doi.doi_data_block_size);
/*
 * Find the zvol_state_t for dataset 'name' by walking every provider of
 * every geom in zfs_zvol_class and comparing the provider name with the
 * ZVOL_DEV_DIR prefix stripped.  Caller must hold zvol_state_lock.
 * Returns the provider's private pointer (the zvol_state_t) on a match;
 * the NULL-on-miss return is elided from this view.
 */
226 static zvol_state_t *
227 zvol_minor_lookup(const char *name)
229 	struct g_provider *pp;
233 	ASSERT(MUTEX_HELD(&zvol_state_lock));
235 	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
236 		LIST_FOREACH(pp, &gp->provider, provider) {
			/* skip "<ZVOL_DEV_DIR>/" prefix when comparing */
237 			if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
238 				return (pp->private);
/*
 * GEOM access method (open/close).  acr/acw/ace are the *deltas* to the
 * read/write/exclusive reference counts.  A pure drop of access counts
 * is always allowed; a request for write access on a read-only volume
 * is rejected (error value elided from this view).  On success the
 * total open count is updated and any pending size change is pushed to
 * the provider.  All under zvol_state_lock.
 */
246 zvol_access(struct g_provider *pp, int acr, int acw, int ace)
251 	mutex_enter(&zvol_state_lock);
	/* all counts dropping or unchanged: always permitted */
255 	if (acr <= 0 && acw <= 0 && ace <= 0)
257 		mutex_exit(&zvol_state_lock);
261 	ASSERT(zv->zv_objset != NULL);
	/* refuse write access on a read-only volume/dataset */
264 	    ((zv->zv_flags & ZVOL_RDONLY) ||
265 	    (zv->zv_mode & DS_MODE_READONLY))) {
266 		mutex_exit(&zvol_state_lock);
270 	zv->zv_total_opens += acr + acw + ace;
271 	zvol_size_changed(zv, 0);
273 	mutex_exit(&zvol_state_lock);
279 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
281 * We store data in the log buffers if it's small enough.
282 * Otherwise we will later flush the data out via dmu_sync().
/* Writes at or below this size are copied into the ZIL record itself. */
284 ssize_t zvol_immediate_write_sz = 32768;
/*
 * Log one write as a TX_WRITE intent-log transaction.  The write is
 * clipped to a single volume block (nbytes).  Writes larger than
 * zvol_immediate_write_sz are logged WR_INDIRECT (data synced later via
 * dmu_sync() -- see zvol_get_data()); smaller ones WR_NEED_COPY.
 * NOTE(review): lr_offset assignment and any loop over multiple blocks
 * are elided from this view.
 */
287 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
289 	uint32_t blocksize = zv->zv_volblocksize;
	/* clip to the end of the block containing 'off' */
293 	ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
294 	itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
297 	    len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
298 	itx->itx_private = zv;
299 	lr = (lr_write_t *)&itx->itx_lr;
300 	lr->lr_foid = ZVOL_OBJ;
302 	lr->lr_length = nbytes;
	/* offset of this write within its volume block */
303 	lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
304 	BP_ZERO(&lr->lr_blkptr);
306 	(void) zil_itx_assign(zv->zv_zilog, itx, tx);
/*
 * GEOM start method: entry point for every bio sent to the provider.
 * Read/write/flush bios (exact case labels elided from this view) are
 * queued on zv_queue and the worker thread is woken; the "ZFS::iszvol"
 * getattr is answered inline; anything else fails with EOPNOTSUPP.
 */
313 zvol_start(struct bio *bp)
317 	switch (bp->bio_cmd) {
321 		zv = bp->bio_to->private;
323 		mtx_lock(&zv->zv_queue_mtx);
324 		bioq_insert_tail(&zv->zv_queue, bp);
		/* hand the bio to zvol_worker() */
325 		wakeup_one(&zv->zv_queue);
326 		mtx_unlock(&zv->zv_queue_mtx);
329 		if (g_handleattr_int(bp, "ZFS::iszvol", 1))
334 		g_io_deliver(bp, EOPNOTSUPP);
/*
 * Service a single read or write bio against the DMU.  The affected
 * byte range is range-locked (reader for BIO_READ, writer for
 * BIO_WRITE) so that data cannot change while dmu_sync() is computing
 * checksums, then transferred in chunks of at most zvol_maxphys bytes,
 * each write in its own DMU transaction and logged to the ZIL.
 * NOTE(review): several lines (addr advance, resid/off updates,
 * tx_commit, error break) are elided from this view.
 */
340 zvol_serve_one(zvol_state_t *zv, struct bio *bp)
342 	uint64_t off, volsize;
348 	boolean_t doread = (bp->bio_cmd == BIO_READ);
350 	off = bp->bio_offset;
351 	volsize = zv->zv_volsize;
357 	resid = bp->bio_length;
362 	 * There must be no buffer changes when doing a dmu_sync() because
363 	 * we can't change the data whilst calculating the checksum.
364 	 * A better approach than a per zvol rwlock would be to lock ranges.
366 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
367 	    doread ? RL_READER : RL_WRITER);
369 	while (resid != 0 && off < volsize) {
370 		size_t size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
372 		if (size > volsize - off)	/* don't write past the end */
373 			size = volsize - off;
376 			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
378 			dmu_tx_t *tx = dmu_tx_create(os);
379 			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
380 			error = dmu_tx_assign(tx, TXG_WAIT);
384 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				/* make the write replayable from the ZIL */
385 				zvol_log_write(zv, tx, off, size);
390 		/* convert checksum errors into IO errors */
399 	zfs_range_unlock(rl);
401 	bp->bio_completed = bp->bio_length - resid;
402 	if (bp->bio_completed < bp->bio_length)
403 		bp->bio_error = (off > volsize ? EINVAL : error);
/*
 * Per-volume worker thread (created in zvol_create_minor()).  Runs at
 * PRIBIO, dequeuing bios from zv_queue: data bios go to
 * zvol_serve_one(); a BIO_FLUSH commits the ZIL (unless zil_disable).
 * zv_state == 1 is the shutdown request from zvol_remove_minor(); the
 * thread acknowledges it (by setting zv_state, elided here) and wakes
 * the remover before exiting.
 */
407 zvol_worker(void *arg)
412 	thread_lock(curthread);
413 	sched_prio(curthread, PRIBIO);
414 	thread_unlock(curthread);
418 		mtx_lock(&zv->zv_queue_mtx);
419 		bp = bioq_takefirst(&zv->zv_queue);
		/* empty queue: either exit on request, or sleep for work */
421 			if (zv->zv_state == 1) {
423 				wakeup(&zv->zv_state);
424 				mtx_unlock(&zv->zv_queue_mtx);
427 			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
431 		mtx_unlock(&zv->zv_queue_mtx);
432 		switch (bp->bio_cmd) {
437 			zvol_serve_one(zv, bp);
441 		if (bp->bio_cmd == BIO_FLUSH && !zil_disable)
442 			zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
444 		g_io_deliver(bp, bp->bio_error);
448 /* extent mapping arg */
/*
 * traverse_dataset() callback used by zvol_get_lbas() to build the
 * extent list for a dump device.  Level-0 data blocks of ZVOL_OBJ must
 * arrive in blkid order (VERIFY below); each block either extends the
 * tail extent (when its DVA is physically contiguous with it) or starts
 * a new extent.  Gang blocks abort the traversal (check elided here).
 */
456 zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
457     const dnode_phys_t *dnp, void *arg)
459 	struct maparg *ma = arg;
461 	int bs = ma->ma_zv->zv_volblocksize;
	/* only level-0 data blocks of the volume object are mapped */
463 	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
466 	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
469 	/* Abort immediately if we have encountered gang blocks */
474 	 * See if the block is at the end of the previous extent.
476 	ze = list_tail(&ma->ma_zv->zv_extents);
478 	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
479 	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
480 	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
485 	dprintf_bp(bp, "%s", "next blkptr:");
487 	/* start a new extent */
488 	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
489 	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
491 	list_insert_tail(&ma->ma_zv->zv_extents, ze);
/*
 * Release every zvol_extent_t on zv_extents, emptying the dump LBA map.
 */
496 zvol_free_extents(zvol_state_t *zv)
500 	while (ze = list_head(&zv->zv_extents)) {
501 		list_remove(&zv->zv_extents, ze);
502 		kmem_free(ze, sizeof (zvol_extent_t));
/*
 * Rebuild the volume's extent list by traversing the dataset with
 * zvol_map_block().  If the traversal fails, or the number of mapped
 * blocks does not equal volsize/volblocksize (i.e. the volume is not
 * fully and singly allocated), the partial list is freed and an error
 * (err or EIO) is returned.
 */
507 zvol_get_lbas(zvol_state_t *zv)
514 	zvol_free_extents(zv);
516 	err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0,
517 	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
518 	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
519 		zvol_free_extents(zv);
520 		return (err ? err : EIO);
/*
 * dmu_objset_create() callback: initialize a brand-new zvol objset.
 * Pulls volsize (required) and volblocksize (optional, defaulted) from
 * the creation properties, removes them from the nvlist so generic
 * property setting skips them, then claims the data object (ZVOL_OBJ)
 * and the property ZAP (ZVOL_ZAP_OBJ) and records "size" in the ZAP.
 */
528 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
530 	zfs_creat_t *zct = arg;
531 	nvlist_t *nvprops = zct->zct_props;
533 	uint64_t volblocksize, volsize;
535 	VERIFY(nvlist_lookup_uint64(nvprops,
536 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
537 	if (nvlist_lookup_uint64(nvprops,
538 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
539 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
542 	 * These properties must be removed from the list so the generic
543 	 * property setting step won't apply to them.
545 	VERIFY(nvlist_remove_all(nvprops,
546 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
547 	(void) nvlist_remove_all(nvprops,
548 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
550 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
554 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
558 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
563 * Replay a TX_WRITE ZIL transaction that didn't get committed
564 * after a system failure
/*
 * ZIL replay handler for TX_WRITE: re-apply a logged write (data
 * follows the lr_write_t record) in a DMU transaction assigned to
 * zv_txg_assign.  Byteswaps the record first when the log was written
 * with the opposite endianness.  Commit/abort lines elided here.
 */
567 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
569 	objset_t *os = zv->zv_objset;
570 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
571 	uint64_t off = lr->lr_offset;
572 	uint64_t len = lr->lr_length;
577 		byteswap_uint64_array(lr, sizeof (*lr));
579 	tx = dmu_tx_create(os);
580 	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
581 	error = dmu_tx_assign(tx, zv->zv_txg_assign);
585 		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
594 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
600 * Callback vectors for replaying records.
601 * Only TX_WRITE is needed for zvol.
/*
 * ZIL replay dispatch table, indexed by transaction type.  A zvol only
 * ever logs TX_WRITE; every other slot points at zvol_replay_err.
 */
603 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
604 	zvol_replay_err,	/* 0 no such transaction type */
605 	zvol_replay_err,	/* TX_CREATE */
606 	zvol_replay_err,	/* TX_MKDIR */
607 	zvol_replay_err,	/* TX_MKXATTR */
608 	zvol_replay_err,	/* TX_SYMLINK */
609 	zvol_replay_err,	/* TX_REMOVE */
610 	zvol_replay_err,	/* TX_RMDIR */
611 	zvol_replay_err,	/* TX_LINK */
612 	zvol_replay_err,	/* TX_RENAME */
613 	zvol_replay_write,	/* TX_WRITE */
614 	zvol_replay_err,	/* TX_TRUNCATE */
615 	zvol_replay_err,	/* TX_SETATTR */
616 	zvol_replay_err,	/* TX_ACL */
620 * Create a minor node (plus a whole lot more) for the specified volume.
/*
 * Create the GEOM geom/provider pair and all in-core state for volume
 * 'name': open the objset (read-only for snapshots, i.e. names
 * containing '@'), read "size" from the ZAP, allocate and initialize
 * the zvol_state_t (range-lock AVL, extent list, ZIL), replay the ZIL,
 * register the "readonly" property callback, publish the provider, and
 * start the per-volume worker thread.  Error-unwind paths are elided
 * from this view.  NOTE(review): 'maj' appears unused in the visible
 * lines -- confirm against the full source.
 */
623 zvol_create_minor(const char *name, major_t maj)
625 	struct g_provider *pp;
629 	dmu_object_info_t doi;
631 	int ds_mode = DS_MODE_OWNER;
636 	mutex_enter(&zvol_state_lock);
	/* refuse to create a second minor for the same dataset */
638 	if ((zv = zvol_minor_lookup(name)) != NULL) {
	/* snapshots are inherently read-only */
643 	if (strchr(name, '@') != 0)
644 		ds_mode |= DS_MODE_READONLY;
646 	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
650 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
652 		dmu_objset_close(os);
656 	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
657 	gp->start = zvol_start;
658 	gp->access = zvol_access;
659 	pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
660 	pp->mediasize = volsize;
661 	pp->sectorsize = DEV_BSIZE;
663 	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
664 	(void) strcpy(zv->zv_name, name);
665 	zv->zv_min_bs = DEV_BSHIFT;
666 	zv->zv_provider = pp;
667 	zv->zv_volsize = pp->mediasize;
669 	zv->zv_mode = ds_mode;
670 	zv->zv_zilog = zil_open(os, zvol_get_data);
671 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
672 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
673 	    sizeof (rl_t), offsetof(rl_t, r_node));
674 	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
675 	    offsetof(zvol_extent_t, ze_node));
676 	/* get and cache the blocksize */
677 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
679 	zv->zv_volblocksize = doi.doi_data_block_size;
	/* re-apply any writes logged before the last crash */
681 	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
683 	/* XXX this should handle the possible i/o error */
684 	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
685 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
	/* make the provider visible to consumers */
688 	g_error_provider(pp, 0);
690 	bioq_init(&zv->zv_queue);
691 	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
693 	kproc_kthread_add(zvol_worker, zv, &zfsproc, NULL, 0, 0, "zfskern",
694 	    "zvol %s", pp->name + strlen(ZVOL_DEV_DIR) + 1);
698 	mutex_exit(&zvol_state_lock);
706 * Remove minor node for the specified volume.
/*
 * Tear down the minor created by zvol_create_minor(): fails (error
 * value elided) if the volume does not exist or is still open.
 * Otherwise unregisters the "readonly" callback, asks the worker thread
 * to exit (zv_state handshake: request is set before the wakeup -- the
 * assignment is elided here -- and the worker sets zv_state to 2 when
 * done), withers the geom, closes the ZIL and objset, and frees all
 * in-core state.
 */
709 zvol_remove_minor(const char *name)
711 	struct g_provider *pp;
717 	mutex_enter(&zvol_state_lock);
719 	if ((zv = zvol_minor_lookup(name)) == NULL) {
	/* cannot remove while the device is open */
724 	if (zv->zv_total_opens != 0) {
729 	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
730 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
732 	mtx_lock(&zv->zv_queue_mtx);
734 	wakeup_one(&zv->zv_queue);
	/* wait for the worker thread to acknowledge and exit */
735 	while (zv->zv_state != 2)
736 		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
737 	mtx_unlock(&zv->zv_queue_mtx);
738 	mtx_destroy(&zv->zv_queue_mtx);
740 	pp = zv->zv_provider;
742 	g_wither_geom(pp->geom, ENXIO);
744 	zil_close(zv->zv_zilog);
746 	dmu_objset_close(zv->zv_objset);
747 	zv->zv_objset = NULL;
748 	avl_destroy(&zv->zv_znode.z_range_avl);
749 	mutex_destroy(&zv->zv_znode.z_range_lock);
751 	kmem_free(zv, sizeof(*zv));
755 	mutex_exit(&zvol_state_lock);
/*
 * Pre-allocate every block of the volume (required before it can be a
 * dump device) by writing zeros across its whole length in
 * SPA_MAXBLOCKSIZE chunks, one DMU tx each.  Fails up front (error
 * value elided) when the pool lacks zv_volsize of free space; on a tx
 * assignment failure the blocks written so far are freed again.
 * Finishes with txg_wait_synced() so the allocations are on disk.
 */
763 zvol_prealloc(zvol_state_t *zv)
765 	objset_t *os = zv->zv_objset;
768 	uint64_t refd, avail, usedobjs, availobjs;
769 	uint64_t resid = zv->zv_volsize;
772 	/* Check the space usage before attempting to allocate the space */
773 	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
774 	if (avail < zv->zv_volsize)
777 	/* Free old extents if they exist */
778 	zvol_free_extents(zv);
780 	/* allocate the blocks by writing each one */
781 	data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
785 		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
787 		tx = dmu_tx_create(os);
788 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
789 		error = dmu_tx_assign(tx, TXG_WAIT);
		/* on failure, undo what we've allocated so far */
792 			kmem_free(data, SPA_MAXBLOCKSIZE);
793 			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
796 		dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
801 	kmem_free(data, SPA_MAXBLOCKSIZE);
802 	txg_wait_synced(dmu_objset_pool(os), 0);
/*
 * Persist a new volume size: update the "size" ZAP entry in a tx, free
 * any data past the new end of the volume, and -- unless this is a
 * faked-up state with no provider -- update zv_volsize and push the
 * change to GEOM.  Caller must hold zvol_state_lock.
 */
808 zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
813 	ASSERT(MUTEX_HELD(&zvol_state_lock));
815 	tx = dmu_tx_create(zv->zv_objset);
816 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
817 	error = dmu_tx_assign(tx, TXG_WAIT);
823 	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
	/* shrinks: discard data beyond the new size */
828 		error = dmu_free_long_range(zv->zv_objset,
829 		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
832 	 * If we are using a faked-up state (zv_provider == NULL) then don't
833 	 * try to update the in-core zvol state.
835 	if (error == 0 && zv->zv_provider) {
836 		zv->zv_volsize = volsize;
837 		zvol_size_changed(zv, maj);
/*
 * Public entry point for "zfs set volsize=".  If no minor exists yet
 * (e.g. "zfs clone -o volsize="), a temporary faked-up zvol_state_t
 * ('state', zv_provider == NULL) is used over a freshly opened objset.
 * Validates the new size, rejects read-only volumes, applies the change
 * via zvol_update_volsize(), and for dumpified volumes re-dumpifies at
 * the new size -- rolling back to old_volsize if that fails.
 */
843 zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
847 	dmu_object_info_t doi;
848 	uint64_t old_volsize = 0ULL;
849 	zvol_state_t state = { 0 };
853 	mutex_enter(&zvol_state_lock);
855 	if ((zv = zvol_minor_lookup(name)) == NULL) {
857 		 * If we are doing a "zfs clone -o volsize=", then the
858 		 * minor node won't exist yet.
860 		error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER,
866 	old_volsize = zv->zv_volsize;
868 	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
869 	    (error = zvol_check_volsize(volsize,
870 	    doi.doi_data_block_size)) != 0)
873 	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
878 	error = zvol_update_volsize(zv, maj, volsize);
882 	 * Reinitialize the dump area to the new size. If we
883 	 * failed to resize the dump area then restore the it back to
884 	 * it's original size.
886 	if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
887 		if ((error = zvol_dumpify(zv)) != 0 ||
888 		    (error = dumpvp_resize()) != 0) {
			/* resize failed: best-effort rollback */
889 			(void) zvol_update_volsize(zv, maj, old_volsize);
890 			error = zvol_dumpify(zv);
	/* close the temporary objset if we faked up the state */
897 		dmu_objset_close(state.zv_objset);
899 	mutex_exit(&zvol_state_lock);
/*
 * Public entry point for "zfs set volblocksize=": change the block size
 * of ZVOL_OBJ in a DMU tx and cache the new value in zv_volblocksize.
 * Fails (error values elided) when the minor does not exist or the
 * volume is read-only; an ENOTSUP from dmu_object_set_blocksize() gets
 * remapped (target value elided from this view).
 */
907 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
915 	mutex_enter(&zvol_state_lock);
917 	if ((zv = zvol_minor_lookup(name)) == NULL) {
921 	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
926 	tx = dmu_tx_create(zv->zv_objset);
927 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
928 	error = dmu_tx_assign(tx, TXG_WAIT);
932 		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
933 		    volblocksize, 0, tx);
934 		if (error == ENOTSUP)
938 			zv->zv_volblocksize = volblocksize;
941 	mutex_exit(&zvol_state_lock);
/*
 * dmu_sync() completion callback for zvol_get_data(): release the dbuf
 * and range lock, record the just-written block in the ZIL, and free
 * the zgd bookkeeping structure.
 */
949 zvol_get_done(dmu_buf_t *db, void *vzgd)
951 	zgd_t *zgd = (zgd_t *)vzgd;
952 	rl_t *rl = zgd->zgd_rl;
954 	dmu_buf_rele(db, vzgd);
955 	zfs_range_unlock(rl);
956 	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
957 	kmem_free(zgd, sizeof (zgd_t));
961 * Get data to generate a TX_WRITE intent log record.
/*
 * ZIL get-data callback (registered via zil_open() in
 * zvol_create_minor()): supply the data for a TX_WRITE log record.
 * Immediate writes (buf != NULL) are satisfied with a plain dmu_read();
 * indirect writes range-lock the containing volume block, hold its
 * dbuf, and dmu_sync() it so the log can point at the synced block.
 * EINPROGRESS means zvol_get_done() will do the cleanup asynchronously;
 * otherwise it is done inline here.
 */
964 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
966 	zvol_state_t *zv = arg;
967 	objset_t *os = zv->zv_objset;
971 	uint64_t boff; 			/* block starting offset */
972 	int dlen = lr->lr_length;	/* length of user data */
979 	 * Write records come in two flavors: immediate and indirect.
980 	 * For small writes it's cheaper to store the data with the
981 	 * log record (immediate); for large writes it's cheaper to
982 	 * sync the data and get a pointer to it (indirect) so that
983 	 * we don't have to write the data twice.
985 	if (buf != NULL) /* immediate write */
986 		return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
988 	zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
989 	zgd->zgd_zilog = zv->zv_zilog;
990 	zgd->zgd_bp = &lr->lr_blkptr;
993 	 * Lock the range of the block to ensure that when the data is
994 	 * written out and its checksum is being calculated that no other
995 	 * thread can change the block.
997 	boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
998 	rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
1002 	VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
1003 	error = dmu_sync(zio, db, &lr->lr_blkptr,
1004 	    lr->lr_common.lrc_txg, zvol_get_done, zgd);
1006 		zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
1008 	 * If we get EINPROGRESS, then we need to wait for a
1009 	 * write IO initiated by dmu_sync() to complete before
1010 	 * we can release this dbuf. We will finish everything
1011 	 * up in the zvol_get_done() callback.
1013 	if (error == EINPROGRESS)
1015 	dmu_buf_rele(db, zgd);
1016 	zfs_range_unlock(rl);
1017 	kmem_free(zgd, sizeof (zgd_t));
1024 return (zvol_minors != 0);
1030 mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
1031 ZFS_LOG(1, "ZVOL Initialized.");
1037 mutex_destroy(&zvol_state_lock);
1038 ZFS_LOG(1, "ZVOL Deinitialized.");
/*
 * Return B_TRUE if the zvol's device node is currently in use as a swap
 * device: build the /dev path for the volume, look up its vnode, and
 * test it with IS_SWAPVP().  Used by zvol_dumpify() to refuse dump
 * devices that are also swap.  (vnode release path elided from this
 * view.)
 */
1042 zvol_is_swap(zvol_state_t *zv)
1045 	boolean_t ret = B_FALSE;
1051 	devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1;
1052 	devpath = kmem_alloc(devpathlen, KM_SLEEP);
1053 	(void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
1054 	error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
1055 	kmem_free(devpath, devpathlen);
1057 	ret = !error && IS_SWAPVP(common_specvp(vp));
/*
 * Prepare the volume for use as a dump device.  When resizing an
 * existing dump zvol, only the saved refreservation in the ZAP needs to
 * track the new volsize.  On first-time dumpification the volume's
 * current compression/checksum/refreservation/volblocksize properties
 * are stashed in the ZAP (so zvol_dump_fini() can restore them), the
 * data object is truncated, the live properties are forced to the
 * dump-safe values (no compression, no checksum, no refreservation,
 * SPA_MAXBLOCKSIZE blocks), and finally every block is pre-allocated
 * via zvol_prealloc().  Caller must hold zvol_state_lock.
 */
1067 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1071 	objset_t *os = zv->zv_objset;
1072 	nvlist_t *nv = NULL;
1074 	ASSERT(MUTEX_HELD(&zvol_state_lock));
1076 	tx = dmu_tx_create(os);
1077 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1078 	error = dmu_tx_assign(tx, TXG_WAIT);
1085 	 * If we are resizing the dump device then we only need to
1086 	 * update the refreservation to match the newly updated
1087 	 * zvolsize. Otherwise, we save off the original state of the
1088 	 * zvol so that we can restore them if the zvol is ever undumpified.
1091 		error = zap_update(os, ZVOL_ZAP_OBJ,
1092 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1093 		    &zv->zv_volsize, tx);
1095 		uint64_t checksum, compress, refresrv, vbs;
		/* read the current property values ... */
1097 		error = dsl_prop_get_integer(zv->zv_name,
1098 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
1099 		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1100 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
1101 		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1102 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
1103 		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1104 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
		/* ... and stash them in the ZAP for zvol_dump_fini() */
1106 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1107 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
1109 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1110 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
1111 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1112 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1114 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1115 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
1120 	/* Truncate the file */
1122 		error = dmu_free_long_range(zv->zv_objset,
1123 		    ZVOL_OBJ, 0, DMU_OBJECT_END);
1129 	 * We only need update the zvol's property if we are initializing
1130 	 * the dump area for the first time.
	/* force the live properties to dump-safe values */
1133 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1134 		VERIFY(nvlist_add_uint64(nv,
1135 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
1136 		VERIFY(nvlist_add_uint64(nv,
1137 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
1138 		    ZIO_COMPRESS_OFF) == 0);
1139 		VERIFY(nvlist_add_uint64(nv,
1140 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
1141 		    ZIO_CHECKSUM_OFF) == 0);
1142 		VERIFY(nvlist_add_uint64(nv,
1143 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
1144 		    SPA_MAXBLOCKSIZE) == 0);
1146 		error = zfs_set_prop_nvlist(zv->zv_name, nv);
1153 	/* Allocate the space for the dump */
1154 	error = zvol_prealloc(zv);
/*
 * Turn the volume into a usable dump device.  Rejects read-only volumes
 * and volumes in use as swap.  If the recorded dump size (ZVOL_DUMPSIZE
 * in the ZAP) is absent or stale, (re)initializes via zvol_dump_init()
 * -- treating a previous nonzero size as a resize.  Then builds the LBA
 * extent map, marks the volume ZVOL_DUMPIFIED, records the new dump
 * size, and waits for the txg to sync.  Every failure path undoes the
 * dumpification with zvol_dump_fini().
 */
1159 zvol_dumpify(zvol_state_t *zv)
1162 	uint64_t dumpsize = 0;
1164 	objset_t *os = zv->zv_objset;
1166 	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))
1170 	 * We do not support swap devices acting as dump devices.
1172 	if (zvol_is_swap(zv))
1175 	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
1176 	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
		/* a nonzero stale size means we are resizing, not creating */
1177 		boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;
1179 		if ((error = zvol_dump_init(zv, resize)) != 0) {
1180 			(void) zvol_dump_fini(zv);
1186 	 * Build up our lba mapping.
1188 	error = zvol_get_lbas(zv);
1190 		(void) zvol_dump_fini(zv);
1194 	tx = dmu_tx_create(os);
1195 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1196 	error = dmu_tx_assign(tx, TXG_WAIT);
1199 		(void) zvol_dump_fini(zv);
1203 	zv->zv_flags |= ZVOL_DUMPIFIED;
1204 	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
1205 	    &zv->zv_volsize, tx);
1209 		(void) zvol_dump_fini(zv);
	/* make sure everything is on stable storage before dumping to it */
1213 	txg_wait_synced(dmu_objset_pool(os), 0);
1218 zvol_dump_fini(zvol_state_t *zv)
1221 objset_t *os = zv->zv_objset;
1224 uint64_t checksum, compress, refresrv, vbs;
1227 * Attempt to restore the zvol back to its pre-dumpified state.
1228 * This is a best-effort attempt as it's possible that not all
1229 * of these properties were initialized during the dumpify process
1230 * (i.e. error during zvol_dump_init).
1233 tx = dmu_tx_create(os);
1234 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1235 error = dmu_tx_assign(tx, TXG_WAIT);
1240 (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
1243 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1244 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
1245 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1246 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
1247 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1248 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
1249 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1250 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
1252 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1253 (void) nvlist_add_uint64(nv,
1254 zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
1255 (void) nvlist_add_uint64(nv,
1256 zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
1257 (void) nvlist_add_uint64(nv,
1258 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
1259 (void) nvlist_add_uint64(nv,
1260 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
1261 (void) zfs_set_prop_nvlist(zv->zv_name, nv);
1264 zvol_free_extents(zv);
1265 zv->zv_flags &= ~ZVOL_DUMPIFIED;
1266 (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);