4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
27 * ZFS volume emulation driver.
29 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
30 * Volumes are accessed through the symbolic links named:
32 * /dev/<pool_name>/<dataset_name>
34 * Volumes are persistent through reboot and module load. No user command
35 * needs to be run before opening and using a device.
39 #include <sys/dmu_traverse.h>
40 #include <sys/dsl_dataset.h>
41 #include <sys/dsl_prop.h>
43 #include <sys/zfeature.h>
44 #include <sys/zil_impl.h>
46 #include <sys/zfs_rlock.h>
47 #include <sys/zfs_znode.h>
49 #include <linux/blkdev_compat.h>
51 unsigned int zvol_inhibit_dev = 0;
52 unsigned int zvol_major = ZVOL_MAJOR;
53 unsigned int zvol_prefetch_bytes = (128 * 1024);
54 unsigned long zvol_max_discard_blocks = 16384;
56 static kmutex_t zvol_state_lock;
57 static list_t zvol_state_list;
58 static char *zvol_tag = "zvol_tag";
61 * The in-core state of each volume.
63 typedef struct zvol_state {
64 char zv_name[MAXNAMELEN]; /* name */
65 uint64_t zv_volsize; /* advertised space */
66 uint64_t zv_volblocksize; /* volume block size */
67 objset_t *zv_objset; /* objset handle */
68 uint32_t zv_flags; /* ZVOL_* flags */
69 uint32_t zv_open_count; /* open counts */
70 uint32_t zv_changed; /* disk changed */
71 zilog_t *zv_zilog; /* ZIL handle */
72 znode_t zv_znode; /* for range locking */
73 dmu_buf_t *zv_dbuf; /* bonus handle */
74 dev_t zv_dev; /* device id */
75 struct gendisk *zv_disk; /* generic disk */
76 struct request_queue *zv_queue; /* request queue */
77 spinlock_t zv_lock; /* request queue lock */
78 list_node_t zv_next; /* next zvol_state_t linkage */
81 #define ZVOL_RDONLY 0x1
84 * Find the next available range of ZVOL_MINORS minor numbers. The
85 * zvol_state_list is kept in ascending minor order so we simply need
86 * to scan the list for the first gap in the sequence. This allows us
87 * to recycle minor number as devices are created and removed.
90 zvol_find_minor(unsigned *minor)
95 ASSERT(MUTEX_HELD(&zvol_state_lock));
96 for (zv = list_head(&zvol_state_list); zv != NULL;
97 zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) {
98 if (MINOR(zv->zv_dev) != MINOR(*minor))
102 /* All minors are in use */
103 if (*minor >= (1 << MINORBITS))
104 return (SET_ERROR(ENXIO));
110 * Find a zvol_state_t given the full major+minor dev_t.
112 static zvol_state_t *
113 zvol_find_by_dev(dev_t dev)
117 ASSERT(MUTEX_HELD(&zvol_state_lock));
118 for (zv = list_head(&zvol_state_list); zv != NULL;
119 zv = list_next(&zvol_state_list, zv)) {
120 if (zv->zv_dev == dev)
128 * Find a zvol_state_t given the name provided at zvol_alloc() time.
130 static zvol_state_t *
131 zvol_find_by_name(const char *name)
135 ASSERT(MUTEX_HELD(&zvol_state_lock));
136 for (zv = list_head(&zvol_state_list); zv != NULL;
137 zv = list_next(&zvol_state_list, zv)) {
138 if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
147 * Given a path, return TRUE if path is a ZVOL.
150 zvol_is_zvol(const char *device)
152 struct block_device *bdev;
155 bdev = lookup_bdev(device);
159 major = MAJOR(bdev->bd_dev);
162 if (major == zvol_major)
169 * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
172 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
174 zfs_creat_t *zct = arg;
175 nvlist_t *nvprops = zct->zct_props;
177 uint64_t volblocksize, volsize;
179 VERIFY(nvlist_lookup_uint64(nvprops,
180 zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
181 if (nvlist_lookup_uint64(nvprops,
182 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
183 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
186 * These properties must be removed from the list so the generic
187 * property setting step won't apply to them.
189 VERIFY(nvlist_remove_all(nvprops,
190 zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
191 (void) nvlist_remove_all(nvprops,
192 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
194 error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
198 error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
202 error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
207 * ZFS_IOC_OBJSET_STATS entry point.
210 zvol_get_stats(objset_t *os, nvlist_t *nv)
213 dmu_object_info_t *doi;
216 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
218 return (SET_ERROR(error));
220 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
221 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
222 error = dmu_object_info(os, ZVOL_OBJ, doi);
225 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
226 doi->doi_data_block_size);
229 kmem_free(doi, sizeof (dmu_object_info_t));
231 return (SET_ERROR(error));
235 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
237 struct block_device *bdev;
239 bdev = bdget_disk(zv->zv_disk, 0);
244 * Added check_disk_size_change() helper function.
246 #ifdef HAVE_CHECK_DISK_SIZE_CHANGE
247 set_capacity(zv->zv_disk, volsize >> 9);
248 zv->zv_volsize = volsize;
249 check_disk_size_change(zv->zv_disk, bdev);
251 zv->zv_volsize = volsize;
253 (void) check_disk_change(bdev);
254 #endif /* HAVE_CHECK_DISK_SIZE_CHANGE */
260 * Sanity check volume size.
263 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
266 return (SET_ERROR(EINVAL));
268 if (volsize % blocksize != 0)
269 return (SET_ERROR(EINVAL));
272 if (volsize - 1 > MAXOFFSET_T)
273 return (SET_ERROR(EOVERFLOW));
279 * Ensure the zap is flushed then inform the VFS of the capacity change.
282 zvol_update_volsize(uint64_t volsize, objset_t *os)
287 ASSERT(MUTEX_HELD(&zvol_state_lock));
289 tx = dmu_tx_create(os);
290 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
291 error = dmu_tx_assign(tx, TXG_WAIT);
294 return (SET_ERROR(error));
297 error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
302 error = dmu_free_long_range(os,
303 ZVOL_OBJ, volsize, DMU_OBJECT_END);
309 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
311 zvol_size_changed(zv, volsize);
314 * We should post a event here describing the expansion. However,
315 * the zfs_ereport_post() interface doesn't nicely support posting
316 * events for zvols, it assumes events relate to vdevs or zios.
323 * Set ZFS_PROP_VOLSIZE set entry point.
326 zvol_set_volsize(const char *name, uint64_t volsize)
328 zvol_state_t *zv = NULL;
331 dmu_object_info_t *doi;
333 boolean_t owned = B_FALSE;
335 error = dsl_prop_get_integer(name,
336 zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
338 return (SET_ERROR(error));
340 return (SET_ERROR(EROFS));
342 mutex_enter(&zvol_state_lock);
343 zv = zvol_find_by_name(name);
345 if (zv == NULL || zv->zv_objset == NULL) {
346 if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
348 mutex_exit(&zvol_state_lock);
349 return (SET_ERROR(error));
358 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
360 if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
361 (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
364 error = zvol_update_volsize(volsize, os);
365 kmem_free(doi, sizeof (dmu_object_info_t));
367 if (error == 0 && zv != NULL)
368 error = zvol_update_live_volsize(zv, volsize);
371 dmu_objset_disown(os, FTAG);
373 zv->zv_objset = NULL;
375 mutex_exit(&zvol_state_lock);
380 * Sanity check volume block size.
383 zvol_check_volblocksize(const char *name, uint64_t volblocksize)
385 /* Record sizes above 128k need the feature to be enabled */
386 if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
390 if ((error = spa_open(name, &spa, FTAG)) != 0)
393 if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
394 spa_close(spa, FTAG);
395 return (SET_ERROR(ENOTSUP));
399 * We don't allow setting the property above 1MB,
400 * unless the tunable has been changed.
402 if (volblocksize > zfs_max_recordsize)
403 return (SET_ERROR(EDOM));
405 spa_close(spa, FTAG);
408 if (volblocksize < SPA_MINBLOCKSIZE ||
409 volblocksize > SPA_MAXBLOCKSIZE ||
411 return (SET_ERROR(EDOM));
417 * Set ZFS_PROP_VOLBLOCKSIZE set entry point.
420 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
426 mutex_enter(&zvol_state_lock);
428 zv = zvol_find_by_name(name);
430 error = SET_ERROR(ENXIO);
434 if (zv->zv_flags & ZVOL_RDONLY) {
435 error = SET_ERROR(EROFS);
439 tx = dmu_tx_create(zv->zv_objset);
440 dmu_tx_hold_bonus(tx, ZVOL_OBJ);
441 error = dmu_tx_assign(tx, TXG_WAIT);
445 error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
446 volblocksize, 0, tx);
447 if (error == ENOTSUP)
448 error = SET_ERROR(EBUSY);
451 zv->zv_volblocksize = volblocksize;
454 mutex_exit(&zvol_state_lock);
456 return (SET_ERROR(error));
460 * Replay a TX_WRITE ZIL transaction that didn't get committed
461 * after a system failure
464 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
466 objset_t *os = zv->zv_objset;
467 char *data = (char *)(lr + 1); /* data follows lr_write_t */
468 uint64_t off = lr->lr_offset;
469 uint64_t len = lr->lr_length;
474 byteswap_uint64_array(lr, sizeof (*lr));
476 tx = dmu_tx_create(os);
477 dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
478 error = dmu_tx_assign(tx, TXG_WAIT);
482 dmu_write(os, ZVOL_OBJ, off, len, data, tx);
486 return (SET_ERROR(error));
490 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
492 return (SET_ERROR(ENOTSUP));
496 * Callback vectors for replaying records.
497 * Only TX_WRITE is needed for zvol.
499 zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
500 (zil_replay_func_t)zvol_replay_err, /* no such transaction type */
501 (zil_replay_func_t)zvol_replay_err, /* TX_CREATE */
502 (zil_replay_func_t)zvol_replay_err, /* TX_MKDIR */
503 (zil_replay_func_t)zvol_replay_err, /* TX_MKXATTR */
504 (zil_replay_func_t)zvol_replay_err, /* TX_SYMLINK */
505 (zil_replay_func_t)zvol_replay_err, /* TX_REMOVE */
506 (zil_replay_func_t)zvol_replay_err, /* TX_RMDIR */
507 (zil_replay_func_t)zvol_replay_err, /* TX_LINK */
508 (zil_replay_func_t)zvol_replay_err, /* TX_RENAME */
509 (zil_replay_func_t)zvol_replay_write, /* TX_WRITE */
510 (zil_replay_func_t)zvol_replay_err, /* TX_TRUNCATE */
511 (zil_replay_func_t)zvol_replay_err, /* TX_SETATTR */
512 (zil_replay_func_t)zvol_replay_err, /* TX_ACL */
516 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
518 * We store data in the log buffers if it's small enough.
519 * Otherwise we will later flush the data out via dmu_sync().
521 ssize_t zvol_immediate_write_sz = 32768;
524 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
525 uint64_t size, int sync)
527 uint32_t blocksize = zv->zv_volblocksize;
528 zilog_t *zilog = zv->zv_zilog;
530 ssize_t immediate_write_sz;
532 if (zil_replaying(zilog, tx))
535 immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
536 ? 0 : zvol_immediate_write_sz;
537 slogging = spa_has_slogs(zilog->zl_spa) &&
538 (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
544 itx_wr_state_t write_state;
547 * Unlike zfs_log_write() we can be called with
548 * up to DMU_MAX_ACCESS/2 (5MB) writes.
550 if (blocksize > immediate_write_sz && !slogging &&
551 size >= blocksize && offset % blocksize == 0) {
552 write_state = WR_INDIRECT; /* uses dmu_sync */
555 write_state = WR_COPIED;
556 len = MIN(ZIL_MAX_LOG_DATA, size);
558 write_state = WR_NEED_COPY;
559 len = MIN(ZIL_MAX_LOG_DATA, size);
562 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
563 (write_state == WR_COPIED ? len : 0));
564 lr = (lr_write_t *)&itx->itx_lr;
565 if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
566 ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
567 zil_itx_destroy(itx);
568 itx = zil_itx_create(TX_WRITE, sizeof (*lr));
569 lr = (lr_write_t *)&itx->itx_lr;
570 write_state = WR_NEED_COPY;
573 itx->itx_wr_state = write_state;
574 if (write_state == WR_NEED_COPY)
576 lr->lr_foid = ZVOL_OBJ;
577 lr->lr_offset = offset;
580 BP_ZERO(&lr->lr_blkptr);
582 itx->itx_private = zv;
583 itx->itx_sync = sync;
585 (void) zil_itx_assign(zilog, itx, tx);
593 zvol_write(struct bio *bio)
595 zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
596 uint64_t offset = BIO_BI_SECTOR(bio) << 9;
597 uint64_t size = BIO_BI_SIZE(bio);
602 if (bio->bi_rw & VDEV_REQ_FLUSH)
603 zil_commit(zv->zv_zilog, ZVOL_OBJ);
606 * Some requests are just for flush and nothing else.
611 rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
613 tx = dmu_tx_create(zv->zv_objset);
614 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size);
616 /* This will only fail for ENOSPC */
617 error = dmu_tx_assign(tx, TXG_WAIT);
620 zfs_range_unlock(rl);
624 error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
626 zvol_log_write(zv, tx, offset, size,
627 !!(bio->bi_rw & VDEV_REQ_FUA));
630 zfs_range_unlock(rl);
632 if ((bio->bi_rw & VDEV_REQ_FUA) ||
633 zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
634 zil_commit(zv->zv_zilog, ZVOL_OBJ);
641 zvol_discard(struct bio *bio)
643 zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
644 uint64_t start = BIO_BI_SECTOR(bio) << 9;
645 uint64_t size = BIO_BI_SIZE(bio);
646 uint64_t end = start + size;
650 if (end > zv->zv_volsize)
651 return (SET_ERROR(EIO));
654 * Align the request to volume block boundaries when REQ_SECURE is
655 * available, but not requested. If we don't, then this will force
656 * dnode_free_range() to zero out the unaligned parts, which is slow
657 * (read-modify-write) and useless since we are not freeing any space
658 * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
659 * 2.6.35) will not receive this optimization.
662 if (!(bio->bi_rw & REQ_SECURE)) {
663 start = P2ROUNDUP(start, zv->zv_volblocksize);
664 end = P2ALIGN(end, zv->zv_volblocksize);
671 rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
673 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);
676 * TODO: maybe we should add the operation to the log.
679 zfs_range_unlock(rl);
685 zvol_read(struct bio *bio)
687 zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
688 uint64_t offset = BIO_BI_SECTOR(bio) << 9;
689 uint64_t len = BIO_BI_SIZE(bio);
697 rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
699 error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);
701 zfs_range_unlock(rl);
703 /* convert checksum errors into IO errors */
705 error = SET_ERROR(EIO);
710 static MAKE_REQUEST_FN_RET
711 zvol_request(struct request_queue *q, struct bio *bio)
713 zvol_state_t *zv = q->queuedata;
714 fstrans_cookie_t cookie = spl_fstrans_mark();
715 uint64_t offset = BIO_BI_SECTOR(bio);
716 unsigned int sectors = bio_sectors(bio);
717 int rw = bio_data_dir(bio);
718 #ifdef HAVE_GENERIC_IO_ACCT
719 unsigned long start = jiffies;
723 if (bio_has_data(bio) && offset + sectors >
724 get_capacity(zv->zv_disk)) {
726 "%s: bad access: block=%llu, count=%lu\n",
727 zv->zv_disk->disk_name,
728 (long long unsigned)offset,
729 (long unsigned)sectors);
730 error = SET_ERROR(EIO);
734 generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);
737 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
738 error = SET_ERROR(EROFS);
742 if (bio->bi_rw & VDEV_REQ_DISCARD) {
743 error = zvol_discard(bio);
747 error = zvol_write(bio);
749 error = zvol_read(bio);
752 generic_end_io_acct(rw, &zv->zv_disk->part0, start);
754 bio_endio(bio, -error);
755 spl_fstrans_unmark(cookie);
756 #ifdef HAVE_MAKE_REQUEST_FN_RET_INT
762 zvol_get_done(zgd_t *zgd, int error)
765 dmu_buf_rele(zgd->zgd_db, zgd);
767 zfs_range_unlock(zgd->zgd_rl);
769 if (error == 0 && zgd->zgd_bp)
770 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
772 kmem_free(zgd, sizeof (zgd_t));
776 * Get data to generate a TX_WRITE intent log record.
779 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
781 zvol_state_t *zv = arg;
782 objset_t *os = zv->zv_objset;
783 uint64_t object = ZVOL_OBJ;
784 uint64_t offset = lr->lr_offset;
785 uint64_t size = lr->lr_length;
786 blkptr_t *bp = &lr->lr_blkptr;
794 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
795 zgd->zgd_zilog = zv->zv_zilog;
796 zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
799 * Write records come in two flavors: immediate and indirect.
800 * For small writes it's cheaper to store the data with the
801 * log record (immediate); for large writes it's cheaper to
802 * sync the data and get a pointer to it (indirect) so that
803 * we don't have to write the data twice.
805 if (buf != NULL) { /* immediate write */
806 error = dmu_read(os, object, offset, size, buf,
807 DMU_READ_NO_PREFETCH);
809 size = zv->zv_volblocksize;
810 offset = P2ALIGN_TYPED(offset, size, uint64_t);
811 error = dmu_buf_hold(os, object, offset, zgd, &db,
812 DMU_READ_NO_PREFETCH);
814 blkptr_t *obp = dmu_buf_get_blkptr(db);
816 ASSERT(BP_IS_HOLE(bp));
821 zgd->zgd_bp = &lr->lr_blkptr;
824 ASSERT(db->db_offset == offset);
825 ASSERT(db->db_size == size);
827 error = dmu_sync(zio, lr->lr_common.lrc_txg,
835 zvol_get_done(zgd, error);
837 return (SET_ERROR(error));
841 * The zvol_state_t's are inserted in increasing MINOR(dev_t) order.
844 zvol_insert(zvol_state_t *zv_insert)
846 zvol_state_t *zv = NULL;
848 ASSERT(MUTEX_HELD(&zvol_state_lock));
849 ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
850 for (zv = list_head(&zvol_state_list); zv != NULL;
851 zv = list_next(&zvol_state_list, zv)) {
852 if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev))
856 list_insert_before(&zvol_state_list, zv, zv_insert);
860 * Simply remove the zvol from to list of zvols.
863 zvol_remove(zvol_state_t *zv_remove)
865 ASSERT(MUTEX_HELD(&zvol_state_lock));
866 list_remove(&zvol_state_list, zv_remove);
870 zvol_first_open(zvol_state_t *zv)
879 * In all other cases the spa_namespace_lock is taken before the
880 * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
881 * function calls fops->open() with the bdev->bd_mutex lock held.
883 * To avoid a potential lock inversion deadlock we preemptively
884 * try to take the spa_namespace_lock(). Normally it will not
885 * be contended and this is safe because spa_open_common() handles
886 * the case where the caller already holds the spa_namespace_lock.
888 * When it is contended we risk a lock inversion if we were to
889 * block waiting for the lock. Luckily, the __blkdev_get()
890 * function allows us to return -ERESTARTSYS which will result in
891 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
892 * called again. This process can be repeated safely until both
893 * locks are acquired.
895 if (!mutex_owned(&spa_namespace_lock)) {
896 locked = mutex_tryenter(&spa_namespace_lock);
898 return (-SET_ERROR(ERESTARTSYS));
901 error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
905 /* lie and say we're read-only */
906 error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
910 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
912 dmu_objset_disown(os, zvol_tag);
917 error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
919 dmu_objset_disown(os, zvol_tag);
923 set_capacity(zv->zv_disk, volsize >> 9);
924 zv->zv_volsize = volsize;
925 zv->zv_zilog = zil_open(os, zvol_get_data);
927 if (ro || dmu_objset_is_snapshot(os) ||
928 !spa_writeable(dmu_objset_spa(os))) {
929 set_disk_ro(zv->zv_disk, 1);
930 zv->zv_flags |= ZVOL_RDONLY;
932 set_disk_ro(zv->zv_disk, 0);
933 zv->zv_flags &= ~ZVOL_RDONLY;
938 mutex_exit(&spa_namespace_lock);
940 return (SET_ERROR(-error));
944 zvol_last_close(zvol_state_t *zv)
946 zil_close(zv->zv_zilog);
949 dmu_buf_rele(zv->zv_dbuf, zvol_tag);
955 if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
956 !(zv->zv_flags & ZVOL_RDONLY))
957 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
958 (void) dmu_objset_evict_dbufs(zv->zv_objset);
960 dmu_objset_disown(zv->zv_objset, zvol_tag);
961 zv->zv_objset = NULL;
965 zvol_open(struct block_device *bdev, fmode_t flag)
967 zvol_state_t *zv = bdev->bd_disk->private_data;
968 int error = 0, drop_mutex = 0;
971 * If the caller is already holding the mutex do not take it
972 * again, this will happen as part of zvol_create_minor().
973 * Once add_disk() is called the device is live and the kernel
974 * will attempt to open it to read the partition information.
976 if (!mutex_owned(&zvol_state_lock)) {
977 mutex_enter(&zvol_state_lock);
981 ASSERT3P(zv, !=, NULL);
983 if (zv->zv_open_count == 0) {
984 error = zvol_first_open(zv);
989 if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
997 if (zv->zv_open_count == 0)
1002 mutex_exit(&zvol_state_lock);
1004 check_disk_change(bdev);
1006 return (SET_ERROR(error));
1009 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
1014 zvol_release(struct gendisk *disk, fmode_t mode)
1016 zvol_state_t *zv = disk->private_data;
1019 if (!mutex_owned(&zvol_state_lock)) {
1020 mutex_enter(&zvol_state_lock);
1024 if (zv->zv_open_count > 0) {
1025 zv->zv_open_count--;
1026 if (zv->zv_open_count == 0)
1027 zvol_last_close(zv);
1031 mutex_exit(&zvol_state_lock);
1033 #ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
1039 zvol_ioctl(struct block_device *bdev, fmode_t mode,
1040 unsigned int cmd, unsigned long arg)
1042 zvol_state_t *zv = bdev->bd_disk->private_data;
1046 return (SET_ERROR(-ENXIO));
1050 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1053 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
1062 return (SET_ERROR(error));
#ifdef CONFIG_COMPAT
/*
 * 32-bit compatibility ioctl; the commands need no translation, so this
 * forwards straight to zvol_ioctl().
 */
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	int rc = zvol_ioctl(bdev, mode, cmd, arg);

	return (rc);
}
#else
#define	zvol_compat_ioctl	NULL
#endif
1076 static int zvol_media_changed(struct gendisk *disk)
1078 zvol_state_t *zv = disk->private_data;
1080 return (zv->zv_changed);
1083 static int zvol_revalidate_disk(struct gendisk *disk)
1085 zvol_state_t *zv = disk->private_data;
1088 set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
1094 * Provide a simple virtual geometry for legacy compatibility. For devices
1095 * smaller than 1 MiB a small head and sector count is used to allow very
1096 * tiny devices. For devices over 1 Mib a standard head and sector count
1097 * is used to keep the cylinders count reasonable.
1100 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1102 zvol_state_t *zv = bdev->bd_disk->private_data;
1103 sector_t sectors = get_capacity(zv->zv_disk);
1105 if (sectors > 2048) {
1114 geo->cylinders = sectors / (geo->heads * geo->sectors);
1119 static struct kobject *
1120 zvol_probe(dev_t dev, int *part, void *arg)
1123 struct kobject *kobj;
1125 mutex_enter(&zvol_state_lock);
1126 zv = zvol_find_by_dev(dev);
1127 kobj = zv ? get_disk(zv->zv_disk) : NULL;
1128 mutex_exit(&zvol_state_lock);
1133 #ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
1134 static struct block_device_operations zvol_ops = {
1136 .release = zvol_release,
1137 .ioctl = zvol_ioctl,
1138 .compat_ioctl = zvol_compat_ioctl,
1139 .media_changed = zvol_media_changed,
1140 .revalidate_disk = zvol_revalidate_disk,
1141 .getgeo = zvol_getgeo,
1142 .owner = THIS_MODULE,
1145 #else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
1148 zvol_open_by_inode(struct inode *inode, struct file *file)
1150 return (zvol_open(inode->i_bdev, file->f_mode));
1154 zvol_release_by_inode(struct inode *inode, struct file *file)
1156 return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
1160 zvol_ioctl_by_inode(struct inode *inode, struct file *file,
1161 unsigned int cmd, unsigned long arg)
1163 if (file == NULL || inode == NULL)
1164 return (SET_ERROR(-EINVAL));
1166 return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
1169 #ifdef CONFIG_COMPAT
1171 zvol_compat_ioctl_by_inode(struct file *file,
1172 unsigned int cmd, unsigned long arg)
1175 return (SET_ERROR(-EINVAL));
1177 return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
1178 file->f_mode, cmd, arg));
1181 #define zvol_compat_ioctl_by_inode NULL
1184 static struct block_device_operations zvol_ops = {
1185 .open = zvol_open_by_inode,
1186 .release = zvol_release_by_inode,
1187 .ioctl = zvol_ioctl_by_inode,
1188 .compat_ioctl = zvol_compat_ioctl_by_inode,
1189 .media_changed = zvol_media_changed,
1190 .revalidate_disk = zvol_revalidate_disk,
1191 .getgeo = zvol_getgeo,
1192 .owner = THIS_MODULE,
1194 #endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
1197 * Allocate memory for a new zvol_state_t and setup the required
1198 * request queue and generic disk structures for the block device.
1200 static zvol_state_t *
1201 zvol_alloc(dev_t dev, const char *name)
1205 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1207 spin_lock_init(&zv->zv_lock);
1208 list_link_init(&zv->zv_next);
1210 zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
1211 if (zv->zv_queue == NULL)
1214 blk_queue_make_request(zv->zv_queue, zvol_request);
1216 #ifdef HAVE_BLK_QUEUE_FLUSH
1217 blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
1219 blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL);
1220 #endif /* HAVE_BLK_QUEUE_FLUSH */
1222 zv->zv_disk = alloc_disk(ZVOL_MINORS);
1223 if (zv->zv_disk == NULL)
1226 zv->zv_queue->queuedata = zv;
1228 zv->zv_open_count = 0;
1229 strlcpy(zv->zv_name, name, MAXNAMELEN);
1231 mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
1232 avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
1233 sizeof (rl_t), offsetof(rl_t, r_node));
1234 zv->zv_znode.z_is_zvol = TRUE;
1236 zv->zv_disk->major = zvol_major;
1237 zv->zv_disk->first_minor = (dev & MINORMASK);
1238 zv->zv_disk->fops = &zvol_ops;
1239 zv->zv_disk->private_data = zv;
1240 zv->zv_disk->queue = zv->zv_queue;
1241 snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
1242 ZVOL_DEV_NAME, (dev & MINORMASK));
1247 blk_cleanup_queue(zv->zv_queue);
1249 kmem_free(zv, sizeof (zvol_state_t));
1255 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
1258 zvol_free(zvol_state_t *zv)
1260 avl_destroy(&zv->zv_znode.z_range_avl);
1261 mutex_destroy(&zv->zv_znode.z_range_lock);
1263 del_gendisk(zv->zv_disk);
1264 blk_cleanup_queue(zv->zv_queue);
1265 put_disk(zv->zv_disk);
1267 kmem_free(zv, sizeof (zvol_state_t));
1271 __zvol_snapdev_hidden(const char *name)
1278 parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1279 (void) strlcpy(parent, name, MAXPATHLEN);
1281 if ((atp = strrchr(parent, '@')) != NULL) {
1283 error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
1284 if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
1285 error = SET_ERROR(ENODEV);
1288 kmem_free(parent, MAXPATHLEN);
1290 return (SET_ERROR(error));
1294 __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
1298 dmu_object_info_t *doi;
1304 ASSERT(MUTEX_HELD(&zvol_state_lock));
1306 zv = zvol_find_by_name(name);
1308 error = SET_ERROR(EEXIST);
1312 if (ignore_snapdev == B_FALSE) {
1313 error = __zvol_snapdev_hidden(name);
1318 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1320 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
1324 error = dmu_object_info(os, ZVOL_OBJ, doi);
1326 goto out_dmu_objset_disown;
1328 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1330 goto out_dmu_objset_disown;
1332 error = zvol_find_minor(&minor);
1334 goto out_dmu_objset_disown;
1336 zv = zvol_alloc(MKDEV(zvol_major, minor), name);
1338 error = SET_ERROR(EAGAIN);
1339 goto out_dmu_objset_disown;
1342 if (dmu_objset_is_snapshot(os))
1343 zv->zv_flags |= ZVOL_RDONLY;
1345 zv->zv_volblocksize = doi->doi_data_block_size;
1346 zv->zv_volsize = volsize;
1349 set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
1351 blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
1352 blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
1353 blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
1354 blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
1355 blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
1356 blk_queue_max_discard_sectors(zv->zv_queue,
1357 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
1358 blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
1359 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
1360 #ifdef QUEUE_FLAG_NONROT
1361 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
1363 #ifdef QUEUE_FLAG_ADD_RANDOM
1364 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
1367 if (spa_writeable(dmu_objset_spa(os))) {
1368 if (zil_replay_disable)
1369 zil_destroy(dmu_objset_zil(os), B_FALSE);
1371 zil_replay(os, zv, zvol_replay_vector);
1375 * When udev detects the addition of the device it will immediately
1376 * invoke blkid(8) to determine the type of content on the device.
1377 * Prefetching the blocks commonly scanned by blkid(8) will speed
1380 len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
1382 dmu_prefetch(os, ZVOL_OBJ, 0, len);
1383 dmu_prefetch(os, ZVOL_OBJ, volsize - len, len);
1386 zv->zv_objset = NULL;
1387 out_dmu_objset_disown:
1388 dmu_objset_disown(os, zvol_tag);
1390 kmem_free(doi, sizeof (dmu_object_info_t));
1395 add_disk(zv->zv_disk);
1398 return (SET_ERROR(error));
1402 * Create a block device minor node and setup the linkage between it
1403 * and the specified volume. Once this function returns the block
1404 * device is live and ready for use.
1407 zvol_create_minor(const char *name)
1411 mutex_enter(&zvol_state_lock);
1412 error = __zvol_create_minor(name, B_FALSE);
1413 mutex_exit(&zvol_state_lock);
1415 return (SET_ERROR(error));
1419 __zvol_remove_minor(const char *name)
1423 ASSERT(MUTEX_HELD(&zvol_state_lock));
1425 zv = zvol_find_by_name(name);
1427 return (SET_ERROR(ENXIO));
1429 if (zv->zv_open_count > 0)
1430 return (SET_ERROR(EBUSY));
1439 * Remove a block device minor node for the specified volume.
1442 zvol_remove_minor(const char *name)
1446 mutex_enter(&zvol_state_lock);
1447 error = __zvol_remove_minor(name);
1448 mutex_exit(&zvol_state_lock);
1450 return (SET_ERROR(error));
1454 * Rename a block device minor mode for the specified volume.
1457 __zvol_rename_minor(zvol_state_t *zv, const char *newname)
1459 int readonly = get_disk_ro(zv->zv_disk);
1461 ASSERT(MUTEX_HELD(&zvol_state_lock));
1463 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1466 * The block device's read-only state is briefly changed causing
1467 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
1468 * the name change and fixes the symlinks. This does not change
1469 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1470 * changes. This would normally be done using kobject_uevent() but
1471 * that is a GPL-only symbol which is why we need this workaround.
1473 set_disk_ro(zv->zv_disk, !readonly);
1474 set_disk_ro(zv->zv_disk, readonly);
/*
 * dmu_objset_find() callback: best-effort minor creation for one dataset.
 * Failures are deliberately ignored so the walk continues.
 */
static int
zvol_create_minors_cb(const char *dsname, void *arg)
{
	(void) zvol_create_minor(dsname);

	return (0);
}
1486 * Create minors for specified dataset including children and snapshots.
1489 zvol_create_minors(const char *name)
1493 if (!zvol_inhibit_dev)
1494 error = dmu_objset_find((char *)name, zvol_create_minors_cb,
1495 NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1497 return (SET_ERROR(error));
1501 * Remove minors for specified dataset including children and snapshots.
1504 zvol_remove_minors(const char *name)
1506 zvol_state_t *zv, *zv_next;
1507 int namelen = ((name) ? strlen(name) : 0);
1509 if (zvol_inhibit_dev)
1512 mutex_enter(&zvol_state_lock);
1514 for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1515 zv_next = list_next(&zvol_state_list, zv);
1517 if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
1518 (strncmp(zv->zv_name, name, namelen) == 0 &&
1519 zv->zv_name[namelen] == '/')) {
1525 mutex_exit(&zvol_state_lock);
1529 * Rename minors for specified dataset including children and snapshots.
1532 zvol_rename_minors(const char *oldname, const char *newname)
1534 zvol_state_t *zv, *zv_next;
1535 int oldnamelen, newnamelen;
1538 if (zvol_inhibit_dev)
1541 oldnamelen = strlen(oldname);
1542 newnamelen = strlen(newname);
1543 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1545 mutex_enter(&zvol_state_lock);
1547 for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1548 zv_next = list_next(&zvol_state_list, zv);
1550 if (strcmp(zv->zv_name, oldname) == 0) {
1551 __zvol_rename_minor(zv, newname);
1552 } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
1553 (zv->zv_name[oldnamelen] == '/' ||
1554 zv->zv_name[oldnamelen] == '@')) {
1555 snprintf(name, MAXNAMELEN, "%s%c%s", newname,
1556 zv->zv_name[oldnamelen],
1557 zv->zv_name + oldnamelen + 1);
1558 __zvol_rename_minor(zv, name);
1562 mutex_exit(&zvol_state_lock);
1564 kmem_free(name, MAXNAMELEN);
1568 snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
1569 uint64_t snapdev = *(uint64_t *) arg;
1571 if (strchr(dsname, '@') == NULL)
1575 case ZFS_SNAPDEV_VISIBLE:
1576 mutex_enter(&zvol_state_lock);
1577 (void) __zvol_create_minor(dsname, B_TRUE);
1578 mutex_exit(&zvol_state_lock);
1580 case ZFS_SNAPDEV_HIDDEN:
1581 (void) zvol_remove_minor(dsname);
1589 zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
1590 (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
1591 &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
1592 /* caller should continue to modify snapdev property */
1601 list_create(&zvol_state_list, sizeof (zvol_state_t),
1602 offsetof(zvol_state_t, zv_next));
1604 mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
1606 error = register_blkdev(zvol_major, ZVOL_DRIVER);
1608 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1612 blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
1613 THIS_MODULE, zvol_probe, NULL, NULL);
1618 mutex_destroy(&zvol_state_lock);
1619 list_destroy(&zvol_state_list);
1621 return (SET_ERROR(error));
1627 zvol_remove_minors(NULL);
1628 blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
1629 unregister_blkdev(zvol_major, ZVOL_DRIVER);
1630 mutex_destroy(&zvol_state_lock);
1631 list_destroy(&zvol_state_list);
1634 module_param(zvol_inhibit_dev, uint, 0644);
1635 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
1637 module_param(zvol_major, uint, 0444);
1638 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1640 module_param(zvol_max_discard_blocks, ulong, 0444);
1641 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1643 module_param(zvol_prefetch_bytes, uint, 0644);
1644 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");