/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 * On FreeBSD, zvols are GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character
 * devices (volmode=dev).
 */
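/*
 * Illustrative example (not part of the original comment): a volume
 * created with
 *
 *	# zfs create -V 10G tank/vol0
 *
 * shows up as /dev/zvol/tank/vol0 (a GEOM provider by default, per the
 * volmode property) and can be used like any other disk, e.g.
 * newfs /dev/zvol/tank/vol0.
 */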

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define ZVOL_DUMPSIZE           "dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define ZVOL_RW_READER          RW_WRITER
#define ZVOL_RW_READ_HELD       RW_WRITE_HELD
#else
#define ZVOL_RW_READER          RW_READER
#define ZVOL_RW_READ_HELD       RW_READ_HELD
#endif

enum zvol_geom_state {
        ZVOL_GEOM_UNINIT,
        ZVOL_GEOM_STOPPED,
        ZVOL_GEOM_RUNNING,
};

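/*
 * Per-OS zvol state.  zso_volmode (ZFS_VOLMODE_GEOM or ZFS_VOLMODE_DEV)
 * determines which member of the union below is in use.
 */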
struct zvol_state_os {
        int zso_volmode;
#define zso_dev         _zso_state._zso_dev
#define zso_geom        _zso_state._zso_geom
        union {
                /* volmode=dev */
                struct zvol_state_dev {
                        struct cdev *zsd_cdev;
                        uint64_t zsd_sync_cnt;
                } _zso_dev;

                /* volmode=geom */
                struct zvol_state_geom {
                        struct g_provider *zsg_provider;
                        struct bio_queue_head zsg_queue;
                        struct mtx zsg_queue_mtx;
                        enum zvol_geom_state zsg_state;
                } _zso_geom;
        } _zso_state;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
        "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
        "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
        &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t         zvol_cdev_open;
static d_close_t        zvol_cdev_close;
static d_ioctl_t        zvol_cdev_ioctl;
static d_read_t         zvol_cdev_read;
static d_write_t        zvol_cdev_write;
static d_strategy_t     zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
        .d_name =       "zvol",
        .d_version =    D_VERSION,
        .d_flags =      D_DISK | D_TRACKCLOSE,
        .d_open =       zvol_cdev_open,
        .d_close =      zvol_cdev_close,
        .d_ioctl =      zvol_cdev_ioctl,
        .d_read =       zvol_cdev_read,
        .d_write =      zvol_cdev_write,
        .d_strategy =   zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
        .name = "ZFS::ZVOL",
        .version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t  zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
        zvol_state_t *zv;
        int err = 0;
        boolean_t drop_suspend = B_TRUE;
        boolean_t drop_namespace = B_FALSE;

        if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
                /*
                 * If zfs_geom_probe_vdev_key is set, ZFS is attempting to
                 * probe GEOM providers while looking for a replacement for
                 * a missing VDEV.  In this case the spa_namespace_lock will
                 * not be held, but it is still illegal to use a zvol as a
                 * vdev.  Deadlocks can result if another thread already
                 * holds the spa_namespace_lock.
                 */
                return (SET_ERROR(EOPNOTSUPP));
        }

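        /*
         * Taken again after dropping zvol_state_lock to block on the
         * spa_namespace_lock below; the zvol may have been removed in
         * the meantime, so all state is re-checked from scratch.
         */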
retry:
        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        zv = pp->private;
        if (zv == NULL) {
                if (drop_namespace)
                        mutex_exit(&spa_namespace_lock);
                rw_exit(&zvol_state_lock);
                return (SET_ERROR(ENXIO));
        }

        if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
                /*
                 * We need to guarantee that the namespace lock is held
                 * to avoid spurious failures in zvol_first_open
                 */
                drop_namespace = B_TRUE;
                if (!mutex_tryenter(&spa_namespace_lock)) {
                        rw_exit(&zvol_state_lock);
                        mutex_enter(&spa_namespace_lock);
                        goto retry;
                }
        }
        mutex_enter(&zv->zv_state_lock);

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

        /*
         * make sure zvol is not suspended during first open
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock
         */
        if (zv->zv_open_count == 0) {
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* check to see if zv_suspend_lock is needed */
                        if (zv->zv_open_count != 0) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        if (zv->zv_open_count == 0) {
                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
                err = zvol_first_open(zv, !(flag & FWRITE));
                if (err)
                        goto out_mutex;
                pp->mediasize = zv->zv_volsize;
                pp->stripeoffset = 0;
                pp->stripesize = zv->zv_volblocksize;
        }

        /*
         * Check for a bad on-disk format version now since we
         * lied about owning the dataset readonly before.
         */
        if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
            dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
                err = EROFS;
                goto out_open_count;
        }
        if (zv->zv_flags & ZVOL_EXCL) {
                err = EBUSY;
                goto out_open_count;
        }
#ifdef FEXCL
        if (flag & FEXCL) {
                if (zv->zv_open_count != 0) {
                        err = EBUSY;
                        goto out_open_count;
                }
                zv->zv_flags |= ZVOL_EXCL;
        }
#endif

        zv->zv_open_count += count;
        if (drop_namespace)
                mutex_exit(&spa_namespace_lock);
        mutex_exit(&zv->zv_state_lock);
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (0);

out_open_count:
        if (zv->zv_open_count == 0)
                zvol_last_close(zv);
out_mutex:
        if (drop_namespace)
                mutex_exit(&spa_namespace_lock);
        mutex_exit(&zv->zv_state_lock);
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (SET_ERROR(err));
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
        zvol_state_t *zv;
        boolean_t drop_suspend = B_TRUE;

        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        zv = pp->private;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                return (SET_ERROR(ENXIO));
        }

        mutex_enter(&zv->zv_state_lock);
        if (zv->zv_flags & ZVOL_EXCL) {
                ASSERT(zv->zv_open_count == 1);
                zv->zv_flags &= ~ZVOL_EXCL;
        }

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

        /*
         * If the open count is zero, this is a spurious close.
         * That indicates a bug in the kernel / DDI framework.
         */
        ASSERT(zv->zv_open_count > 0);

        /*
         * make sure zvol is not suspended during last close
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock
         */
        if ((zv->zv_open_count - count) == 0) {
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* check to see if zv_suspend_lock is needed */
                        if (zv->zv_open_count != 1) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /*
         * You may get multiple opens, but only one close.
         */
        zv->zv_open_count -= count;

        if (zv->zv_open_count == 0) {
                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
                zvol_last_close(zv);
        }

        mutex_exit(&zv->zv_state_lock);

        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        struct g_provider *pp = zsg->zsg_provider;

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

        g_error_provider(pp, 0);

        kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
            "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        struct g_provider *pp = zsg->zsg_provider;

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

        g_topology_assert();

        mutex_enter(&zv->zv_state_lock);
        VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
        mutex_exit(&zv->zv_state_lock);
        zsg->zsg_provider = NULL;
        pp->private = NULL;
        g_wither_geom(pp->geom, ENXIO);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
        int count, error, flags;

        g_topology_assert();

        /*
         * To make it easier we expect either open or close, but not both
         * at the same time.
         */
        KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
            (acr <= 0 && acw <= 0 && ace <= 0),
            ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
            pp->name, acr, acw, ace));

        if (pp->private == NULL) {
                if (acr <= 0 && acw <= 0 && ace <= 0)
                        return (0);
                return (pp->error);
        }

        /*
         * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
         * ace != 0, because GEOM already handles that and handles it a bit
         * differently. GEOM allows for multiple read/exclusive consumers and
         * ZFS allows only one exclusive consumer, no matter if it is reader or
         * writer. I like better the way GEOM works so I'll leave it for GEOM
         * to decide what to do.
         */

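        /*
         * acr/acw/ace are deltas to the provider's access counts, so a
         * single read/write open arrives as acr=1, acw=1, ace=0: count
         * below is 2 and zv_open_count is bumped by 2.  The matching
         * close passes the same deltas negated.
         */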
        count = acr + acw + ace;
        if (count == 0)
                return (0);

        flags = 0;
        if (acr != 0 || ace != 0)
                flags |= FREAD;
        if (acw != 0)
                flags |= FWRITE;

        g_topology_unlock();
        if (count > 0)
                error = zvol_geom_open(pp, flags, count);
        else
                error = zvol_geom_close(pp, flags, -count);
        g_topology_lock();
        return (error);
}

static void
zvol_geom_worker(void *arg)
{
        zvol_state_t *zv = arg;
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        struct bio *bp;

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

        thread_lock(curthread);
        sched_prio(curthread, PRIBIO);
        thread_unlock(curthread);

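        /*
         * Service queued bios until zvol_clear_private() marks the state
         * ZVOL_GEOM_STOPPED and the queue drains; acknowledge the stop by
         * flipping the state back to ZVOL_GEOM_RUNNING, waking the waiter
         * and exiting.
         */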
        for (;;) {
                mtx_lock(&zsg->zsg_queue_mtx);
                bp = bioq_takefirst(&zsg->zsg_queue);
                if (bp == NULL) {
                        if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
                                zsg->zsg_state = ZVOL_GEOM_RUNNING;
                                wakeup(&zsg->zsg_state);
                                mtx_unlock(&zsg->zsg_queue_mtx);
                                kthread_exit();
                        }
                        msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
                            PRIBIO | PDROP, "zvol:io", 0);
                        continue;
                }
                mtx_unlock(&zsg->zsg_queue_mtx);
                zvol_geom_bio_strategy(bp);
        }
}

static void
zvol_geom_bio_start(struct bio *bp)
{
        zvol_state_t *zv = bp->bio_to->private;
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        boolean_t first;

        if (bp->bio_cmd == BIO_GETATTR) {
                if (zvol_geom_bio_getattr(bp))
                        g_io_deliver(bp, EOPNOTSUPP);
                return;
        }

        if (!THREAD_CAN_SLEEP()) {
                mtx_lock(&zsg->zsg_queue_mtx);
                first = (bioq_first(&zsg->zsg_queue) == NULL);
                bioq_insert_tail(&zsg->zsg_queue, bp);
                mtx_unlock(&zsg->zsg_queue_mtx);
                if (first)
                        wakeup_one(&zsg->zsg_queue);
                return;
        }

        zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
        zvol_state_t *zv;

        zv = bp->bio_to->private;
        ASSERT(zv != NULL);

        spa_t *spa = dmu_objset_spa(zv->zv_objset);
        uint64_t refd, avail, usedobjs, availobjs;

        if (g_handleattr_int(bp, "GEOM::candelete", 1))
                return (0);
        if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
                dmu_objset_space(zv->zv_objset, &refd, &avail,
                    &usedobjs, &availobjs);
                if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
                        return (0);
        } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
                dmu_objset_space(zv->zv_objset, &refd, &avail,
                    &usedobjs, &availobjs);
                if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
                        return (0);
        } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
                avail = metaslab_class_get_space(spa_normal_class(spa));
                avail -= metaslab_class_get_alloc(spa_normal_class(spa));
                if (g_handleattr_off_t(bp, "poolblocksavail",
                    avail / DEV_BSIZE))
                        return (0);
        } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
                refd = metaslab_class_get_alloc(spa_normal_class(spa));
                if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
                        return (0);
        }
        return (1);
}

static void
zvol_geom_bio_strategy(struct bio *bp)
{
        zvol_state_t *zv;
        uint64_t off, volsize;
        size_t resid;
        char *addr;
        objset_t *os;
        zfs_locked_range_t *lr;
        int error = 0;
        boolean_t doread = B_FALSE;
        boolean_t is_dumpified;
        boolean_t sync;

        if (bp->bio_to)
                zv = bp->bio_to->private;
        else
                zv = bp->bio_dev->si_drv2;

        if (zv == NULL) {
                error = SET_ERROR(ENXIO);
                goto out;
        }

        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

        switch (bp->bio_cmd) {
        case BIO_READ:
                doread = B_TRUE;
                break;
        case BIO_WRITE:
        case BIO_FLUSH:
        case BIO_DELETE:
                if (zv->zv_flags & ZVOL_RDONLY) {
                        error = SET_ERROR(EROFS);
                        goto resume;
                }
                zvol_ensure_zilog(zv);
                if (bp->bio_cmd == BIO_FLUSH)
                        goto sync;
                break;
        default:
                error = EOPNOTSUPP;
                goto resume;
        }

        off = bp->bio_offset;
        volsize = zv->zv_volsize;

        os = zv->zv_objset;
        ASSERT(os != NULL);

        addr = bp->bio_data;
        resid = bp->bio_length;

        if (resid > 0 && off >= volsize) {
                error = SET_ERROR(EIO);
                goto resume;
        }

        is_dumpified = B_FALSE;
        sync = !doread && !is_dumpified &&
            zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

        /*
         * There must be no buffer changes when doing a dmu_sync() because
         * we can't change the data whilst calculating the checksum.
         */
        lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
            doread ? RL_READER : RL_WRITER);

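        /*
         * BIO_DELETE (TRIM/UNMAP): log the truncate to the ZIL, then
         * punch a hole in the backing object with dmu_free_long_range().
         */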
        if (bp->bio_cmd == BIO_DELETE) {
                dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error != 0) {
                        dmu_tx_abort(tx);
                } else {
                        zvol_log_truncate(zv, tx, off, resid, sync);
                        dmu_tx_commit(tx);
                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
                            off, resid);
                        resid = 0;
                }
                goto unlock;
        }
        while (resid != 0 && off < volsize) {
                size_t size = MIN(resid, zvol_maxphys);
                if (doread) {
                        error = dmu_read(os, ZVOL_OBJ, off, size, addr,
                            DMU_READ_PREFETCH);
                } else {
                        dmu_tx_t *tx = dmu_tx_create(os);
                        dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
                        error = dmu_tx_assign(tx, TXG_WAIT);
                        if (error) {
                                dmu_tx_abort(tx);
                        } else {
                                dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
                                zvol_log_write(zv, tx, off, size, sync);
                                dmu_tx_commit(tx);
                        }
                }
                if (error) {
                        /* convert checksum errors into IO errors */
                        if (error == ECKSUM)
                                error = SET_ERROR(EIO);
                        break;
                }
                off += size;
                addr += size;
                resid -= size;
        }
unlock:
        zfs_rangelock_exit(lr);

        bp->bio_completed = bp->bio_length - resid;
        if (bp->bio_completed < bp->bio_length && off > volsize)
                error = EINVAL;

        switch (bp->bio_cmd) {
        case BIO_FLUSH:
                break;
        case BIO_READ:
                dataset_kstats_update_read_kstats(&zv->zv_kstat,
                    bp->bio_completed);
                break;
        case BIO_WRITE:
                dataset_kstats_update_write_kstats(&zv->zv_kstat,
                    bp->bio_completed);
                break;
        case BIO_DELETE:
                break;
        default:
                break;
        }

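        /*
         * Note that BIO_FLUSH jumps straight to the sync: label inside
         * this block, bypassing the if (sync) test.
         */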
        if (sync) {
sync:
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
        }
resume:
        rw_exit(&zv->zv_suspend_lock);
out:
        if (bp->bio_to)
                g_io_deliver(bp, error);
        else
                biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
        zvol_state_t *zv;
        uint64_t volsize;
        zfs_locked_range_t *lr;
        int error = 0;

        zv = dev->si_drv2;

        volsize = zv->zv_volsize;
        /*
         * uio_loffset == volsize isn't an error as
         * it's required for EOF processing.
         */
        if (uio->uio_resid > 0 &&
            (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
                return (SET_ERROR(EIO));

        lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
            uio->uio_resid, RL_READER);
        while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
                uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

                /* don't read past the end */
                if (bytes > volsize - uio->uio_loffset)
                        bytes = volsize - uio->uio_loffset;

                error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
                if (error) {
                        /* convert checksum errors into IO errors */
                        if (error == ECKSUM)
                                error = SET_ERROR(EIO);
                        break;
                }
        }
        zfs_rangelock_exit(lr);

        return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
        zvol_state_t *zv;
        uint64_t volsize;
        zfs_locked_range_t *lr;
        int error = 0;
        boolean_t sync;

        zv = dev->si_drv2;

        volsize = zv->zv_volsize;

        if (uio->uio_resid > 0 &&
            (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
                return (SET_ERROR(EIO));

        sync = (ioflag & IO_SYNC) ||
            (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
        zvol_ensure_zilog(zv);

        lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
            uio->uio_resid, RL_WRITER);
        while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
                uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
                uint64_t off = uio->uio_loffset;
                dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

                if (bytes > volsize - off)      /* don't write past the end */
                        bytes = volsize - off;

                dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error) {
                        dmu_tx_abort(tx);
                        break;
                }
                error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
                if (error == 0)
                        zvol_log_write(zv, tx, off, bytes, sync);
                dmu_tx_commit(tx);

                if (error)
                        break;
        }
        zfs_rangelock_exit(lr);
        if (sync)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
        rw_exit(&zv->zv_suspend_lock);
        return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
        zvol_state_t *zv;
        struct zvol_state_dev *zsd;
        int err = 0;
        boolean_t drop_suspend = B_TRUE;

        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        zv = dev->si_drv2;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                return (SET_ERROR(ENXIO));
        }

        mutex_enter(&zv->zv_state_lock);

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

        /*
         * make sure zvol is not suspended during first open
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock
         */
        if (zv->zv_open_count == 0) {
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* check to see if zv_suspend_lock is needed */
                        if (zv->zv_open_count != 0) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        if (zv->zv_open_count == 0) {
                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
                err = zvol_first_open(zv, !(flags & FWRITE));
                if (err)
                        goto out_locked;
        }

        if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
                err = EROFS;
                goto out_opened;
        }
        if (zv->zv_flags & ZVOL_EXCL) {
                err = EBUSY;
                goto out_opened;
        }
#ifdef FEXCL
        if (flags & FEXCL) {
                if (zv->zv_open_count != 0) {
                        err = EBUSY;
                        goto out_opened;
                }
                zv->zv_flags |= ZVOL_EXCL;
        }
#endif

        zv->zv_open_count++;
        if (flags & (FSYNC | FDSYNC)) {
                zsd = &zv->zv_zso->zso_dev;
                zsd->zsd_sync_cnt++;
                if (zsd->zsd_sync_cnt == 1)
                        zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
        }

        mutex_exit(&zv->zv_state_lock);
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (0);

out_opened:
        if (zv->zv_open_count == 0)
                zvol_last_close(zv);
out_locked:
        mutex_exit(&zv->zv_state_lock);
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (SET_ERROR(err));
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
        zvol_state_t *zv;
        struct zvol_state_dev *zsd;
        boolean_t drop_suspend = B_TRUE;

        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        zv = dev->si_drv2;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                return (SET_ERROR(ENXIO));
        }

        mutex_enter(&zv->zv_state_lock);
        if (zv->zv_flags & ZVOL_EXCL) {
                ASSERT(zv->zv_open_count == 1);
                zv->zv_flags &= ~ZVOL_EXCL;
        }

        ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

        /*
         * If the open count is zero, this is a spurious close.
         * That indicates a bug in the kernel / DDI framework.
         */
        ASSERT(zv->zv_open_count > 0);
        /*
         * make sure zvol is not suspended during last close
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock
         */
        if (zv->zv_open_count == 1) {
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* check to see if zv_suspend_lock is needed */
                        if (zv->zv_open_count != 1) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /*
         * You may get multiple opens, but only one close.
         */
        zv->zv_open_count--;
        if (flags & (FSYNC | FDSYNC)) {
                zsd = &zv->zv_zso->zso_dev;
                zsd->zsd_sync_cnt--;
        }

        if (zv->zv_open_count == 0) {
                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
                zvol_last_close(zv);
        }

        mutex_exit(&zv->zv_state_lock);

        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
        zvol_state_t *zv;
        zfs_locked_range_t *lr;
        off_t offset, length;
        int i, error;
        boolean_t sync;

        zv = dev->si_drv2;

        error = 0;
        KASSERT(zv->zv_open_count > 0,
            ("Device with zero access count in %s", __func__));

        i = IOCPARM_LEN(cmd);
        switch (cmd) {
        case DIOCGSECTORSIZE:
                *(uint32_t *)data = DEV_BSIZE;
                break;
        case DIOCGMEDIASIZE:
                *(off_t *)data = zv->zv_volsize;
                break;
        case DIOCGFLUSH:
                rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                if (zv->zv_zilog != NULL)
                        zil_commit(zv->zv_zilog, ZVOL_OBJ);
                rw_exit(&zv->zv_suspend_lock);
                break;
        case DIOCGDELETE:
                if (!zvol_unmap_enabled)
                        break;

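                /*
                 * DIOCGDELETE passes an off_t[2]: the byte offset and
                 * length of the range to free.
                 */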
                offset = ((off_t *)data)[0];
                length = ((off_t *)data)[1];
                if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
                    offset < 0 || offset >= zv->zv_volsize ||
                    length <= 0) {
                        printf("%s: offset=%jd length=%jd\n", __func__, offset,
                            length);
                        error = EINVAL;
                        break;
                }
                rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                zvol_ensure_zilog(zv);
                lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
                    RL_WRITER);
                dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error != 0) {
                        sync = FALSE;
                        dmu_tx_abort(tx);
                } else {
                        sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
                        zvol_log_truncate(zv, tx, offset, length, sync);
                        dmu_tx_commit(tx);
                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
                            offset, length);
                }
                zfs_rangelock_exit(lr);
                if (sync)
                        zil_commit(zv->zv_zilog, ZVOL_OBJ);
                rw_exit(&zv->zv_suspend_lock);
                break;
        case DIOCGSTRIPESIZE:
                *(off_t *)data = zv->zv_volblocksize;
                break;
        case DIOCGSTRIPEOFFSET:
                *(off_t *)data = 0;
                break;
        case DIOCGATTR: {
                spa_t *spa = dmu_objset_spa(zv->zv_objset);
                struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
                uint64_t refd, avail, usedobjs, availobjs;

                if (strcmp(arg->name, "GEOM::candelete") == 0)
                        arg->value.i = 1;
                else if (strcmp(arg->name, "blocksavail") == 0) {
                        dmu_objset_space(zv->zv_objset, &refd, &avail,
                            &usedobjs, &availobjs);
                        arg->value.off = avail / DEV_BSIZE;
                } else if (strcmp(arg->name, "blocksused") == 0) {
                        dmu_objset_space(zv->zv_objset, &refd, &avail,
                            &usedobjs, &availobjs);
                        arg->value.off = refd / DEV_BSIZE;
                } else if (strcmp(arg->name, "poolblocksavail") == 0) {
                        avail = metaslab_class_get_space(spa_normal_class(spa));
                        avail -= metaslab_class_get_alloc(
                            spa_normal_class(spa));
                        arg->value.off = avail / DEV_BSIZE;
                } else if (strcmp(arg->name, "poolblocksused") == 0) {
                        refd = metaslab_class_get_alloc(spa_normal_class(spa));
                        arg->value.off = refd / DEV_BSIZE;
                } else
                        error = ENOIOCTL;
                break;
        }
        case FIOSEEKHOLE:
        case FIOSEEKDATA: {
                off_t *off = (off_t *)data;
                uint64_t noff;
                boolean_t hole;

                hole = (cmd == FIOSEEKHOLE);
                noff = *off;
                error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
                *off = noff;
                break;
        }
        default:
                error = ENOIOCTL;
        }

        return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
        ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

        /*
         * Open a ZIL if this is the first time we have written to this
         * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
         * than zv_state_lock so that we don't need to acquire an
         * additional lock in this path.
         */
        if (zv->zv_zilog == NULL) {
                if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
                        rw_exit(&zv->zv_suspend_lock);
                        rw_enter(&zv->zv_suspend_lock, RW_WRITER);
                }
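                /*
                 * Re-check under the writer lock: another thread may have
                 * opened the ZIL while zv_suspend_lock was dropped above.
                 */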
                if (zv->zv_zilog == NULL) {
                        zv->zv_zilog = zil_open(zv->zv_objset,
                            zvol_get_data);
                        zv->zv_flags |= ZVOL_WRITTEN_TO;
                }
                rw_downgrade(&zv->zv_suspend_lock);
        }
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
        return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
        ASSERT(RW_LOCK_HELD(&zvol_state_lock));
        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /* move to new hashtable entry */
        zv->zv_hash = zvol_name_hash(zv->zv_name);
        hlist_del(&zv->zv_hlink);
        hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

        if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp = zsg->zsg_provider;
                struct g_geom *gp;

                g_topology_lock();
                gp = pp->geom;
                ASSERT(gp != NULL);

                zsg->zsg_provider = NULL;
                g_wither_provider(pp, ENXIO);

                pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
                pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
                pp->sectorsize = DEV_BSIZE;
                pp->mediasize = zv->zv_volsize;
                pp->private = zv;
                zsg->zsg_provider = pp;
                g_error_provider(pp, 0);
                g_topology_unlock();
        } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev;
                struct make_dev_args args;

                dev = zsd->zsd_cdev;
                if (dev != NULL) {
                        destroy_dev(dev);
                        dev = zsd->zsd_cdev = NULL;
                        if (zv->zv_open_count > 0) {
                                zv->zv_flags &= ~ZVOL_EXCL;
                                zv->zv_open_count = 0;
                                /* XXX  need suspend lock but lock order */
                                zvol_last_close(zv);
                        }
                }

                make_dev_args_init(&args);
                args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
                args.mda_devsw = &zvol_cdevsw;
                args.mda_cr = NULL;
                args.mda_uid = UID_ROOT;
                args.mda_gid = GID_OPERATOR;
                args.mda_mode = 0640;
                args.mda_si_drv2 = zv;
                if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
                    == 0) {
                        dev->si_iosize_max = MAXPHYS;
                        zsd->zsd_cdev = dev;
                }
        }
        strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
        ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
        ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
        ASSERT(zv->zv_open_count == 0);

        ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

        rw_destroy(&zv->zv_suspend_lock);
        zfs_rangelock_fini(&zv->zv_rangelock);

        if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;

                g_topology_lock();
                zvol_geom_destroy(zv);
                g_topology_unlock();
                mtx_destroy(&zsg->zsg_queue_mtx);
        } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev = zsd->zsd_cdev;

                if (dev != NULL)
                        destroy_dev(dev);
        }

        mutex_destroy(&zv->zv_state_lock);
        dataset_kstats_destroy(&zv->zv_kstat);
        kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
        kmem_free(zv, sizeof (zvol_state_t));
        zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
        zvol_state_t *zv;
        objset_t *os;
        dmu_object_info_t *doi;
        uint64_t volsize;
        uint64_t volmode, hash;
        int error;

        ZFS_LOG(1, "Creating ZVOL %s...", name);

        hash = zvol_name_hash(name);
        if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
                ASSERT(MUTEX_HELD(&zv->zv_state_lock));
                mutex_exit(&zv->zv_state_lock);
                return (SET_ERROR(EEXIST));
        }

        DROP_GIANT();
        /* lie and say we're read-only */
        error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
        doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

        if (error)
                goto out_doi;

        error = dmu_object_info(os, ZVOL_OBJ, doi);
        if (error)
                goto out_dmu_objset_disown;

        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
        if (error)
                goto out_dmu_objset_disown;

        error = dsl_prop_get_integer(name,
            zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
        if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
                volmode = zvol_volmode;
        /*
         * zvol_alloc equivalent ...
         */
        zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
        zv->zv_hash = hash;
        mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
        zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
        zv->zv_zso->zso_volmode = volmode;
        if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp;
                struct g_geom *gp;

                zsg->zsg_state = ZVOL_GEOM_UNINIT;
                mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

                g_topology_lock();
                gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
                gp->start = zvol_geom_bio_start;
                gp->access = zvol_geom_access;
                pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
                /* g_new_providerf() sleeps for memory and cannot fail. */
                pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
                pp->sectorsize = DEV_BSIZE;
                pp->mediasize = 0;
                pp->private = zv;

                zsg->zsg_provider = pp;
                bioq_init(&zsg->zsg_queue);
        } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev;
                struct make_dev_args args;

                make_dev_args_init(&args);
                args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
                args.mda_devsw = &zvol_cdevsw;
                args.mda_cr = NULL;
                args.mda_uid = UID_ROOT;
                args.mda_gid = GID_OPERATOR;
                args.mda_mode = 0640;
                args.mda_si_drv2 = zv;
                error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
                if (error != 0) {
                        mutex_destroy(&zv->zv_state_lock);
                        kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
                        kmem_free(zv, sizeof (*zv));
                        dmu_objset_disown(os, B_TRUE, FTAG);
                        goto out_giant;
                }
                dev->si_iosize_max = MAXPHYS;
                zsd->zsd_cdev = dev;
        }
        (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
        rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
        zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

        if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
                zv->zv_flags |= ZVOL_RDONLY;

        zv->zv_volblocksize = doi->doi_data_block_size;
        zv->zv_volsize = volsize;
        zv->zv_objset = os;

        if (spa_writeable(dmu_objset_spa(os))) {
                if (zil_replay_disable)
                        zil_destroy(dmu_objset_zil(os), B_FALSE);
                else
                        zil_replay(os, zv, zvol_replay_vector);
        }
        ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
        dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

        /* XXX do prefetch */

        zv->zv_objset = NULL;
out_dmu_objset_disown:
        dmu_objset_disown(os, B_TRUE, FTAG);

        if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
                if (error == 0)
                        zvol_geom_run(zv);
                g_topology_unlock();
        }
out_doi:
        kmem_free(doi, sizeof (dmu_object_info_t));
        if (error == 0) {
                rw_enter(&zvol_state_lock, RW_WRITER);
                zvol_insert(zv);
                zvol_minors++;
                rw_exit(&zvol_state_lock);
        }
        ZFS_LOG(1, "ZVOL %s created.", name);
out_giant:
        PICKUP_GIANT();
        return (error);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
        ASSERT(RW_LOCK_HELD(&zvol_state_lock));
        if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp = zsg->zsg_provider;

                if (pp == NULL) /* XXX when? */
                        return;

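                /*
                 * Signal the worker thread to stop and wait for it to
                 * acknowledge; it flips zsg_state back to ZVOL_GEOM_RUNNING
                 * just before exiting (see zvol_geom_worker()).
                 */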
                mtx_lock(&zsg->zsg_queue_mtx);
                zsg->zsg_state = ZVOL_GEOM_STOPPED;
                pp->private = NULL;
                wakeup_one(&zsg->zsg_queue);
                while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
                        msleep(&zsg->zsg_state,
                            &zsg->zsg_queue_mtx,
                            0, "zvol:w", 0);
                mtx_unlock(&zsg->zsg_queue_mtx);
                ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
        }
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
        zv->zv_volsize = volsize;
        if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp = zsg->zsg_provider;

                if (pp == NULL) /* XXX when? */
                        return (0);

                g_topology_lock();

                /*
                 * Do not invoke resize event when initial size was zero.
                 * ZVOL initializes the size on first open, this is not
                 * real resizing.
                 */
                if (pp->mediasize == 0)
                        pp->mediasize = zv->zv_volsize;
                else
                        g_resize_provider(pp, zv->zv_volsize);

                g_topology_unlock();
        }
        return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
        /* XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); */
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
        /* XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); */
}

static const zvol_platform_ops_t zvol_freebsd_ops = {
        .zv_free = zvol_free,
        .zv_rename_minor = zvol_rename_minor,
        .zv_create_minor = zvol_create_minor_impl,
        .zv_update_volsize = zvol_update_volsize,
        .zv_clear_private = zvol_clear_private,
        .zv_is_zvol = zvol_is_zvol_impl,
        .zv_set_disk_ro = zvol_set_disk_ro_impl,
        .zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
        return (zvol_minors != 0);
}

int
zvol_init(void)
{
        zvol_init_impl();
        zvol_register_ops(&zvol_freebsd_ops);
        return (0);
}

void
zvol_fini(void)
{
        zvol_fini_impl();
}