sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  23  */
  24
  25 #include <sys/dataset_kstats.h>
  26 #include <sys/dbuf.h>
  27 #include <sys/dmu_traverse.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_prop.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/zap.h>
  32 #include <sys/zfeature.h>
  33 #include <sys/zil_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/zio.h>
  36 #include <sys/zfs_rlock.h>
  37 #include <sys/spa_impl.h>
  38 #include <sys/zvol.h>
  39 #include <sys/zvol_impl.h>
  40
  41 #include <linux/blkdev_compat.h>
  42 #include <linux/task_io_accounting_ops.h>
  43
  44 unsigned int zvol_major = ZVOL_MAJOR;
  45 unsigned int zvol_request_sync = 0;
  46 unsigned int zvol_prefetch_bytes = (128 * 1024);
  47 unsigned long zvol_max_discard_blocks = 16384;
  48 unsigned int zvol_threads = 32;
  49
  50 struct zvol_state_os {
  51         struct gendisk          *zvo_disk;      /* generic disk */
  52         struct request_queue    *zvo_queue;     /* request queue */
  53         dev_t                   zvo_dev;        /* device id */
  54 };
  55
  56 taskq_t *zvol_taskq;
  57 static struct ida zvol_ida;
  58
  59 typedef struct zv_request_stack {
  60         zvol_state_t    *zv;
  61         struct bio      *bio;
  62 } zv_request_t;
  63
  64 typedef struct zv_request_task {
  65         zv_request_t zvr;
  66         taskq_ent_t     ent;
  67 } zv_request_task_t;
  68
  69 static zv_request_task_t *
  70 zv_request_task_create(zv_request_t zvr)
  71 {
  72         zv_request_task_t *task;
  73         task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
  74         taskq_init_ent(&task->ent);
  75         task->zvr = zvr;
  76         return (task);
  77 }
  78
  79 static void
  80 zv_request_task_free(zv_request_task_t *task)
  81 {
  82         kmem_free(task, sizeof (*task));
  83 }
  84
  85 /*
  86  * Given a path, return TRUE if path is a ZVOL.
  87  */
  88 static boolean_t
  89 zvol_is_zvol_impl(const char *path)
  90 {
  91         dev_t dev = 0;
  92
  93         if (vdev_lookup_bdev(path, &dev) != 0)
  94                 return (B_FALSE);
  95
  96         if (MAJOR(dev) == zvol_major)
  97                 return (B_TRUE);
  98
  99         return (B_FALSE);
 100 }
 101
 102 static void
 103 zvol_write(zv_request_t *zvr)
 104 {
 105         struct bio *bio = zvr->bio;
 106         int error = 0;
 107         zfs_uio_t uio;
 108
 109         zfs_uio_bvec_init(&uio, bio);
 110
 111         zvol_state_t *zv = zvr->zv;
 112         ASSERT3P(zv, !=, NULL);
 113         ASSERT3U(zv->zv_open_count, >, 0);
 114         ASSERT3P(zv->zv_zilog, !=, NULL);
 115
 116         /* bio marked as FLUSH need to flush before write */
 117         if (bio_is_flush(bio))
 118                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 119
 120         /* Some requests are just for flush and nothing else. */
 121         if (uio.uio_resid == 0) {
 122                 rw_exit(&zv->zv_suspend_lock);
 123                 BIO_END_IO(bio, 0);
 124                 return;
 125         }
 126
 127         struct request_queue *q = zv->zv_zso->zvo_queue;
 128         struct gendisk *disk = zv->zv_zso->zvo_disk;
 129         ssize_t start_resid = uio.uio_resid;
 130         unsigned long start_time;
 131
 132         boolean_t acct = blk_queue_io_stat(q);
 133         if (acct)
 134                 start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
 135
 136         boolean_t sync =
 137             bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 138
 139         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 140             uio.uio_loffset, uio.uio_resid, RL_WRITER);
 141
 142         uint64_t volsize = zv->zv_volsize;
 143         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 144                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 145                 uint64_t off = uio.uio_loffset;
 146                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 147
 148                 if (bytes > volsize - off)      /* don't write past the end */
 149                         bytes = volsize - off;
 150
 151                 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 152
 153                 /* This will only fail for ENOSPC */
 154                 error = dmu_tx_assign(tx, TXG_WAIT);
 155                 if (error) {
 156                         dmu_tx_abort(tx);
 157                         break;
 158                 }
 159                 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 160                 if (error == 0) {
 161                         zvol_log_write(zv, tx, off, bytes, sync);
 162                 }
 163                 dmu_tx_commit(tx);
 164
 165                 if (error)
 166                         break;
 167         }
 168         zfs_rangelock_exit(lr);
 169
 170         int64_t nwritten = start_resid - uio.uio_resid;
 171         dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 172         task_io_account_write(nwritten);
 173
 174         if (sync)
 175                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 176
 177         rw_exit(&zv->zv_suspend_lock);
 178
 179         if (acct)
 180                 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 181
 182         BIO_END_IO(bio, -error);
 183 }
 184
 185 static void
 186 zvol_write_task(void *arg)
 187 {
 188         zv_request_task_t *task = arg;
 189         zvol_write(&task->zvr);
 190         zv_request_task_free(task);
 191 }
 192
 193 static void
 194 zvol_discard(zv_request_t *zvr)
 195 {
 196         struct bio *bio = zvr->bio;
 197         zvol_state_t *zv = zvr->zv;
 198         uint64_t start = BIO_BI_SECTOR(bio) << 9;
 199         uint64_t size = BIO_BI_SIZE(bio);
 200         uint64_t end = start + size;
 201         boolean_t sync;
 202         int error = 0;
 203         dmu_tx_t *tx;
 204
 205         ASSERT3P(zv, !=, NULL);
 206         ASSERT3U(zv->zv_open_count, >, 0);
 207         ASSERT3P(zv->zv_zilog, !=, NULL);
 208
 209         struct request_queue *q = zv->zv_zso->zvo_queue;
 210         struct gendisk *disk = zv->zv_zso->zvo_disk;
 211         unsigned long start_time;
 212
 213         boolean_t acct = blk_queue_io_stat(q);
 214         if (acct)
 215                 start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
 216
 217         sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 218
 219         if (end > zv->zv_volsize) {
 220                 error = SET_ERROR(EIO);
 221                 goto unlock;
 222         }
 223
 224         /*
 225          * Align the request to volume block boundaries when a secure erase is
 226          * not required.  This will prevent dnode_free_range() from zeroing out
 227          * the unaligned parts which is slow (read-modify-write) and useless
 228          * since we are not freeing any space by doing so.
 229          */
 230         if (!bio_is_secure_erase(bio)) {
 231                 start = P2ROUNDUP(start, zv->zv_volblocksize);
 232                 end = P2ALIGN(end, zv->zv_volblocksize);
 233                 size = end - start;
 234         }
 235
 236         if (start >= end)
 237                 goto unlock;
 238
 239         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 240             start, size, RL_WRITER);
 241
 242         tx = dmu_tx_create(zv->zv_objset);
 243         dmu_tx_mark_netfree(tx);
 244         error = dmu_tx_assign(tx, TXG_WAIT);
 245         if (error != 0) {
 246                 dmu_tx_abort(tx);
 247         } else {
 248                 zvol_log_truncate(zv, tx, start, size, B_TRUE);
 249                 dmu_tx_commit(tx);
 250                 error = dmu_free_long_range(zv->zv_objset,
 251                     ZVOL_OBJ, start, size);
 252         }
 253         zfs_rangelock_exit(lr);
 254
 255         if (error == 0 && sync)
 256                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 257
 258 unlock:
 259         rw_exit(&zv->zv_suspend_lock);
 260
 261         if (acct)
 262                 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 263
 264         BIO_END_IO(bio, -error);
 265 }
 266
 267 static void
 268 zvol_discard_task(void *arg)
 269 {
 270         zv_request_task_t *task = arg;
 271         zvol_discard(&task->zvr);
 272         zv_request_task_free(task);
 273 }
 274
 275 static void
 276 zvol_read(zv_request_t *zvr)
 277 {
 278         struct bio *bio = zvr->bio;
 279         int error = 0;
 280         zfs_uio_t uio;
 281
 282         zfs_uio_bvec_init(&uio, bio);
 283
 284         zvol_state_t *zv = zvr->zv;
 285         ASSERT3P(zv, !=, NULL);
 286         ASSERT3U(zv->zv_open_count, >, 0);
 287
 288         struct request_queue *q = zv->zv_zso->zvo_queue;
 289         struct gendisk *disk = zv->zv_zso->zvo_disk;
 290         ssize_t start_resid = uio.uio_resid;
 291         unsigned long start_time;
 292
 293         boolean_t acct = blk_queue_io_stat(q);
 294         if (acct)
 295                 start_time = blk_generic_start_io_acct(q, disk, READ, bio);
 296
 297         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 298             uio.uio_loffset, uio.uio_resid, RL_READER);
 299
 300         uint64_t volsize = zv->zv_volsize;
 301         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 302                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 303
 304                 /* don't read past the end */
 305                 if (bytes > volsize - uio.uio_loffset)
 306                         bytes = volsize - uio.uio_loffset;
 307
 308                 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 309                 if (error) {
 310                         /* convert checksum errors into IO errors */
 311                         if (error == ECKSUM)
 312                                 error = SET_ERROR(EIO);
 313                         break;
 314                 }
 315         }
 316         zfs_rangelock_exit(lr);
 317
 318         int64_t nread = start_resid - uio.uio_resid;
 319         dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
 320         task_io_account_read(nread);
 321
 322         rw_exit(&zv->zv_suspend_lock);
 323
 324         if (acct)
 325                 blk_generic_end_io_acct(q, disk, READ, bio, start_time);
 326
 327         BIO_END_IO(bio, -error);
 328 }
 329
 330 static void
 331 zvol_read_task(void *arg)
 332 {
 333         zv_request_task_t *task = arg;
 334         zvol_read(&task->zvr);
 335         zv_request_task_free(task);
 336 }
 337
 338 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 339 static blk_qc_t
 340 zvol_submit_bio(struct bio *bio)
 341 #else
 342 static MAKE_REQUEST_FN_RET
 343 zvol_request(struct request_queue *q, struct bio *bio)
 344 #endif
 345 {
 346 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 347 #if defined(HAVE_BIO_BDEV_DISK)
 348         struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 349 #else
 350         struct request_queue *q = bio->bi_disk->queue;
 351 #endif
 352 #endif
 353         zvol_state_t *zv = q->queuedata;
 354         fstrans_cookie_t cookie = spl_fstrans_mark();
 355         uint64_t offset = BIO_BI_SECTOR(bio) << 9;
 356         uint64_t size = BIO_BI_SIZE(bio);
 357         int rw = bio_data_dir(bio);
 358
 359         if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
 360                 printk(KERN_INFO
 361                     "%s: bad access: offset=%llu, size=%lu\n",
 362                     zv->zv_zso->zvo_disk->disk_name,
 363                     (long long unsigned)offset,
 364                     (long unsigned)size);
 365
 366                 BIO_END_IO(bio, -SET_ERROR(EIO));
 367                 goto out;
 368         }
 369
 370         zv_request_t zvr = {
 371                 .zv = zv,
 372                 .bio = bio,
 373         };
 374         zv_request_task_t *task;
 375
 376         if (rw == WRITE) {
 377                 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 378                         BIO_END_IO(bio, -SET_ERROR(EROFS));
 379                         goto out;
 380                 }
 381
 382                 /*
 383                  * Prevents the zvol from being suspended, or the ZIL being
 384                  * concurrently opened.  Will be released after the i/o
 385                  * completes.
 386                  */
 387                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 388
 389                 /*
 390                  * Open a ZIL if this is the first time we have written to this
 391                  * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 392                  * than zv_state_lock so that we don't need to acquire an
 393                  * additional lock in this path.
 394                  */
 395                 if (zv->zv_zilog == NULL) {
 396                         rw_exit(&zv->zv_suspend_lock);
 397                         rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 398                         if (zv->zv_zilog == NULL) {
 399                                 zv->zv_zilog = zil_open(zv->zv_objset,
 400                                     zvol_get_data);
 401                                 zv->zv_flags |= ZVOL_WRITTEN_TO;
 402                                 /* replay / destroy done in zvol_create_minor */
 403                                 VERIFY0((zv->zv_zilog->zl_header->zh_flags &
 404                                     ZIL_REPLAY_NEEDED));
 405                         }
 406                         rw_downgrade(&zv->zv_suspend_lock);
 407                 }
 408
 409                 /*
 410                  * We don't want this thread to be blocked waiting for i/o to
 411                  * complete, so we instead wait from a taskq callback. The
 412                  * i/o may be a ZIL write (via zil_commit()), or a read of an
 413                  * indirect block, or a read of a data block (if this is a
 414                  * partial-block write).  We will indicate that the i/o is
 415                  * complete by calling BIO_END_IO() from the taskq callback.
 416                  *
 417                  * This design allows the calling thread to continue and
 418                  * initiate more concurrent operations by calling
 419                  * zvol_request() again. There are typically only a small
 420                  * number of threads available to call zvol_request() (e.g.
 421                  * one per iSCSI target), so keeping the latency of
 422                  * zvol_request() low is important for performance.
 423                  *
 424                  * The zvol_request_sync module parameter allows this
 425                  * behavior to be altered, for performance evaluation
 426                  * purposes.  If the callback blocks, setting
 427                  * zvol_request_sync=1 will result in much worse performance.
 428                  *
 429                  * We can have up to zvol_threads concurrent i/o's being
 430                  * processed for all zvols on the system.  This is typically
 431                  * a vast improvement over the zvol_request_sync=1 behavior
 432                  * of one i/o at a time per zvol.  However, an even better
 433                  * design would be for zvol_request() to initiate the zio
 434                  * directly, and then be notified by the zio_done callback,
 435                  * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
 436                  * interfaces lack this functionality (they block waiting for
 437                  * the i/o to complete).
 438                  */
 439                 if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
 440                         if (zvol_request_sync) {
 441                                 zvol_discard(&zvr);
 442                         } else {
 443                                 task = zv_request_task_create(zvr);
 444                                 taskq_dispatch_ent(zvol_taskq,
 445                                     zvol_discard_task, task, 0, &task->ent);
 446                         }
 447                 } else {
 448                         if (zvol_request_sync) {
 449                                 zvol_write(&zvr);
 450                         } else {
 451                                 task = zv_request_task_create(zvr);
 452                                 taskq_dispatch_ent(zvol_taskq,
 453                                     zvol_write_task, task, 0, &task->ent);
 454                         }
 455                 }
 456         } else {
 457                 /*
 458                  * The SCST driver, and possibly others, may issue READ I/Os
 459                  * with a length of zero bytes.  These empty I/Os contain no
 460                  * data and require no additional handling.
 461                  */
 462                 if (size == 0) {
 463                         BIO_END_IO(bio, 0);
 464                         goto out;
 465                 }
 466
 467                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 468
 469                 /* See comment in WRITE case above. */
 470                 if (zvol_request_sync) {
 471                         zvol_read(&zvr);
 472                 } else {
 473                         task = zv_request_task_create(zvr);
 474                         taskq_dispatch_ent(zvol_taskq,
 475                             zvol_read_task, task, 0, &task->ent);
 476                 }
 477         }
 478
 479 out:
 480         spl_fstrans_unmark(cookie);
 481 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
 482         defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
 483         return (BLK_QC_T_NONE);
 484 #endif
 485 }
 486
 487 static int
 488 zvol_open(struct block_device *bdev, fmode_t flag)
 489 {
 490         zvol_state_t *zv;
 491         int error = 0;
 492         boolean_t drop_suspend = B_TRUE;
 493
 494         rw_enter(&zvol_state_lock, RW_READER);
 495         /*
 496          * Obtain a copy of private_data under the zvol_state_lock to make
 497          * sure that either the result of zvol free code path setting
 498          * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
 499          * is not called on this zv because of the positive zv_open_count.
 500          */
 501         zv = bdev->bd_disk->private_data;
 502         if (zv == NULL) {
 503                 rw_exit(&zvol_state_lock);
 504                 return (SET_ERROR(-ENXIO));
 505         }
 506
 507         mutex_enter(&zv->zv_state_lock);
 508         /*
 509          * make sure zvol is not suspended during first open
 510          * (hold zv_suspend_lock) and respect proper lock acquisition
 511          * ordering - zv_suspend_lock before zv_state_lock
 512          */
 513         if (zv->zv_open_count == 0) {
 514                 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 515                         mutex_exit(&zv->zv_state_lock);
 516                         rw_enter(&zv->zv_suspend_lock, RW_READER);
 517                         mutex_enter(&zv->zv_state_lock);
 518                         /* check to see if zv_suspend_lock is needed */
 519                         if (zv->zv_open_count != 0) {
 520                                 rw_exit(&zv->zv_suspend_lock);
 521                                 drop_suspend = B_FALSE;
 522                         }
 523                 }
 524         } else {
 525                 drop_suspend = B_FALSE;
 526         }
 527         rw_exit(&zvol_state_lock);
 528
 529         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 530
 531         if (zv->zv_open_count == 0) {
 532                 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 533                 error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
 534                 if (error)
 535                         goto out_mutex;
 536         }
 537
 538         if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 539                 error = -EROFS;
 540                 goto out_open_count;
 541         }
 542
 543         zv->zv_open_count++;
 544
 545         mutex_exit(&zv->zv_state_lock);
 546         if (drop_suspend)
 547                 rw_exit(&zv->zv_suspend_lock);
 548
 549         zfs_check_media_change(bdev);
 550
 551         return (0);
 552
 553 out_open_count:
 554         if (zv->zv_open_count == 0)
 555                 zvol_last_close(zv);
 556
 557 out_mutex:
 558         mutex_exit(&zv->zv_state_lock);
 559         if (drop_suspend)
 560                 rw_exit(&zv->zv_suspend_lock);
 561         if (error == -EINTR) {
 562                 error = -ERESTARTSYS;
 563                 schedule();
 564         }
 565         return (SET_ERROR(error));
 566 }
 567
 568 static void
 569 zvol_release(struct gendisk *disk, fmode_t mode)
 570 {
 571         zvol_state_t *zv;
 572         boolean_t drop_suspend = B_TRUE;
 573
 574         rw_enter(&zvol_state_lock, RW_READER);
 575         zv = disk->private_data;
 576
 577         mutex_enter(&zv->zv_state_lock);
 578         ASSERT3U(zv->zv_open_count, >, 0);
 579         /*
 580          * make sure zvol is not suspended during last close
 581          * (hold zv_suspend_lock) and respect proper lock acquisition
 582          * ordering - zv_suspend_lock before zv_state_lock
 583          */
 584         if (zv->zv_open_count == 1) {
 585                 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 586                         mutex_exit(&zv->zv_state_lock);
 587                         rw_enter(&zv->zv_suspend_lock, RW_READER);
 588                         mutex_enter(&zv->zv_state_lock);
 589                         /* check to see if zv_suspend_lock is needed */
 590                         if (zv->zv_open_count != 1) {
 591                                 rw_exit(&zv->zv_suspend_lock);
 592                                 drop_suspend = B_FALSE;
 593                         }
 594                 }
 595         } else {
 596                 drop_suspend = B_FALSE;
 597         }
 598         rw_exit(&zvol_state_lock);
 599
 600         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 601
 602         zv->zv_open_count--;
 603         if (zv->zv_open_count == 0) {
 604                 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 605                 zvol_last_close(zv);
 606         }
 607
 608         mutex_exit(&zv->zv_state_lock);
 609
 610         if (drop_suspend)
 611                 rw_exit(&zv->zv_suspend_lock);
 612 }
 613
 614 static int
 615 zvol_ioctl(struct block_device *bdev, fmode_t mode,
 616     unsigned int cmd, unsigned long arg)
 617 {
 618         zvol_state_t *zv = bdev->bd_disk->private_data;
 619         int error = 0;
 620
 621         ASSERT3U(zv->zv_open_count, >, 0);
 622
 623         switch (cmd) {
 624         case BLKFLSBUF:
 625                 fsync_bdev(bdev);
 626                 invalidate_bdev(bdev);
 627                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 628
 629                 if (!(zv->zv_flags & ZVOL_RDONLY))
 630                         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 631
 632                 rw_exit(&zv->zv_suspend_lock);
 633                 break;
 634
 635         case BLKZNAME:
 636                 mutex_enter(&zv->zv_state_lock);
 637                 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 638                 mutex_exit(&zv->zv_state_lock);
 639                 break;
 640
 641         default:
 642                 error = -ENOTTY;
 643                 break;
 644         }
 645
 646         return (SET_ERROR(error));
 647 }
 648
 649 #ifdef CONFIG_COMPAT
 650 static int
 651 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
 652     unsigned cmd, unsigned long arg)
 653 {
 654         return (zvol_ioctl(bdev, mode, cmd, arg));
 655 }
 656 #else
 657 #define zvol_compat_ioctl       NULL
 658 #endif
 659
 660 static unsigned int
 661 zvol_check_events(struct gendisk *disk, unsigned int clearing)
 662 {
 663         unsigned int mask = 0;
 664
 665         rw_enter(&zvol_state_lock, RW_READER);
 666
 667         zvol_state_t *zv = disk->private_data;
 668         if (zv != NULL) {
 669                 mutex_enter(&zv->zv_state_lock);
 670                 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 671                 zv->zv_changed = 0;
 672                 mutex_exit(&zv->zv_state_lock);
 673         }
 674
 675         rw_exit(&zvol_state_lock);
 676
 677         return (mask);
 678 }
 679
 680 static int
 681 zvol_revalidate_disk(struct gendisk *disk)
 682 {
 683         rw_enter(&zvol_state_lock, RW_READER);
 684
 685         zvol_state_t *zv = disk->private_data;
 686         if (zv != NULL) {
 687                 mutex_enter(&zv->zv_state_lock);
 688                 set_capacity(zv->zv_zso->zvo_disk,
 689                     zv->zv_volsize >> SECTOR_BITS);
 690                 mutex_exit(&zv->zv_state_lock);
 691         }
 692
 693         rw_exit(&zvol_state_lock);
 694
 695         return (0);
 696 }
 697
 698 static int
 699 zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
 700 {
 701         struct gendisk *disk = zv->zv_zso->zvo_disk;
 702
 703 #if defined(HAVE_REVALIDATE_DISK_SIZE)
 704         revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
 705 #elif defined(HAVE_REVALIDATE_DISK)
 706         revalidate_disk(disk);
 707 #else
 708         zvol_revalidate_disk(disk);
 709 #endif
 710         return (0);
 711 }
 712
 713 static void
 714 zvol_clear_private(zvol_state_t *zv)
 715 {
 716         /*
 717          * Cleared while holding zvol_state_lock as a writer
 718          * which will prevent zvol_open() from opening it.
 719          */
 720         zv->zv_zso->zvo_disk->private_data = NULL;
 721 }
 722
 723 /*
 724  * Provide a simple virtual geometry for legacy compatibility.  For devices
 725  * smaller than 1 MiB a small head and sector count is used to allow very
 726  * tiny devices.  For devices over 1 MiB a standard head and sector count
 727  * is used to keep the cylinders count reasonable.
 728  */
 729 static int
 730 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 731 {
 732         zvol_state_t *zv = bdev->bd_disk->private_data;
 733         sector_t sectors;
 734
 735         ASSERT3U(zv->zv_open_count, >, 0);
 736
 737         sectors = get_capacity(zv->zv_zso->zvo_disk);
 738
 739         if (sectors > 2048) {
 740                 geo->heads = 16;
 741                 geo->sectors = 63;
 742         } else {
 743                 geo->heads = 2;
 744                 geo->sectors = 4;
 745         }
 746
 747         geo->start = 0;
 748         geo->cylinders = sectors / (geo->heads * geo->sectors);
 749
 750         return (0);
 751 }
 752
 753 static struct block_device_operations zvol_ops = {
 754         .open                   = zvol_open,
 755         .release                = zvol_release,
 756         .ioctl                  = zvol_ioctl,
 757         .compat_ioctl           = zvol_compat_ioctl,
 758         .check_events           = zvol_check_events,
 759 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 760         .revalidate_disk        = zvol_revalidate_disk,
 761 #endif
 762         .getgeo                 = zvol_getgeo,
 763         .owner                  = THIS_MODULE,
 764 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 765         .submit_bio             = zvol_submit_bio,
 766 #endif
 767 };
 768
 769 /*
 770  * Allocate memory for a new zvol_state_t and setup the required
 771  * request queue and generic disk structures for the block device.
 772  */
 773 static zvol_state_t *
 774 zvol_alloc(dev_t dev, const char *name)
 775 {
 776         zvol_state_t *zv;
 777         struct zvol_state_os *zso;
 778         uint64_t volmode;
 779
 780         if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
 781                 return (NULL);
 782
 783         if (volmode == ZFS_VOLMODE_DEFAULT)
 784                 volmode = zvol_volmode;
 785
 786         if (volmode == ZFS_VOLMODE_NONE)
 787                 return (NULL);
 788
 789         zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 790         zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 791         zv->zv_zso = zso;
 792         zv->zv_volmode = volmode;
 793
 794         list_link_init(&zv->zv_next);
 795         mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 796
 797 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 798 #ifdef HAVE_BLK_ALLOC_DISK
 799         zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
 800         if (zso->zvo_disk == NULL)
 801                 goto out_kmem;
 802
 803         zso->zvo_disk->minors = ZVOL_MINORS;
 804         zso->zvo_queue = zso->zvo_disk->queue;
 805 #else
 806         zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 807         if (zso->zvo_queue == NULL)
 808                 goto out_kmem;
 809
 810         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 811         if (zso->zvo_disk == NULL) {
 812                 blk_cleanup_queue(zso->zvo_queue);
 813                 goto out_kmem;
 814         }
 815
 816         zso->zvo_disk->queue = zso->zvo_queue;
 817 #endif /* HAVE_BLK_ALLOC_DISK */
 818 #else
 819         zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
 820         if (zso->zvo_queue == NULL)
 821                 goto out_kmem;
 822
 823         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 824         if (zso->zvo_disk == NULL) {
 825                 blk_cleanup_queue(zso->zvo_queue);
 826                 goto out_kmem;
 827         }
 828
 829         zso->zvo_disk->queue = zso->zvo_queue;
 830 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 831
 832         blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
 833
 834         /* Limit read-ahead to a single page to prevent over-prefetching. */
 835         blk_queue_set_read_ahead(zso->zvo_queue, 1);
 836
 837         /* Disable write merging in favor of the ZIO pipeline. */
 838         blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
 839
 840         /* Enable /proc/diskstats */
 841         blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
 842
 843         zso->zvo_queue->queuedata = zv;
 844         zso->zvo_dev = dev;
 845         zv->zv_open_count = 0;
 846         strlcpy(zv->zv_name, name, MAXNAMELEN);
 847
 848         zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 849         rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 850
 851         zso->zvo_disk->major = zvol_major;
 852         zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
 853
 854         if (volmode == ZFS_VOLMODE_DEV) {
 855                 /*
 856                  * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
 857                  * gendisk->minors = 1 as noted in include/linux/genhd.h.
 858                  * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
 859                  * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
 860                  * setting gendisk->flags accordingly.
 861                  */
 862                 zso->zvo_disk->minors = 1;
 863 #if defined(GENHD_FL_EXT_DEVT)
 864                 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
 865 #endif
 866 #if defined(GENHD_FL_NO_PART_SCAN)
 867                 zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 868 #endif
 869         }
 870         zso->zvo_disk->first_minor = (dev & MINORMASK);
 871         zso->zvo_disk->fops = &zvol_ops;
 872         zso->zvo_disk->private_data = zv;
 873         snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 874             ZVOL_DEV_NAME, (dev & MINORMASK));
 875
 876         return (zv);
 877
 878 out_kmem:
 879         kmem_free(zso, sizeof (struct zvol_state_os));
 880         kmem_free(zv, sizeof (zvol_state_t));
 881         return (NULL);
 882 }
 883
 884 /*
 885  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 886  * At this time, the structure is not opened by anyone, is taken off
 887  * the zvol_state_list, and has its private data set to NULL.
 888  * The zvol_state_lock is dropped.
 889  *
 890  * This function may take many milliseconds to complete (e.g. we've seen
 891  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 892  * "del_gendisk". Thus, consumers need to be careful to account for this
 893  * latency when calling this function.
 894  */
 895 static void
 896 zvol_free(zvol_state_t *zv)
 897 {
 898
 899         ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
 900         ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
 901         ASSERT0(zv->zv_open_count);
 902         ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
 903
 904         rw_destroy(&zv->zv_suspend_lock);
 905         zfs_rangelock_fini(&zv->zv_rangelock);
 906
 907         del_gendisk(zv->zv_zso->zvo_disk);
 908 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 909         defined(HAVE_BLK_ALLOC_DISK)
 910         blk_cleanup_disk(zv->zv_zso->zvo_disk);
 911 #else
 912         blk_cleanup_queue(zv->zv_zso->zvo_queue);
 913         put_disk(zv->zv_zso->zvo_disk);
 914 #endif
 915
 916         ida_simple_remove(&zvol_ida,
 917             MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
 918
 919         mutex_destroy(&zv->zv_state_lock);
 920         dataset_kstats_destroy(&zv->zv_kstat);
 921
 922         kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 923         kmem_free(zv, sizeof (zvol_state_t));
 924 }
 925
 926 void
 927 zvol_wait_close(zvol_state_t *zv)
 928 {
 929 }
 930
 931 /*
 932  * Create a block device minor node and setup the linkage between it
 933  * and the specified volume.  Once this function returns the block
 934  * device is live and ready for use.
 935  */
 936 static int
 937 zvol_os_create_minor(const char *name)
 938 {
 939         zvol_state_t *zv;
 940         objset_t *os;
 941         dmu_object_info_t *doi;
 942         uint64_t volsize;
 943         uint64_t len;
 944         unsigned minor = 0;
 945         int error = 0;
 946         int idx;
 947         uint64_t hash = zvol_name_hash(name);
 948
 949         if (zvol_inhibit_dev)
 950                 return (0);
 951
 952         idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
 953         if (idx < 0)
 954                 return (SET_ERROR(-idx));
 955         minor = idx << ZVOL_MINOR_BITS;
 956
 957         zv = zvol_find_by_name_hash(name, hash, RW_NONE);
 958         if (zv) {
 959                 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 960                 mutex_exit(&zv->zv_state_lock);
 961                 ida_simple_remove(&zvol_ida, idx);
 962                 return (SET_ERROR(EEXIST));
 963         }
 964
 965         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 966
 967         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
 968         if (error)
 969                 goto out_doi;
 970
 971         error = dmu_object_info(os, ZVOL_OBJ, doi);
 972         if (error)
 973                 goto out_dmu_objset_disown;
 974
 975         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 976         if (error)
 977                 goto out_dmu_objset_disown;
 978
 979         zv = zvol_alloc(MKDEV(zvol_major, minor), name);
 980         if (zv == NULL) {
 981                 error = SET_ERROR(EAGAIN);
 982                 goto out_dmu_objset_disown;
 983         }
 984         zv->zv_hash = hash;
 985
 986         if (dmu_objset_is_snapshot(os))
 987                 zv->zv_flags |= ZVOL_RDONLY;
 988
 989         zv->zv_volblocksize = doi->doi_data_block_size;
 990         zv->zv_volsize = volsize;
 991         zv->zv_objset = os;
 992
 993         set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
 994
 995         blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
 996             (DMU_MAX_ACCESS / 4) >> 9);
 997         blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
 998         blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
 999         blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
1000             zv->zv_volblocksize);
1001         blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
1002         blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
1003             (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
1004         blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
1005             zv->zv_volblocksize);
1006         blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1007 #ifdef QUEUE_FLAG_NONROT
1008         blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1009 #endif
1010 #ifdef QUEUE_FLAG_ADD_RANDOM
1011         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1012 #endif
1013         /* This flag was introduced in kernel version 4.12. */
1014 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1015         blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1016 #endif
1017
1018         ASSERT3P(zv->zv_zilog, ==, NULL);
1019         zv->zv_zilog = zil_open(os, zvol_get_data);
1020         if (spa_writeable(dmu_objset_spa(os))) {
1021                 if (zil_replay_disable)
1022                         zil_destroy(zv->zv_zilog, B_FALSE);
1023                 else
1024                         zil_replay(os, zv, zvol_replay_vector);
1025         }
1026         zil_close(zv->zv_zilog);
1027         zv->zv_zilog = NULL;
1028         ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1029         dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1030
1031         /*
1032          * When udev detects the addition of the device it will immediately
1033          * invoke blkid(8) to determine the type of content on the device.
1034          * Prefetching the blocks commonly scanned by blkid(8) will speed
1035          * up this process.
1036          */
1037         len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
1038         if (len > 0) {
1039                 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1040                 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1041                     ZIO_PRIORITY_SYNC_READ);
1042         }
1043
1044         zv->zv_objset = NULL;
1045 out_dmu_objset_disown:
1046         dmu_objset_disown(os, B_TRUE, FTAG);
1047 out_doi:
1048         kmem_free(doi, sizeof (dmu_object_info_t));
1049
1050         /*
1051          * Keep in mind that once add_disk() is called, the zvol is
1052          * announced to the world, and zvol_open()/zvol_release() can
1053          * be called at any time. Incidentally, add_disk() itself calls
1054          * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1055          * directly as well.
1056          */
1057         if (error == 0) {
1058                 rw_enter(&zvol_state_lock, RW_WRITER);
1059                 zvol_insert(zv);
1060                 rw_exit(&zvol_state_lock);
1061                 add_disk(zv->zv_zso->zvo_disk);
1062         } else {
1063                 ida_simple_remove(&zvol_ida, idx);
1064         }
1065
1066         return (error);
1067 }
1068
1069 static void
1070 zvol_rename_minor(zvol_state_t *zv, const char *newname)
1071 {
1072         int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1073
1074         ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1075         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1076
1077         strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1078
1079         /* move to new hashtable entry  */
1080         zv->zv_hash = zvol_name_hash(zv->zv_name);
1081         hlist_del(&zv->zv_hlink);
1082         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1083
1084         /*
1085          * The block device's read-only state is briefly changed causing
1086          * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
1087          * the name change and fixes the symlinks.  This does not change
1088          * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1089          * changes.  This would normally be done using kobject_uevent() but
1090          * that is a GPL-only symbol which is why we need this workaround.
1091          */
1092         set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1093         set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1094 }
1095
1096 static void
1097 zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
1098 {
1099
1100         set_disk_ro(zv->zv_zso->zvo_disk, flags);
1101 }
1102
1103 static void
1104 zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
1105 {
1106
1107         set_capacity(zv->zv_zso->zvo_disk, capacity);
1108 }
1109
1110 const static zvol_platform_ops_t zvol_linux_ops = {
1111         .zv_free = zvol_free,
1112         .zv_rename_minor = zvol_rename_minor,
1113         .zv_create_minor = zvol_os_create_minor,
1114         .zv_update_volsize = zvol_update_volsize,
1115         .zv_clear_private = zvol_clear_private,
1116         .zv_is_zvol = zvol_is_zvol_impl,
1117         .zv_set_disk_ro = zvol_set_disk_ro_impl,
1118         .zv_set_capacity = zvol_set_capacity_impl,
1119 };
1120
1121 int
1122 zvol_init(void)
1123 {
1124         int error;
1125         int threads = MIN(MAX(zvol_threads, 1), 1024);
1126
1127         error = register_blkdev(zvol_major, ZVOL_DRIVER);
1128         if (error) {
1129                 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1130                 return (error);
1131         }
1132         zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
1133             threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1134         if (zvol_taskq == NULL) {
1135                 unregister_blkdev(zvol_major, ZVOL_DRIVER);
1136                 return (-ENOMEM);
1137         }
1138         zvol_init_impl();
1139         ida_init(&zvol_ida);
1140         zvol_register_ops(&zvol_linux_ops);
1141         return (0);
1142 }
1143
1144 void
1145 zvol_fini(void)
1146 {
1147         zvol_fini_impl();
1148         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1149         taskq_destroy(zvol_taskq);
1150         ida_destroy(&zvol_ida);
1151 }
1152
1153 /* BEGIN CSTYLED */
1154 module_param(zvol_inhibit_dev, uint, 0644);
1155 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
1156
1157 module_param(zvol_major, uint, 0444);
1158 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1159
1160 module_param(zvol_threads, uint, 0444);
1161 MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
1162
1163 module_param(zvol_request_sync, uint, 0644);
1164 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
1165
1166 module_param(zvol_max_discard_blocks, ulong, 0444);
1167 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1168
1169 module_param(zvol_prefetch_bytes, uint, 0644);
1170 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
1171
1172 module_param(zvol_volmode, uint, 0644);
1173 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
1174 /* END CSTYLED */