sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27
  28 #include <sys/sysmacros.h>
  29 #include <sys/zfs_context.h>
  30 #include <sys/fm/fs/zfs.h>
  31 #include <sys/spa.h>
  32 #include <sys/txg.h>
  33 #include <sys/spa_impl.h>
  34 #include <sys/vdev_impl.h>
  35 #include <sys/zio_impl.h>
  36 #include <sys/zio_compress.h>
  37 #include <sys/zio_checksum.h>
  38 #include <sys/dmu_objset.h>
  39 #include <sys/arc.h>
  40 #include <sys/ddt.h>
  41 #include <sys/trim_map.h>
  42 #include <sys/blkptr.h>
  43 #include <sys/zfeature.h>
  44
SYSCTL_DECL(_vfs_zfs);
/* Sysctl node "zio" under _vfs_zfs; parent of the knobs declared below. */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
/*
 * zio_use_uma: when non-zero, zio_init() creates the per-size buffer caches
 * and the zio_buf/zio_data_buf allocators use them; when zero those
 * allocators call kmem_alloc()/kmem_free() directly.  Enabled by default
 * only on amd64.
 */
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
/*
 * zio_exclude_metadata: when non-zero, the metadata buffer caches are
 * created with KMC_NODEBUG and non-cache metadata allocations are made with
 * KM_NODEBUG (see zio_init() and zio_buf_alloc()).
 */
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");
  59
/*
 * Named kstat counters describing TRIM activity.  zio_init() publishes this
 * structure as the virtual kstat "zfs:0:zio_trim" (class "misc").
 * Each entry is { name, data type, description }.
 */
zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	  "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	  "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	  "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	  "Number of TRIM requests that failed for reasons other than not supported" },
};

/* Handle of the installed "zio_trim" kstat; NULL when not installed. */
static kstat_t *zio_trim_ksp;
  72
  73 /*
  74  * ==========================================================================
  75  * I/O type descriptions
  76  * ==========================================================================
  77  */
/*
 * Printable name for each zio type; the array holds ZIO_TYPES entries.
 * NOTE(review): the entries are presumably listed in zio_type_t enumeration
 * order -- confirm against the enum definition.
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};
  82
  83 /*
  84  * ==========================================================================
  85  * I/O kmem caches
  86  * ==========================================================================
  87  */
/* Cache of zio_t structures; created in zio_init(). */
kmem_cache_t *zio_cache;
/* Cache of zio_link_t records that tie a parent zio to a child zio. */
kmem_cache_t *zio_link_cache;
/*
 * Per-size buffer caches for metadata (zio_buf_cache) and data
 * (zio_data_buf_cache).  Both are indexed by
 * (size - 1) >> SPA_MINBLOCKSHIFT and are only populated when zio_use_uma
 * is set.
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
/* Defined in another file; only declared here for kernel builds. */
extern vmem_t *zio_alloc_arena;
#endif
  96
/*
 * NOTE(review): presumably the values returned by pipeline stage functions
 * to tell the executor whether to keep going; their users are outside the
 * portion of the file reviewed here -- confirm.
 */
#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

/*
 * Evaluates to 2^(level * (indblkshift - SPA_BLKPTRSHIFT)) as a uint64_t.
 * NOTE(review): presumably the number of level-0 blocks covered by one block
 * pointer at the given indirection level -- confirm with the callers.
 */
#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
/* NOTE(review): meaning is not evident from this part of the file. */
#define	COMPARE_META_LEVEL	0x80000000ul
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
/*
 * Each sync-pass threshold is registered both as a loader tunable
 * (TUNABLE_INT) and as a sysctl under vfs.zfs declared with CTLFLAG_RDTUN.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
 127
 128 /*
 129  * An allocating zio is one that either currently has the DVA allocate
 130  * stage set or will have it later in its lifetime.
 131  */
/* Non-zero when the zio's original pipeline includes ZIO_STAGE_DVA_ALLOCATE. */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/*
 * NOTE(review): by its name this selects whether a requeued I/O is placed
 * ahead of already-queued work; its users are outside the portion of the
 * file reviewed here -- confirm.
 */
boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

/*
 * Only compiled for illumos builds: 16384 when ZFS_DEBUG is defined,
 * otherwise 0.
 */
#ifdef illumos
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
#endif
 143
 144 void
 145 zio_init(void)
 146 {
 147         size_t c;
 148         zio_cache = kmem_cache_create("zio_cache",
 149             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 150         zio_link_cache = kmem_cache_create("zio_link_cache",
 151             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 152         if (!zio_use_uma)
 153                 goto out;
 154
 155         /*
 156          * For small buffers, we want a cache for each multiple of
 157          * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 158          * for each quarter-power of 2.
 159          */
 160         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 161                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 162                 size_t p2 = size;
 163                 size_t align = 0;
 164                 int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
 165
 166                 while (!ISP2(p2))
 167                         p2 &= p2 - 1;
 168
 169 #ifdef illumos
 170 #ifndef _KERNEL
 171                 /*
 172                  * If we are using watchpoints, put each buffer on its own page,
 173                  * to eliminate the performance overhead of trapping to the
 174                  * kernel when modifying a non-watched buffer that shares the
 175                  * page with a watched buffer.
 176                  */
 177                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 178                         continue;
 179 #endif
 180 #endif /* illumos */
 181                 if (size <= 4 * SPA_MINBLOCKSIZE) {
 182                         align = SPA_MINBLOCKSIZE;
 183                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 184                         align = MIN(p2 >> 2, PAGESIZE);
 185                 }
 186
 187                 if (align != 0) {
 188                         char name[36];
 189                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 190                         zio_buf_cache[c] = kmem_cache_create(name, size,
 191                             align, NULL, NULL, NULL, NULL, NULL, cflags);
 192
 193                         /*
 194                          * Since zio_data bufs do not appear in crash dumps, we
 195                          * pass KMC_NOTOUCH so that no allocator metadata is
 196                          * stored with the buffers.
 197                          */
 198                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 199                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 200                             align, NULL, NULL, NULL, NULL, NULL,
 201                             cflags | KMC_NOTOUCH | KMC_NODEBUG);
 202                 }
 203         }
 204
 205         while (--c != 0) {
 206                 ASSERT(zio_buf_cache[c] != NULL);
 207                 if (zio_buf_cache[c - 1] == NULL)
 208                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 209
 210                 ASSERT(zio_data_buf_cache[c] != NULL);
 211                 if (zio_data_buf_cache[c - 1] == NULL)
 212                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 213         }
 214 out:
 215
 216         zio_inject_init();
 217
 218         zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
 219             KSTAT_TYPE_NAMED,
 220             sizeof(zio_trim_stats) / sizeof(kstat_named_t),
 221             KSTAT_FLAG_VIRTUAL);
 222
 223         if (zio_trim_ksp != NULL) {
 224                 zio_trim_ksp->ks_data = &zio_trim_stats;
 225                 kstat_install(zio_trim_ksp);
 226         }
 227 }
 228
 229 void
 230 zio_fini(void)
 231 {
 232         size_t c;
 233         kmem_cache_t *last_cache = NULL;
 234         kmem_cache_t *last_data_cache = NULL;
 235
 236         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 237                 if (zio_buf_cache[c] != last_cache) {
 238                         last_cache = zio_buf_cache[c];
 239                         kmem_cache_destroy(zio_buf_cache[c]);
 240                 }
 241                 zio_buf_cache[c] = NULL;
 242
 243                 if (zio_data_buf_cache[c] != last_data_cache) {
 244                         last_data_cache = zio_data_buf_cache[c];
 245                         kmem_cache_destroy(zio_data_buf_cache[c]);
 246                 }
 247                 zio_data_buf_cache[c] = NULL;
 248         }
 249
 250         kmem_cache_destroy(zio_link_cache);
 251         kmem_cache_destroy(zio_cache);
 252
 253         zio_inject_fini();
 254
 255         if (zio_trim_ksp != NULL) {
 256                 kstat_delete(zio_trim_ksp);
 257                 zio_trim_ksp = NULL;
 258         }
 259 }
 260
 261 /*
 262  * ==========================================================================
 263  * Allocate and free I/O buffers
 264  * ==========================================================================
 265  */
 266
 267 /*
 268  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 269  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 270  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 271  * excess / transient data in-core during a crashdump.
 272  */
 273 void *
 274 zio_buf_alloc(size_t size)
 275 {
 276         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 277         int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
 278
 279         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 280
 281         if (zio_use_uma)
 282                 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 283         else
 284                 return (kmem_alloc(size, KM_SLEEP|flags));
 285 }
 286
 287 /*
 288  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 289  * crashdump if the kernel panics.  This exists so that we will limit the amount
 290  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 291  * of kernel heap dumped to disk when the kernel panics)
 292  */
 293 void *
 294 zio_data_buf_alloc(size_t size)
 295 {
 296         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 297
 298         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 299
 300         if (zio_use_uma)
 301                 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 302         else
 303                 return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
 304 }
 305
 306 void
 307 zio_buf_free(void *buf, size_t size)
 308 {
 309         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 310
 311         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 312
 313         if (zio_use_uma)
 314                 kmem_cache_free(zio_buf_cache[c], buf);
 315         else
 316                 kmem_free(buf, size);
 317 }
 318
 319 void
 320 zio_data_buf_free(void *buf, size_t size)
 321 {
 322         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 323
 324         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 325
 326         if (zio_use_uma)
 327                 kmem_cache_free(zio_data_buf_cache[c], buf);
 328         else
 329                 kmem_free(buf, size);
 330 }
 331
 332 /*
 333  * ==========================================================================
 334  * Push and pop I/O transform buffers
 335  * ==========================================================================
 336  */
 337 static void
 338 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 339     zio_transform_func_t *transform)
 340 {
 341         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 342
 343         zt->zt_orig_data = zio->io_data;
 344         zt->zt_orig_size = zio->io_size;
 345         zt->zt_bufsize = bufsize;
 346         zt->zt_transform = transform;
 347
 348         zt->zt_next = zio->io_transform_stack;
 349         zio->io_transform_stack = zt;
 350
 351         zio->io_data = data;
 352         zio->io_size = size;
 353 }
 354
 355 static void
 356 zio_pop_transforms(zio_t *zio)
 357 {
 358         zio_transform_t *zt;
 359
 360         while ((zt = zio->io_transform_stack) != NULL) {
 361                 if (zt->zt_transform != NULL)
 362                         zt->zt_transform(zio,
 363                             zt->zt_orig_data, zt->zt_orig_size);
 364
 365                 if (zt->zt_bufsize != 0)
 366                         zio_buf_free(zio->io_data, zt->zt_bufsize);
 367
 368                 zio->io_data = zt->zt_orig_data;
 369                 zio->io_size = zt->zt_orig_size;
 370                 zio->io_transform_stack = zt->zt_next;
 371
 372                 kmem_free(zt, sizeof (zio_transform_t));
 373         }
 374 }
 375
 376 /*
 377  * ==========================================================================
 378  * I/O transform callbacks for subblocks and decompression
 379  * ==========================================================================
 380  */
 381 static void
 382 zio_subblock(zio_t *zio, void *data, uint64_t size)
 383 {
 384         ASSERT(zio->io_size > size);
 385
 386         if (zio->io_type == ZIO_TYPE_READ)
 387                 bcopy(zio->io_data, data, size);
 388 }
 389
 390 static void
 391 zio_decompress(zio_t *zio, void *data, uint64_t size)
 392 {
 393         if (zio->io_error == 0 &&
 394             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 395             zio->io_data, data, zio->io_size, size) != 0)
 396                 zio->io_error = SET_ERROR(EIO);
 397 }
 398
 399 /*
 400  * ==========================================================================
 401  * I/O parent/child relationships and pipeline interlocks
 402  * ==========================================================================
 403  */
 404 /*
 405  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 406  *        continue calling these functions until they return NULL.
 407  *        Otherwise, the next caller will pick up the list walk in
 408  *        some indeterminate state.  (Otherwise every caller would
 409  *        have to pass in a cookie to keep the state represented by
 410  *        io_walk_link, which gets annoying.)
 411  */
 412 zio_t *
 413 zio_walk_parents(zio_t *cio)
 414 {
 415         zio_link_t *zl = cio->io_walk_link;
 416         list_t *pl = &cio->io_parent_list;
 417
 418         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 419         cio->io_walk_link = zl;
 420
 421         if (zl == NULL)
 422                 return (NULL);
 423
 424         ASSERT(zl->zl_child == cio);
 425         return (zl->zl_parent);
 426 }
 427
 428 zio_t *
 429 zio_walk_children(zio_t *pio)
 430 {
 431         zio_link_t *zl = pio->io_walk_link;
 432         list_t *cl = &pio->io_child_list;
 433
 434         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 435         pio->io_walk_link = zl;
 436
 437         if (zl == NULL)
 438                 return (NULL);
 439
 440         ASSERT(zl->zl_parent == pio);
 441         return (zl->zl_child);
 442 }
 443
 444 zio_t *
 445 zio_unique_parent(zio_t *cio)
 446 {
 447         zio_t *pio = zio_walk_parents(cio);
 448
 449         VERIFY(zio_walk_parents(cio) == NULL);
 450         return (pio);
 451 }
 452
 453 void
 454 zio_add_child(zio_t *pio, zio_t *cio)
 455 {
 456         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 457
 458         /*
 459          * Logical I/Os can have logical, gang, or vdev children.
 460          * Gang I/Os can have gang or vdev children.
 461          * Vdev I/Os can only have vdev children.
 462          * The following ASSERT captures all of these constraints.
 463          */
 464         ASSERT(cio->io_child_type <= pio->io_child_type);
 465
 466         zl->zl_parent = pio;
 467         zl->zl_child = cio;
 468
 469         mutex_enter(&cio->io_lock);
 470         mutex_enter(&pio->io_lock);
 471
 472         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 473
 474         for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 475                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 476
 477         list_insert_head(&pio->io_child_list, zl);
 478         list_insert_head(&cio->io_parent_list, zl);
 479
 480         pio->io_child_count++;
 481         cio->io_parent_count++;
 482
 483         mutex_exit(&pio->io_lock);
 484         mutex_exit(&cio->io_lock);
 485 }
 486
 487 static void
 488 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 489 {
 490         ASSERT(zl->zl_parent == pio);
 491         ASSERT(zl->zl_child == cio);
 492
 493         mutex_enter(&cio->io_lock);
 494         mutex_enter(&pio->io_lock);
 495
 496         list_remove(&pio->io_child_list, zl);
 497         list_remove(&cio->io_parent_list, zl);
 498
 499         pio->io_child_count--;
 500         cio->io_parent_count--;
 501
 502         mutex_exit(&pio->io_lock);
 503         mutex_exit(&cio->io_lock);
 504
 505         kmem_cache_free(zio_link_cache, zl);
 506 }
 507
 508 static boolean_t
 509 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 510 {
 511         uint64_t *countp = &zio->io_children[child][wait];
 512         boolean_t waiting = B_FALSE;
 513
 514         mutex_enter(&zio->io_lock);
 515         ASSERT(zio->io_stall == NULL);
 516         if (*countp != 0) {
 517                 zio->io_stage >>= 1;
 518                 zio->io_stall = countp;
 519                 waiting = B_TRUE;
 520         }
 521         mutex_exit(&zio->io_lock);
 522
 523         return (waiting);
 524 }
 525
 526 static void
 527 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 528 {
 529         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 530         int *errorp = &pio->io_child_error[zio->io_child_type];
 531
 532         mutex_enter(&pio->io_lock);
 533         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 534                 *errorp = zio_worst_error(*errorp, zio->io_error);
 535         pio->io_reexecute |= zio->io_reexecute;
 536         ASSERT3U(*countp, >, 0);
 537
 538         (*countp)--;
 539
 540         if (*countp == 0 && pio->io_stall == countp) {
 541                 pio->io_stall = NULL;
 542                 mutex_exit(&pio->io_lock);
 543                 zio_execute(pio);
 544         } else {
 545                 mutex_exit(&pio->io_lock);
 546         }
 547 }
 548
 549 static void
 550 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 551 {
 552         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 553                 zio->io_error = zio->io_child_error[c];
 554 }
 555
 556 /*
 557  * ==========================================================================
 558  * Create the various types of I/O (read, write, free, etc)
 559  * ==========================================================================
 560  */
 561 static zio_t *
 562 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 563     void *data, uint64_t size, zio_done_func_t *done, void *private,
 564     zio_type_t type, zio_priority_t priority, enum zio_flag flags,
 565     vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
 566     enum zio_stage stage, enum zio_stage pipeline)
 567 {
 568         zio_t *zio;
 569
 570         ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
 571         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 572         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 573
 574         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 575         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 576         ASSERT(vd || stage == ZIO_STAGE_OPEN);
 577
 578         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 579         bzero(zio, sizeof (zio_t));
 580
 581         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 582         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 583
 584         list_create(&zio->io_parent_list, sizeof (zio_link_t),
 585             offsetof(zio_link_t, zl_parent_node));
 586         list_create(&zio->io_child_list, sizeof (zio_link_t),
 587             offsetof(zio_link_t, zl_child_node));
 588
 589         if (vd != NULL)
 590                 zio->io_child_type = ZIO_CHILD_VDEV;
 591         else if (flags & ZIO_FLAG_GANG_CHILD)
 592                 zio->io_child_type = ZIO_CHILD_GANG;
 593         else if (flags & ZIO_FLAG_DDT_CHILD)
 594                 zio->io_child_type = ZIO_CHILD_DDT;
 595         else
 596                 zio->io_child_type = ZIO_CHILD_LOGICAL;
 597
 598         if (bp != NULL) {
 599                 zio->io_bp = (blkptr_t *)bp;
 600                 zio->io_bp_copy = *bp;
 601                 zio->io_bp_orig = *bp;
 602                 if (type != ZIO_TYPE_WRITE ||
 603                     zio->io_child_type == ZIO_CHILD_DDT)
 604                         zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 605                 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 606                         zio->io_logical = zio;
 607                 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 608                         pipeline |= ZIO_GANG_STAGES;
 609         }
 610
 611         zio->io_spa = spa;
 612         zio->io_txg = txg;
 613         zio->io_done = done;
 614         zio->io_private = private;
 615         zio->io_type = type;
 616         zio->io_priority = priority;
 617         zio->io_vd = vd;
 618         zio->io_offset = offset;
 619         zio->io_orig_data = zio->io_data = data;
 620         zio->io_orig_size = zio->io_size = size;
 621         zio->io_orig_flags = zio->io_flags = flags;
 622         zio->io_orig_stage = zio->io_stage = stage;
 623         zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 624
 625         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 626         zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 627
 628         if (zb != NULL)
 629                 zio->io_bookmark = *zb;
 630
 631         if (pio != NULL) {
 632                 if (zio->io_logical == NULL)
 633                         zio->io_logical = pio->io_logical;
 634                 if (zio->io_child_type == ZIO_CHILD_GANG)
 635                         zio->io_gang_leader = pio->io_gang_leader;
 636                 zio_add_child(pio, zio);
 637         }
 638
 639         return (zio);
 640 }
 641
 642 static void
 643 zio_destroy(zio_t *zio)
 644 {
 645         list_destroy(&zio->io_parent_list);
 646         list_destroy(&zio->io_child_list);
 647         mutex_destroy(&zio->io_lock);
 648         cv_destroy(&zio->io_cv);
 649         kmem_cache_free(zio_cache, zio);
 650 }
 651
 652 zio_t *
 653 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 654     void *private, enum zio_flag flags)
 655 {
 656         zio_t *zio;
 657
 658         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 659             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 660             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 661
 662         return (zio);
 663 }
 664
 665 zio_t *
 666 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 667 {
 668         return (zio_null(NULL, spa, NULL, done, private, flags));
 669 }
 670
 671 void
 672 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 673 {
 674         if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 675                 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 676                     bp, (longlong_t)BP_GET_TYPE(bp));
 677         }
 678         if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 679             BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 680                 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 681                     bp, (longlong_t)BP_GET_CHECKSUM(bp));
 682         }
 683         if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 684             BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 685                 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 686                     bp, (longlong_t)BP_GET_COMPRESS(bp));
 687         }
 688         if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 689                 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 690                     bp, (longlong_t)BP_GET_LSIZE(bp));
 691         }
 692         if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 693                 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 694                     bp, (longlong_t)BP_GET_PSIZE(bp));
 695         }
 696
 697         if (BP_IS_EMBEDDED(bp)) {
 698                 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
 699                         zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 700                             bp, (longlong_t)BPE_GET_ETYPE(bp));
 701                 }
 702         }
 703
 704         /*
 705          * Pool-specific checks.
 706          *
 707          * Note: it would be nice to verify that the blk_birth and
 708          * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 709          * allows the birth time of log blocks (and dmu_sync()-ed blocks
 710          * that are in the log) to be arbitrarily large.
 711          */
 712         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 713                 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 714                 if (vdevid >= spa->spa_root_vdev->vdev_children) {
 715                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 716                             "VDEV %llu",
 717                             bp, i, (longlong_t)vdevid);
 718                         continue;
 719                 }
 720                 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 721                 if (vd == NULL) {
 722                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 723                             "VDEV %llu",
 724                             bp, i, (longlong_t)vdevid);
 725                         continue;
 726                 }
 727                 if (vd->vdev_ops == &vdev_hole_ops) {
 728                         zfs_panic_recover("blkptr at %p DVA %u has hole "
 729                             "VDEV %llu",
 730                             bp, i, (longlong_t)vdevid);
 731                         continue;
 732                 }
 733                 if (vd->vdev_ops == &vdev_missing_ops) {
 734                         /*
 735                          * "missing" vdevs are valid during import, but we
 736                          * don't have their detailed info (e.g. asize), so
 737                          * we can't perform any more checks on them.
 738                          */
 739                         continue;
 740                 }
 741                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 742                 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 743                 if (BP_IS_GANG(bp))
 744                         asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 745                 if (offset + asize > vd->vdev_asize) {
 746                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 747                             "OFFSET %llu",
 748                             bp, i, (longlong_t)offset);
 749                 }
 750         }
 751 }
 752
 753 zio_t *
 754 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 755     void *data, uint64_t size, zio_done_func_t *done, void *private,
 756     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 757 {
 758         zio_t *zio;
 759
 760         zfs_blkptr_verify(spa, bp);
 761
 762         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 763             data, size, done, private,
 764             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 765             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 766             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 767
 768         return (zio);
 769 }
 770
 771 zio_t *
 772 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 773     void *data, uint64_t size, const zio_prop_t *zp,
 774     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 775     void *private,
 776     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 777 {
 778         zio_t *zio;
 779
 780         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 781             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 782             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 783             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 784             DMU_OT_IS_VALID(zp->zp_type) &&
 785             zp->zp_level < 32 &&
 786             zp->zp_copies > 0 &&
 787             zp->zp_copies <= spa_max_replication(spa));
 788
 789         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 790             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 791             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 792             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 793
 794         zio->io_ready = ready;
 795         zio->io_physdone = physdone;
 796         zio->io_prop = *zp;
 797
 798         /*
 799          * Data can be NULL if we are going to call zio_write_override() to
 800          * provide the already-allocated BP.  But we may need the data to
 801          * verify a dedup hit (if requested).  In this case, don't try to
 802          * dedup (just take the already-allocated BP verbatim).
 803          */
 804         if (data == NULL && zio->io_prop.zp_dedup_verify) {
 805                 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 806         }
 807
 808         return (zio);
 809 }
 810
 811 zio_t *
 812 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 813     uint64_t size, zio_done_func_t *done, void *private,
 814     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 815 {
 816         zio_t *zio;
 817
 818         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 819             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 820             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 821
 822         return (zio);
 823 }
 824
 825 void
 826 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 827 {
 828         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 829         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 830         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 831         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 832
 833         /*
 834          * We must reset the io_prop to match the values that existed
 835          * when the bp was first written by dmu_sync() keeping in mind
 836          * that nopwrite and dedup are mutually exclusive.
 837          */
 838         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 839         zio->io_prop.zp_nopwrite = nopwrite;
 840         zio->io_prop.zp_copies = copies;
 841         zio->io_bp_override = bp;
 842 }
 843
 844 void
 845 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 846 {
 847
 848         /*
 849          * The check for EMBEDDED is a performance optimization.  We
 850          * process the free here (by ignoring it) rather than
 851          * putting it on the list and then processing it in zio_free_sync().
 852          */
 853         if (BP_IS_EMBEDDED(bp))
 854                 return;
 855         metaslab_check_free(spa, bp);
 856
 857         /*
 858          * Frees that are for the currently-syncing txg, are not going to be
 859          * deferred, and which will not need to do a read (i.e. not GANG or
 860          * DEDUP), can be processed immediately.  Otherwise, put them on the
 861          * in-memory list for later processing.
 862          */
 863         if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 864             txg != spa->spa_syncing_txg ||
 865             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 866                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 867         } else {
 868                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
 869                     BP_GET_PSIZE(bp), 0)));
 870         }
 871 }
 872
 873 zio_t *
 874 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 875     uint64_t size, enum zio_flag flags)
 876 {
 877         zio_t *zio;
 878         enum zio_stage stage = ZIO_FREE_PIPELINE;
 879
 880         ASSERT(!BP_IS_HOLE(bp));
 881         ASSERT(spa_syncing_txg(spa) == txg);
 882         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 883
 884         if (BP_IS_EMBEDDED(bp))
 885                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 886
 887         metaslab_check_free(spa, bp);
 888         arc_freed(spa, bp);
 889
 890         if (zfs_trim_enabled)
 891                 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
 892                     ZIO_STAGE_VDEV_IO_ASSESS;
 893         /*
 894          * GANG and DEDUP blocks can induce a read (for the gang block header,
 895          * or the DDT), so issue them asynchronously so that this thread is
 896          * not tied up.
 897          */
 898         else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 899                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 900
 901         flags |= ZIO_FLAG_DONT_QUEUE;
 902
 903         zio = zio_create(pio, spa, txg, bp, NULL, size,
 904             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 905             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 906
 907         return (zio);
 908 }
 909
 910 zio_t *
 911 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 912     zio_done_func_t *done, void *private, enum zio_flag flags)
 913 {
 914         zio_t *zio;
 915
 916         dprintf_bp(bp, "claiming in txg %llu", txg);
 917
 918         if (BP_IS_EMBEDDED(bp))
 919                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 920
 921         /*
 922          * A claim is an allocation of a specific block.  Claims are needed
 923          * to support immediate writes in the intent log.  The issue is that
 924          * immediate writes contain committed data, but in a txg that was
 925          * *not* committed.  Upon opening the pool after an unclean shutdown,
 926          * the intent log claims all blocks that contain immediate write data
 927          * so that the SPA knows they're in use.
 928          *
 929          * All claims *must* be resolved in the first txg -- before the SPA
 930          * starts allocating blocks -- so that nothing is allocated twice.
 931          * If txg == 0 we just verify that the block is claimable.
 932          */
 933         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 934         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 935         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 936
 937         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 938             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 939             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 940
 941         return (zio);
 942 }
 943
 944 zio_t *
 945 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
 946     uint64_t size, zio_done_func_t *done, void *private,
 947     zio_priority_t priority, enum zio_flag flags)
 948 {
 949         zio_t *zio;
 950         int c;
 951
 952         if (vd->vdev_children == 0) {
 953                 zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
 954                     ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
 955                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 956
 957                 zio->io_cmd = cmd;
 958         } else {
 959                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 960
 961                 for (c = 0; c < vd->vdev_children; c++)
 962                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 963                             offset, size, done, private, priority, flags));
 964         }
 965
 966         return (zio);
 967 }
 968
 969 zio_t *
 970 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 971     void *data, int checksum, zio_done_func_t *done, void *private,
 972     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 973 {
 974         zio_t *zio;
 975
 976         ASSERT(vd->vdev_children == 0);
 977         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 978             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 979         ASSERT3U(offset + size, <=, vd->vdev_psize);
 980
 981         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 982             ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 983             NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 984
 985         zio->io_prop.zp_checksum = checksum;
 986
 987         return (zio);
 988 }
 989
 990 zio_t *
 991 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 992     void *data, int checksum, zio_done_func_t *done, void *private,
 993     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 994 {
 995         zio_t *zio;
 996
 997         ASSERT(vd->vdev_children == 0);
 998         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 999             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
1000         ASSERT3U(offset + size, <=, vd->vdev_psize);
1001
1002         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
1003             ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
1004             NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
1005
1006         zio->io_prop.zp_checksum = checksum;
1007
1008         if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
1009                 /*
1010                  * zec checksums are necessarily destructive -- they modify
1011                  * the end of the write buffer to hold the verifier/checksum.
1012                  * Therefore, we must make a local copy in case the data is
1013                  * being written to multiple places in parallel.
1014                  */
1015                 void *wbuf = zio_buf_alloc(size);
1016                 bcopy(data, wbuf, size);
1017                 zio_push_transform(zio, wbuf, size, size, NULL);
1018         }
1019
1020         return (zio);
1021 }
1022
1023 /*
1024  * Create a child I/O to do some work for us.
1025  */
1026 zio_t *
1027 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
1028         void *data, uint64_t size, int type, zio_priority_t priority,
1029         enum zio_flag flags, zio_done_func_t *done, void *private)
1030 {
1031         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
1032         zio_t *zio;
1033
1034         ASSERT(vd->vdev_parent ==
1035             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
1036
1037         if (type == ZIO_TYPE_READ && bp != NULL) {
1038                 /*
1039                  * If we have the bp, then the child should perform the
1040                  * checksum and the parent need not.  This pushes error
1041                  * detection as close to the leaves as possible and
1042                  * eliminates redundant checksums in the interior nodes.
1043                  */
1044                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
1045                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
1046         }
1047
1048         /* Not all IO types require vdev io done stage e.g. free */
1049         if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
1050                 pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
1051
1052         if (vd->vdev_children == 0)
1053                 offset += VDEV_LABEL_START_SIZE;
1054
1055         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
1056
1057         /*
1058          * If we've decided to do a repair, the write is not speculative --
1059          * even if the original read was.
1060          */
1061         if (flags & ZIO_FLAG_IO_REPAIR)
1062                 flags &= ~ZIO_FLAG_SPECULATIVE;
1063
1064         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
1065             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1066             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
1067
1068         zio->io_physdone = pio->io_physdone;
1069         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1070                 zio->io_logical->io_phys_children++;
1071
1072         return (zio);
1073 }
1074
1075 zio_t *
1076 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
1077     int type, zio_priority_t priority, enum zio_flag flags,
1078     zio_done_func_t *done, void *private)
1079 {
1080         zio_t *zio;
1081
1082         ASSERT(vd->vdev_ops->vdev_op_leaf);
1083
1084         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1085             data, size, done, private, type, priority,
1086             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1087             vd, offset, NULL,
1088             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1089
1090         return (zio);
1091 }
1092
1093 void
1094 zio_flush(zio_t *zio, vdev_t *vd)
1095 {
1096         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
1097             NULL, NULL, ZIO_PRIORITY_NOW,
1098             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1099 }
1100
1101 zio_t *
1102 zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
1103 {
1104
1105         ASSERT(vd->vdev_ops->vdev_op_leaf);
1106
1107         return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
1108             ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
1109             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
1110             vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
1111 }
1112
1113 void
1114 zio_shrink(zio_t *zio, uint64_t size)
1115 {
1116         ASSERT(zio->io_executor == NULL);
1117         ASSERT(zio->io_orig_size == zio->io_size);
1118         ASSERT(size <= zio->io_size);
1119
1120         /*
1121          * We don't shrink for raidz because of problems with the
1122          * reconstruction when reading back less than the block size.
1123          * Note, BP_IS_RAIDZ() assumes no compression.
1124          */
1125         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1126         if (!BP_IS_RAIDZ(zio->io_bp))
1127                 zio->io_orig_size = zio->io_size = size;
1128 }
1129
1130 /*
1131  * ==========================================================================
1132  * Prepare to read and write logical blocks
1133  * ==========================================================================
1134  */
1135
1136 static int
1137 zio_read_bp_init(zio_t *zio)
1138 {
1139         blkptr_t *bp = zio->io_bp;
1140
1141         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1142             zio->io_child_type == ZIO_CHILD_LOGICAL &&
1143             !(zio->io_flags & ZIO_FLAG_RAW)) {
1144                 uint64_t psize =
1145                     BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1146                 void *cbuf = zio_buf_alloc(psize);
1147
1148                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1149         }
1150
1151         if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1152                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1153                 decode_embedded_bp_compressed(bp, zio->io_data);
1154         } else {
1155                 ASSERT(!BP_IS_EMBEDDED(bp));
1156         }
1157
1158         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1159                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1160
1161         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1162                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1163
1164         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1165                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1166
1167         return (ZIO_PIPELINE_CONTINUE);
1168 }
1169
/*
 * Write pipeline stage that prepares zio->io_bp before any data is written.
 *
 * In order, it: waits for gang and logical children to become ready;
 * applies an override block pointer if one was supplied; optionally
 * compresses the data (possibly embedding the result in the block pointer
 * itself); chooses between rewriting the existing block and allocating a
 * new one; and finally fills in the block pointer properties and selects
 * the pipeline that the rest of the write will follow.
 *
 * Returns ZIO_PIPELINE_STOP when it must wait for children (the stage is
 * repeated later), otherwise ZIO_PIPELINE_CONTINUE.
 */
static int
zio_write_bp_init(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        zio_prop_t *zp = &zio->io_prop;
        enum zio_compress compress = zp->zp_compress;
        blkptr_t *bp = zio->io_bp;
        uint64_t lsize = zio->io_size;
        uint64_t psize = lsize;
        int pass = 1;

        /*
         * If our children haven't all reached the ready stage,
         * wait for them and then repeat this pipeline stage.
         */
        if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
            zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
                return (ZIO_PIPELINE_STOP);

        /* Only zios that satisfy IO_IS_ALLOCATING() need the setup below. */
        if (!IO_IS_ALLOCATING(zio))
                return (ZIO_PIPELINE_CONTINUE);

        ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

        /*
         * An override block pointer was supplied (zio_write_override()
         * sets io_bp_override): adopt it verbatim and cut the pipeline
         * down to the interlock stages.
         */
        if (zio->io_bp_override) {
                ASSERT(bp->blk_birth != zio->io_txg);
                ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

                *bp = *zio->io_bp_override;
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

                /* An embedded override needs no further processing. */
                if (BP_IS_EMBEDDED(bp))
                        return (ZIO_PIPELINE_CONTINUE);

                /*
                 * If we've been overridden and nopwrite is set then
                 * set the flag accordingly to indicate that a nopwrite
                 * has already occurred.
                 */
                if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
                        ASSERT(!zp->zp_dedup);
                        zio->io_flags |= ZIO_FLAG_NOPWRITE;
                        return (ZIO_PIPELINE_CONTINUE);
                }

                ASSERT(!zp->zp_nopwrite);

                /* A hole, or a write without dedup, is taken as-is. */
                if (BP_IS_HOLE(bp) || !zp->zp_dedup)
                        return (ZIO_PIPELINE_CONTINUE);

                ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
                    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);

                /*
                 * The override was written with the checksum this write
                 * is configured for: mark it dedup and add the DDT write
                 * stage to the (interlock) pipeline.
                 */
                if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
                        BP_SET_DEDUP(bp, 1);
                        zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
                        return (ZIO_PIPELINE_CONTINUE);
                }

                /*
                 * Checksum mismatch: discard the override and fall
                 * through to a normal write starting from a zeroed bp.
                 */
                zio->io_bp_override = NULL;
                BP_ZERO(bp);
        }

        if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
                /*
                 * We're rewriting an existing block, which means we're
                 * working on behalf of spa_sync().  For spa_sync() to
                 * converge, it must eventually be the case that we don't
                 * have to allocate new blocks.  But compression changes
                 * the blocksize, which forces a reallocate, and makes
                 * convergence take longer.  Therefore, after the first
                 * few passes, stop compressing to ensure convergence.
                 */
                pass = spa_sync_pass(spa);

                ASSERT(zio->io_txg == spa_syncing_txg(spa));
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
                ASSERT(!BP_GET_DEDUP(bp));

                if (pass >= zfs_sync_pass_dont_compress)
                        compress = ZIO_COMPRESS_OFF;

                /* Make sure someone doesn't change their mind on overwrites */
                ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
                    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
        }

        /*
         * Compress into a scratch buffer of the logical size; psize takes
         * whatever zio_compress_data() returns.
         */
        if (compress != ZIO_COMPRESS_OFF) {
                void *cbuf = zio_buf_alloc(lsize);
                psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
                if (psize == 0 || psize == lsize) {
                        /*
                         * A result of 0 or lsize: record the block as
                         * uncompressed and release the scratch buffer.
                         * (psize == 0 is dealt with further down.)
                         */
                        compress = ZIO_COMPRESS_OFF;
                        zio_buf_free(cbuf, lsize);
                } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
                    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
                    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
                        /*
                         * The compressed result fits in the block pointer
                         * payload: encode it into bp, free the scratch
                         * buffer and leave only the interlock stages.
                         */
                        encode_embedded_bp_compressed(bp,
                            cbuf, compress, lsize, psize);
                        BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
                        BP_SET_TYPE(bp, zio->io_prop.zp_type);
                        BP_SET_LEVEL(bp, zio->io_prop.zp_level);
                        zio_buf_free(cbuf, lsize);
                        bp->blk_birth = zio->io_txg;
                        zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
                        ASSERT(spa_feature_is_active(spa,
                            SPA_FEATURE_EMBEDDED_DATA));
                        return (ZIO_PIPELINE_CONTINUE);
                } else {
                        /*
                         * Round up compressed size up to the ashift
                         * of the smallest-ashift device, and zero the tail.
                         * This ensures that the compressed size of the BP
                         * (and thus compressratio property) are correct,
                         * in that we charge for the padding used to fill out
                         * the last sector.
                         */
                        ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
                        size_t rounded = (size_t)P2ROUNDUP(psize,
                            1ULL << spa->spa_min_ashift);
                        if (rounded >= lsize) {
                                /* Padding ate the savings: no compression. */
                                compress = ZIO_COMPRESS_OFF;
                                zio_buf_free(cbuf, lsize);
                                psize = lsize;
                        } else {
                                /*
                                 * Zero the padding and hand cbuf to
                                 * zio_push_transform() (presumably making
                                 * it the buffer that gets written).
                                 */
                                bzero((char *)cbuf + psize, rounded - psize);
                                psize = rounded;
                                zio_push_transform(zio, cbuf,
                                    psize, lsize, NULL);
                        }
                }
        }

        /*
         * The final pass of spa_sync() must be all rewrites, but the first
         * few passes offer a trade-off: allocating blocks defers convergence,
         * but newly allocated blocks are sequential, so they can be written
         * to disk faster.  Therefore, we allow the first few passes of
         * spa_sync() to allocate new blocks, but force rewrites after that.
         * There should only be a handful of blocks after pass 1 in any case.
         */
        if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
            BP_GET_PSIZE(bp) == psize &&
            pass >= zfs_sync_pass_rewrite) {
                ASSERT(psize != 0);
                /* Keep any gang stages already present in the pipeline. */
                enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
                zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
                zio->io_flags |= ZIO_FLAG_IO_REWRITE;
        } else {
                BP_ZERO(bp);
                zio->io_pipeline = ZIO_WRITE_PIPELINE;
        }

        if (psize == 0) {
                /*
                 * Nothing will be written: only the interlock stages are
                 * kept.  Size, type, level and birth txg are recorded in
                 * bp only when the original bp had a birth txg and the
                 * hole_birth feature is active.
                 */
                if (zio->io_bp_orig.blk_birth != 0 &&
                    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
                        BP_SET_LSIZE(bp, lsize);
                        BP_SET_TYPE(bp, zp->zp_type);
                        BP_SET_LEVEL(bp, zp->zp_level);
                        BP_SET_BIRTH(bp, zio->io_txg, 0);
                }
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
        } else {
                /* Fill in the block pointer properties for a real write. */
                ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
                BP_SET_LSIZE(bp, lsize);
                BP_SET_TYPE(bp, zp->zp_type);
                BP_SET_LEVEL(bp, zp->zp_level);
                BP_SET_PSIZE(bp, psize);
                BP_SET_COMPRESS(bp, compress);
                BP_SET_CHECKSUM(bp, zp->zp_checksum);
                BP_SET_DEDUP(bp, zp->zp_dedup);
                BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
                /* Dedup replaces the pipeline with the DDT write pipeline. */
                if (zp->zp_dedup) {
                        ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
                        ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
                        zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
                }
                /* nopwrite adds its stage to whichever pipeline is set. */
                if (zp->zp_nopwrite) {
                        ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
                        ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
                        zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
                }
        }

        return (ZIO_PIPELINE_CONTINUE);
}
1354
1355 static int
1356 zio_free_bp_init(zio_t *zio)
1357 {
1358         blkptr_t *bp = zio->io_bp;
1359
1360         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1361                 if (BP_GET_DEDUP(bp))
1362                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1363         }
1364
1365         return (ZIO_PIPELINE_CONTINUE);
1366 }
1367
1368 /*
1369  * ==========================================================================
1370  * Execute the I/O pipeline
1371  * ==========================================================================
1372  */
1373
1374 static void
1375 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1376 {
1377         spa_t *spa = zio->io_spa;
1378         zio_type_t t = zio->io_type;
1379         int flags = (cutinline ? TQ_FRONT : 0);
1380
1381         ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
1382
1383         /*
1384          * If we're a config writer or a probe, the normal issue and
1385          * interrupt threads may all be blocked waiting for the config lock.
1386          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1387          */
1388         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1389                 t = ZIO_TYPE_NULL;
1390
1391         /*
1392          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1393          */
1394         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1395                 t = ZIO_TYPE_NULL;
1396
1397         /*
1398          * If this is a high priority I/O, then use the high priority taskq if
1399          * available.
1400          */
1401         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1402             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1403                 q++;
1404
1405         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1406
1407         /*
1408          * NB: We are assuming that the zio can only be dispatched
1409          * to a single taskq at a time.  It would be a grievous error
1410          * to dispatch the zio to another taskq at the same time.
1411          */
1412 #if defined(illumos) || !defined(_KERNEL)
1413         ASSERT(zio->io_tqent.tqent_next == NULL);
1414 #else
1415         ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
1416 #endif
1417         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1418             flags, &zio->io_tqent);
1419 }
1420
1421 static boolean_t
1422 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1423 {
1424         kthread_t *executor = zio->io_executor;
1425         spa_t *spa = zio->io_spa;
1426
1427         for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1428                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1429                 uint_t i;
1430                 for (i = 0; i < tqs->stqs_count; i++) {
1431                         if (taskq_member(tqs->stqs_taskq[i], executor))
1432                                 return (B_TRUE);
1433                 }
1434         }
1435
1436         return (B_FALSE);
1437 }
1438
1439 static int
1440 zio_issue_async(zio_t *zio)
1441 {
1442         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1443
1444         return (ZIO_PIPELINE_STOP);
1445 }
1446
1447 void
1448 zio_interrupt(zio_t *zio)
1449 {
1450         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1451 }
1452
1453 void
1454 zio_delay_interrupt(zio_t *zio)
1455 {
1456         /*
1457          * The timeout_generic() function isn't defined in userspace, so
1458          * rather than trying to implement the function, the zio delay
1459          * functionality has been disabled for userspace builds.
1460          */
1461
1462 #ifdef _KERNEL
1463         /*
1464          * If io_target_timestamp is zero, then no delay has been registered
1465          * for this IO, thus jump to the end of this function and "skip" the
1466          * delay; issuing it directly to the zio layer.
1467          */
1468         if (zio->io_target_timestamp != 0) {
1469                 hrtime_t now = gethrtime();
1470
1471                 if (now >= zio->io_target_timestamp) {
1472                         /*
1473                          * This IO has already taken longer than the target
1474                          * delay to complete, so we don't want to delay it
1475                          * any longer; we "miss" the delay and issue it
1476                          * directly to the zio layer. This is likely due to
1477                          * the target latency being set to a value less than
1478                          * the underlying hardware can satisfy (e.g. delay
1479                          * set to 1ms, but the disks take 10ms to complete an
1480                          * IO request).
1481                          */
1482
1483                         DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
1484                             hrtime_t, now);
1485
1486                         zio_interrupt(zio);
1487                 } else {
1488                         hrtime_t diff = zio->io_target_timestamp - now;
1489
1490                         DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
1491                             hrtime_t, now, hrtime_t, diff);
1492
1493                         (void) timeout_generic(CALLOUT_NORMAL,
1494                             (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
1495                 }
1496
1497                 return;
1498         }
1499 #endif
1500
1501         DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
1502         zio_interrupt(zio);
1503 }
1504
1505 /*
1506  * Execute the I/O pipeline until one of the following occurs:
1507  *
1508  *      (1) the I/O completes
1509  *      (2) the pipeline stalls waiting for dependent child I/Os
1510  *      (3) the I/O issues, so we're waiting for an I/O completion interrupt
1511  *      (4) the I/O is delegated by vdev-level caching or aggregation
1512  *      (5) the I/O is deferred due to vdev-level queueing
1513  *      (6) the I/O is handed off to another thread.
1514  *
1515  * In all cases, the pipeline stops whenever there's no CPU work; it never
1516  * burns a thread in cv_wait().
1517  *
1518  * There's no locking on io_stage because there's no legitimate way
1519  * for multiple threads to be attempting to process the same I/O.
1520  */
1521 static zio_pipe_stage_t *zio_pipeline[];
1522
1523 void
1524 zio_execute(zio_t *zio)
1525 {
1526         zio->io_executor = curthread;
1527
1528         while (zio->io_stage < ZIO_STAGE_DONE) {
1529                 enum zio_stage pipeline = zio->io_pipeline;
1530                 enum zio_stage stage = zio->io_stage;
1531                 int rv;
1532
1533                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1534                 ASSERT(ISP2(stage));
1535                 ASSERT(zio->io_stall == NULL);
1536
1537                 do {
1538                         stage <<= 1;
1539                 } while ((stage & pipeline) == 0);
1540
1541                 ASSERT(stage <= ZIO_STAGE_DONE);
1542
1543                 /*
1544                  * If we are in interrupt context and this pipeline stage
1545                  * will grab a config lock that is held across I/O,
1546                  * or may wait for an I/O that needs an interrupt thread
1547                  * to complete, issue async to avoid deadlock.
1548                  *
1549                  * For VDEV_IO_START, we cut in line so that the io will
1550                  * be sent to disk promptly.
1551                  */
1552                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1553                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1554                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1555                             zio_requeue_io_start_cut_in_line : B_FALSE;
1556                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1557                         return;
1558                 }
1559
1560                 zio->io_stage = stage;
1561                 rv = zio_pipeline[highbit64(stage) - 1](zio);
1562
1563                 if (rv == ZIO_PIPELINE_STOP)
1564                         return;
1565
1566                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1567         }
1568 }
1569
1570 /*
1571  * ==========================================================================
1572  * Initiate I/O, either sync or async
1573  * ==========================================================================
1574  */
1575 int
1576 zio_wait(zio_t *zio)
1577 {
1578         int error;
1579
1580         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1581         ASSERT(zio->io_executor == NULL);
1582
1583         zio->io_waiter = curthread;
1584
1585         zio_execute(zio);
1586
1587         mutex_enter(&zio->io_lock);
1588         while (zio->io_executor != NULL)
1589                 cv_wait(&zio->io_cv, &zio->io_lock);
1590         mutex_exit(&zio->io_lock);
1591
1592         error = zio->io_error;
1593         zio_destroy(zio);
1594
1595         return (error);
1596 }
1597
1598 void
1599 zio_nowait(zio_t *zio)
1600 {
1601         ASSERT(zio->io_executor == NULL);
1602
1603         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1604             zio_unique_parent(zio) == NULL) {
1605                 /*
1606                  * This is a logical async I/O with no parent to wait for it.
1607                  * We add it to the spa_async_root_zio "Godfather" I/O which
1608                  * will ensure they complete prior to unloading the pool.
1609                  */
1610                 spa_t *spa = zio->io_spa;
1611
1612                 zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
1613         }
1614
1615         zio_execute(zio);
1616 }
1617
1618 /*
1619  * ==========================================================================
1620  * Reexecute or suspend/resume failed I/O
1621  * ==========================================================================
1622  */
1623
1624 static void
1625 zio_reexecute(zio_t *pio)
1626 {
1627         zio_t *cio, *cio_next;
1628
1629         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1630         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1631         ASSERT(pio->io_gang_leader == NULL);
1632         ASSERT(pio->io_gang_tree == NULL);
1633
1634         pio->io_flags = pio->io_orig_flags;
1635         pio->io_stage = pio->io_orig_stage;
1636         pio->io_pipeline = pio->io_orig_pipeline;
1637         pio->io_reexecute = 0;
1638         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1639         pio->io_error = 0;
1640         for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1641                 pio->io_state[w] = 0;
1642         for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1643                 pio->io_child_error[c] = 0;
1644
1645         if (IO_IS_ALLOCATING(pio))
1646                 BP_ZERO(pio->io_bp);
1647
1648         /*
1649          * As we reexecute pio's children, new children could be created.
1650          * New children go to the head of pio's io_child_list, however,
1651          * so we will (correctly) not reexecute them.  The key is that
1652          * the remainder of pio's io_child_list, from 'cio_next' onward,
1653          * cannot be affected by any side effects of reexecuting 'cio'.
1654          */
1655         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1656                 cio_next = zio_walk_children(pio);
1657                 mutex_enter(&pio->io_lock);
1658                 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1659                         pio->io_children[cio->io_child_type][w]++;
1660                 mutex_exit(&pio->io_lock);
1661                 zio_reexecute(cio);
1662         }
1663
1664         /*
1665          * Now that all children have been reexecuted, execute the parent.
1666          * We don't reexecute "The Godfather" I/O here as it's the
1667          * responsibility of the caller to wait on him.
1668          */
1669         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1670                 zio_execute(pio);
1671 }
1672
1673 void
1674 zio_suspend(spa_t *spa, zio_t *zio)
1675 {
1676         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1677                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1678                     "failure and the failure mode property for this pool "
1679                     "is set to panic.", spa_name(spa));
1680
1681         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1682
1683         mutex_enter(&spa->spa_suspend_lock);
1684
1685         if (spa->spa_suspend_zio_root == NULL)
1686                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1687                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1688                     ZIO_FLAG_GODFATHER);
1689
1690         spa->spa_suspended = B_TRUE;
1691
1692         if (zio != NULL) {
1693                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1694                 ASSERT(zio != spa->spa_suspend_zio_root);
1695                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1696                 ASSERT(zio_unique_parent(zio) == NULL);
1697                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1698                 zio_add_child(spa->spa_suspend_zio_root, zio);
1699         }
1700
1701         mutex_exit(&spa->spa_suspend_lock);
1702 }
1703
1704 int
1705 zio_resume(spa_t *spa)
1706 {
1707         zio_t *pio;
1708
1709         /*
1710          * Reexecute all previously suspended i/o.
1711          */
1712         mutex_enter(&spa->spa_suspend_lock);
1713         spa->spa_suspended = B_FALSE;
1714         cv_broadcast(&spa->spa_suspend_cv);
1715         pio = spa->spa_suspend_zio_root;
1716         spa->spa_suspend_zio_root = NULL;
1717         mutex_exit(&spa->spa_suspend_lock);
1718
1719         if (pio == NULL)
1720                 return (0);
1721
1722         zio_reexecute(pio);
1723         return (zio_wait(pio));
1724 }
1725
1726 void
1727 zio_resume_wait(spa_t *spa)
1728 {
1729         mutex_enter(&spa->spa_suspend_lock);
1730         while (spa_suspended(spa))
1731                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1732         mutex_exit(&spa->spa_suspend_lock);
1733 }
1734
1735 /*
1736  * ==========================================================================
1737  * Gang blocks.
1738  *
1739  * A gang block is a collection of small blocks that looks to the DMU
1740  * like one large block.  When zio_dva_allocate() cannot find a block
1741  * of the requested size, due to either severe fragmentation or the pool
1742  * being nearly full, it calls zio_write_gang_block() to construct the
1743  * block from smaller fragments.
1744  *
1745  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1746  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1747  * an indirect block: it's an array of block pointers.  It consumes
1748  * only one sector and hence is allocatable regardless of fragmentation.
1749  * The gang header's bps point to its gang members, which hold the data.
1750  *
1751  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1752  * as the verifier to ensure uniqueness of the SHA256 checksum.
1753  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1754  * not the gang header.  This ensures that data block signatures (needed for
1755  * deduplication) are independent of how the block is physically stored.
1756  *
1757  * Gang blocks can be nested: a gang member may itself be a gang block.
1758  * Thus every gang block is a tree in which root and all interior nodes are
1759  * gang headers, and the leaves are normal blocks that contain user data.
1760  * The root of the gang tree is called the gang leader.
1761  *
1762  * To perform any operation (read, rewrite, free, claim) on a gang block,
1763  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1764  * in the io_gang_tree field of the original logical i/o by recursively
1765  * reading the gang leader and all gang headers below it.  This yields
1766  * an in-core tree containing the contents of every gang header and the
1767  * bps for every constituent of the gang block.
1768  *
1769  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1770  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1771  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1772  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1773  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1774  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1775  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1776  * of the gang header plus zio_checksum_compute() of the data to update the
1777  * gang header's blk_cksum as described above.
1778  *
1779  * The two-phase assemble/issue model solves the problem of partial failure --
1780  * what if you'd freed part of a gang block but then couldn't read the
1781  * gang header for another part?  Assembling the entire gang tree first
1782  * ensures that all the necessary gang header I/O has succeeded before
1783  * starting the actual work of free, claim, or write.  Once the gang tree
1784  * is assembled, free and claim are in-memory operations that cannot fail.
1785  *
1786  * In the event that a gang write fails, zio_dva_unallocate() walks the
1787  * gang tree to immediately free (i.e. insert back into the space map)
1788  * everything we've allocated.  This ensures that we don't get ENOSPC
1789  * errors during repeated suspend/resume cycles due to a flaky device.
1790  *
1791  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1792  * the gang tree, we won't modify the block, so we can safely defer the free
1793  * (knowing that the block is still intact).  If we *can* assemble the gang
1794  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1795  * each constituent bp and we can allocate a new block on the next sync pass.
1796  *
1797  * In all cases, the gang tree allows complete recovery from partial failure.
1798  * ==========================================================================
1799  */
1800
1801 static zio_t *
1802 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1803 {
1804         if (gn != NULL)
1805                 return (pio);
1806
1807         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1808             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1809             &pio->io_bookmark));
1810 }
1811
1812 zio_t *
1813 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1814 {
1815         zio_t *zio;
1816
1817         if (gn != NULL) {
1818                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1819                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1820                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1821                 /*
1822                  * As we rewrite each gang header, the pipeline will compute
1823                  * a new gang block header checksum for it; but no one will
1824                  * compute a new data checksum, so we do that here.  The one
1825                  * exception is the gang leader: the pipeline already computed
1826                  * its data checksum because that stage precedes gang assembly.
1827                  * (Presently, nothing actually uses interior data checksums;
1828                  * this is just good hygiene.)
1829                  */
1830                 if (gn != pio->io_gang_leader->io_gang_tree) {
1831                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1832                             data, BP_GET_PSIZE(bp));
1833                 }
1834                 /*
1835                  * If we are here to damage data for testing purposes,
1836                  * leave the GBH alone so that we can detect the damage.
1837                  */
1838                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1839                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1840         } else {
1841                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1842                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1843                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1844         }
1845
1846         return (zio);
1847 }
1848
1849 /* ARGSUSED */
1850 zio_t *
1851 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1852 {
1853         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1854             BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
1855             ZIO_GANG_CHILD_FLAGS(pio)));
1856 }
1857
1858 /* ARGSUSED */
1859 zio_t *
1860 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1861 {
1862         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1863             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1864 }
1865
/*
 * Gang callbacks, one slot per I/O type.  zio_gang_tree_issue() indexes
 * this table with the gang leader's io_type and calls the entry
 * unconditionally, so a NULL slot must never be selected there.
 * NOTE(review): the slot order presumably follows the zio type enum
 * (null, read, write, free, claim, ioctl) -- confirm against its
 * definition before reordering.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
        NULL,
        zio_read_gang,
        zio_rewrite_gang,
        zio_free_gang,
        zio_claim_gang,
        NULL
};

/* Forward declaration: zio_gang_tree_assemble() uses it as a callback. */
static void zio_gang_tree_assemble_done(zio_t *zio);
1876
1877 static zio_gang_node_t *
1878 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1879 {
1880         zio_gang_node_t *gn;
1881
1882         ASSERT(*gnpp == NULL);
1883
1884         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1885         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1886         *gnpp = gn;
1887
1888         return (gn);
1889 }
1890
1891 static void
1892 zio_gang_node_free(zio_gang_node_t **gnpp)
1893 {
1894         zio_gang_node_t *gn = *gnpp;
1895
1896         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1897                 ASSERT(gn->gn_child[g] == NULL);
1898
1899         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1900         kmem_free(gn, sizeof (*gn));
1901         *gnpp = NULL;
1902 }
1903
1904 static void
1905 zio_gang_tree_free(zio_gang_node_t **gnpp)
1906 {
1907         zio_gang_node_t *gn = *gnpp;
1908
1909         if (gn == NULL)
1910                 return;
1911
1912         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1913                 zio_gang_tree_free(&gn->gn_child[g]);
1914
1915         zio_gang_node_free(gnpp);
1916 }
1917
1918 static void
1919 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1920 {
1921         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1922
1923         ASSERT(gio->io_gang_leader == gio);
1924         ASSERT(BP_IS_GANG(bp));
1925
1926         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1927             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1928             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1929 }
1930
1931 static void
1932 zio_gang_tree_assemble_done(zio_t *zio)
1933 {
1934         zio_t *gio = zio->io_gang_leader;
1935         zio_gang_node_t *gn = zio->io_private;
1936         blkptr_t *bp = zio->io_bp;
1937
1938         ASSERT(gio == zio_unique_parent(zio));
1939         ASSERT(zio->io_child_count == 0);
1940
1941         if (zio->io_error)
1942                 return;
1943
1944         if (BP_SHOULD_BYTESWAP(bp))
1945                 byteswap_uint64_array(zio->io_data, zio->io_size);
1946
1947         ASSERT(zio->io_data == gn->gn_gbh);
1948         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1949         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1950
1951         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1952                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1953                 if (!BP_IS_GANG(gbp))
1954                         continue;
1955                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1956         }
1957 }
1958
1959 static void
1960 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1961 {
1962         zio_t *gio = pio->io_gang_leader;
1963         zio_t *zio;
1964
1965         ASSERT(BP_IS_GANG(bp) == !!gn);
1966         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1967         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1968
1969         /*
1970          * If you're a gang header, your data is in gn->gn_gbh.
1971          * If you're a gang member, your data is in 'data' and gn == NULL.
1972          */
1973         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1974
1975         if (gn != NULL) {
1976                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1977
1978                 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1979                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1980                         if (BP_IS_HOLE(gbp))
1981                                 continue;
1982                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1983                         data = (char *)data + BP_GET_PSIZE(gbp);
1984                 }
1985         }
1986
1987         if (gn == gio->io_gang_tree && gio->io_data != NULL)
1988                 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1989
1990         if (zio != pio)
1991                 zio_nowait(zio);
1992 }
1993
1994 static int
1995 zio_gang_assemble(zio_t *zio)
1996 {
1997         blkptr_t *bp = zio->io_bp;
1998
1999         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
2000         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2001
2002         zio->io_gang_leader = zio;
2003
2004         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
2005
2006         return (ZIO_PIPELINE_CONTINUE);
2007 }
2008
2009 static int
2010 zio_gang_issue(zio_t *zio)
2011 {
2012         blkptr_t *bp = zio->io_bp;
2013
2014         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
2015                 return (ZIO_PIPELINE_STOP);
2016
2017         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
2018         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2019
2020         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
2021                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
2022         else
2023                 zio_gang_tree_free(&zio->io_gang_tree);
2024
2025         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2026
2027         return (ZIO_PIPELINE_CONTINUE);
2028 }
2029
2030 static void
2031 zio_write_gang_member_ready(zio_t *zio)
2032 {
2033         zio_t *pio = zio_unique_parent(zio);
2034         zio_t *gio = zio->io_gang_leader;
2035         dva_t *cdva = zio->io_bp->blk_dva;
2036         dva_t *pdva = pio->io_bp->blk_dva;
2037         uint64_t asize;
2038
2039         if (BP_IS_HOLE(zio->io_bp))
2040                 return;
2041
2042         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
2043
2044         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
2045         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
2046         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
2047         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
2048         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
2049
2050         mutex_enter(&pio->io_lock);
2051         for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
2052                 ASSERT(DVA_GET_GANG(&pdva[d]));
2053                 asize = DVA_GET_ASIZE(&pdva[d]);
2054                 asize += DVA_GET_ASIZE(&cdva[d]);
2055                 DVA_SET_ASIZE(&pdva[d], asize);
2056         }
2057         mutex_exit(&pio->io_lock);
2058 }
2059
2060 static int
2061 zio_write_gang_block(zio_t *pio)
2062 {
2063         spa_t *spa = pio->io_spa;
2064         blkptr_t *bp = pio->io_bp;
2065         zio_t *gio = pio->io_gang_leader;
2066         zio_t *zio;
2067         zio_gang_node_t *gn, **gnpp;
2068         zio_gbh_phys_t *gbh;
2069         uint64_t txg = pio->io_txg;
2070         uint64_t resid = pio->io_size;
2071         uint64_t lsize;
2072         int copies = gio->io_prop.zp_copies;
2073         int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
2074         zio_prop_t zp;
2075         int error;
2076
2077         error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
2078             bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
2079             METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
2080         if (error) {
2081                 pio->io_error = error;
2082                 return (ZIO_PIPELINE_CONTINUE);
2083         }
2084
2085         if (pio == gio) {
2086                 gnpp = &gio->io_gang_tree;
2087         } else {
2088                 gnpp = pio->io_private;
2089                 ASSERT(pio->io_ready == zio_write_gang_member_ready);
2090         }
2091
2092         gn = zio_gang_node_alloc(gnpp);
2093         gbh = gn->gn_gbh;
2094         bzero(gbh, SPA_GANGBLOCKSIZE);
2095
2096         /*
2097          * Create the gang header.
2098          */
2099         zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
2100             pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2101
2102         /*
2103          * Create and nowait the gang children.
2104          */
2105         for (int g = 0; resid != 0; resid -= lsize, g++) {
2106                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2107                     SPA_MINBLOCKSIZE);
2108                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2109
2110                 zp.zp_checksum = gio->io_prop.zp_checksum;
2111                 zp.zp_compress = ZIO_COMPRESS_OFF;
2112                 zp.zp_type = DMU_OT_NONE;
2113                 zp.zp_level = 0;
2114                 zp.zp_copies = gio->io_prop.zp_copies;
2115                 zp.zp_dedup = B_FALSE;
2116                 zp.zp_dedup_verify = B_FALSE;
2117                 zp.zp_nopwrite = B_FALSE;
2118
2119                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2120                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
2121                     zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
2122                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2123                     &pio->io_bookmark));
2124         }
2125
2126         /*
2127          * Set pio's pipeline to just wait for zio to finish.
2128          */
2129         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2130
2131         zio_nowait(zio);
2132
2133         return (ZIO_PIPELINE_CONTINUE);
2134 }
2135
2136 /*
2137  * The zio_nop_write stage in the pipeline determines if allocating a
2138  * new bp is necessary.  The nopwrite feature can handle writes in
2139  * either syncing or open context (i.e. zil writes) and as a result is
2140  * mutually exclusive with dedup.
2141  *
2142  * By leveraging a cryptographically secure checksum, such as SHA256, we
2143  * can compare the checksums of the new data and the old to determine if
2144  * allocating a new block is required.  Note that our requirements for
2145  * cryptographic strength are fairly weak: there can't be any accidental
2146  * hash collisions, but we don't need to be secure against intentional
2147  * (malicious) collisions.  To trigger a nopwrite, you have to be able
2148  * to write the file to begin with, and triggering an incorrect (hash
2149  * collision) nopwrite is no worse than simply writing to the file.
2150  * That said, there are no known attacks against the checksum algorithms
2151  * used for nopwrite, assuming that the salt and the checksums
2152  * themselves remain secret.
2153  */
2154 static int
2155 zio_nop_write(zio_t *zio)
2156 {
2157         blkptr_t *bp = zio->io_bp;
2158         blkptr_t *bp_orig = &zio->io_bp_orig;
2159         zio_prop_t *zp = &zio->io_prop;
2160
2161         ASSERT(BP_GET_LEVEL(bp) == 0);
2162         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2163         ASSERT(zp->zp_nopwrite);
2164         ASSERT(!zp->zp_dedup);
2165         ASSERT(zio->io_bp_override == NULL);
2166         ASSERT(IO_IS_ALLOCATING(zio));
2167
2168         /*
2169          * Check to see if the original bp and the new bp have matching
2170          * characteristics (i.e. same checksum, compression algorithms, etc).
2171          * If they don't then just continue with the pipeline which will
2172          * allocate a new bp.
2173          */
2174         if (BP_IS_HOLE(bp_orig) ||
2175             !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
2176             ZCHECKSUM_FLAG_NOPWRITE) ||
2177             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2178             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2179             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2180             zp->zp_copies != BP_GET_NDVAS(bp_orig))
2181                 return (ZIO_PIPELINE_CONTINUE);
2182
2183         /*
2184          * If the checksums match then reset the pipeline so that we
2185          * avoid allocating a new bp and issuing any I/O.
2186          */
2187         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2188                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
2189                     ZCHECKSUM_FLAG_NOPWRITE);
2190                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2191                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2192                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2193                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2194                     sizeof (uint64_t)) == 0);
2195
2196                 *bp = *bp_orig;
2197                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2198                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2199         }
2200
2201         return (ZIO_PIPELINE_CONTINUE);
2202 }
2203
2204 /*
2205  * ==========================================================================
2206  * Dedup
2207  * ==========================================================================
2208  */
2209 static void
2210 zio_ddt_child_read_done(zio_t *zio)
2211 {
2212         blkptr_t *bp = zio->io_bp;
2213         ddt_entry_t *dde = zio->io_private;
2214         ddt_phys_t *ddp;
2215         zio_t *pio = zio_unique_parent(zio);
2216
2217         mutex_enter(&pio->io_lock);
2218         ddp = ddt_phys_select(dde, bp);
2219         if (zio->io_error == 0)
2220                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2221         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2222                 dde->dde_repair_data = zio->io_data;
2223         else
2224                 zio_buf_free(zio->io_data, zio->io_size);
2225         mutex_exit(&pio->io_lock);
2226 }
2227
2228 static int
2229 zio_ddt_read_start(zio_t *zio)
2230 {
2231         blkptr_t *bp = zio->io_bp;
2232
2233         ASSERT(BP_GET_DEDUP(bp));
2234         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2235         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2236
2237         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2238                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2239                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2240                 ddt_phys_t *ddp = dde->dde_phys;
2241                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2242                 blkptr_t blk;
2243
2244                 ASSERT(zio->io_vsd == NULL);
2245                 zio->io_vsd = dde;
2246
2247                 if (ddp_self == NULL)
2248                         return (ZIO_PIPELINE_CONTINUE);
2249
2250                 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2251                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2252                                 continue;
2253                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2254                             &blk);
2255                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
2256                             zio_buf_alloc(zio->io_size), zio->io_size,
2257                             zio_ddt_child_read_done, dde, zio->io_priority,
2258                             ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2259                             &zio->io_bookmark));
2260                 }
2261                 return (ZIO_PIPELINE_CONTINUE);
2262         }
2263
2264         zio_nowait(zio_read(zio, zio->io_spa, bp,
2265             zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2266             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2267
2268         return (ZIO_PIPELINE_CONTINUE);
2269 }
2270
2271 static int
2272 zio_ddt_read_done(zio_t *zio)
2273 {
2274         blkptr_t *bp = zio->io_bp;
2275
2276         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2277                 return (ZIO_PIPELINE_STOP);
2278
2279         ASSERT(BP_GET_DEDUP(bp));
2280         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2281         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2282
2283         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2284                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2285                 ddt_entry_t *dde = zio->io_vsd;
2286                 if (ddt == NULL) {
2287                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2288                         return (ZIO_PIPELINE_CONTINUE);
2289                 }
2290                 if (dde == NULL) {
2291                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2292                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2293                         return (ZIO_PIPELINE_STOP);
2294                 }
2295                 if (dde->dde_repair_data != NULL) {
2296                         bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2297                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
2298                 }
2299                 ddt_repair_done(ddt, dde);
2300                 zio->io_vsd = NULL;
2301         }
2302
2303         ASSERT(zio->io_vsd == NULL);
2304
2305         return (ZIO_PIPELINE_CONTINUE);
2306 }
2307
2308 static boolean_t
2309 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2310 {
2311         spa_t *spa = zio->io_spa;
2312
2313         /*
2314          * Note: we compare the original data, not the transformed data,
2315          * because when zio->io_bp is an override bp, we will not have
2316          * pushed the I/O transforms.  That's an important optimization
2317          * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2318          */
2319         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2320                 zio_t *lio = dde->dde_lead_zio[p];
2321
2322                 if (lio != NULL) {
2323                         return (lio->io_orig_size != zio->io_orig_size ||
2324                             bcmp(zio->io_orig_data, lio->io_orig_data,
2325                             zio->io_orig_size) != 0);
2326                 }
2327         }
2328
2329         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2330                 ddt_phys_t *ddp = &dde->dde_phys[p];
2331
2332                 if (ddp->ddp_phys_birth != 0) {
2333                         arc_buf_t *abuf = NULL;
2334                         arc_flags_t aflags = ARC_FLAG_WAIT;
2335                         blkptr_t blk = *zio->io_bp;
2336                         int error;
2337
2338                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2339
2340                         ddt_exit(ddt);
2341
2342                         error = arc_read(NULL, spa, &blk,
2343                             arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2344                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2345                             &aflags, &zio->io_bookmark);
2346
2347                         if (error == 0) {
2348                                 if (arc_buf_size(abuf) != zio->io_orig_size ||
2349                                     bcmp(abuf->b_data, zio->io_orig_data,
2350                                     zio->io_orig_size) != 0)
2351                                         error = SET_ERROR(EEXIST);
2352                                 VERIFY(arc_buf_remove_ref(abuf, &abuf));
2353                         }
2354
2355                         ddt_enter(ddt);
2356                         return (error != 0);
2357                 }
2358         }
2359
2360         return (B_FALSE);
2361 }
2362
2363 static void
2364 zio_ddt_child_write_ready(zio_t *zio)
2365 {
2366         int p = zio->io_prop.zp_copies;
2367         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2368         ddt_entry_t *dde = zio->io_private;
2369         ddt_phys_t *ddp = &dde->dde_phys[p];
2370         zio_t *pio;
2371
2372         if (zio->io_error)
2373                 return;
2374
2375         ddt_enter(ddt);
2376
2377         ASSERT(dde->dde_lead_zio[p] == zio);
2378
2379         ddt_phys_fill(ddp, zio->io_bp);
2380
2381         while ((pio = zio_walk_parents(zio)) != NULL)
2382                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2383
2384         ddt_exit(ddt);
2385 }
2386
2387 static void
2388 zio_ddt_child_write_done(zio_t *zio)
2389 {
2390         int p = zio->io_prop.zp_copies;
2391         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2392         ddt_entry_t *dde = zio->io_private;
2393         ddt_phys_t *ddp = &dde->dde_phys[p];
2394
2395         ddt_enter(ddt);
2396
2397         ASSERT(ddp->ddp_refcnt == 0);
2398         ASSERT(dde->dde_lead_zio[p] == zio);
2399         dde->dde_lead_zio[p] = NULL;
2400
2401         if (zio->io_error == 0) {
2402                 while (zio_walk_parents(zio) != NULL)
2403                         ddt_phys_addref(ddp);
2404         } else {
2405                 ddt_phys_clear(ddp);
2406         }
2407
2408         ddt_exit(ddt);
2409 }
2410
2411 static void
2412 zio_ddt_ditto_write_done(zio_t *zio)
2413 {
2414         int p = DDT_PHYS_DITTO;
2415         zio_prop_t *zp = &zio->io_prop;
2416         blkptr_t *bp = zio->io_bp;
2417         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2418         ddt_entry_t *dde = zio->io_private;
2419         ddt_phys_t *ddp = &dde->dde_phys[p];
2420         ddt_key_t *ddk = &dde->dde_key;
2421
2422         ddt_enter(ddt);
2423
2424         ASSERT(ddp->ddp_refcnt == 0);
2425         ASSERT(dde->dde_lead_zio[p] == zio);
2426         dde->dde_lead_zio[p] = NULL;
2427
2428         if (zio->io_error == 0) {
2429                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2430                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2431                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2432                 if (ddp->ddp_phys_birth != 0)
2433                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2434                 ddt_phys_fill(ddp, bp);
2435         }
2436
2437         ddt_exit(ddt);
2438 }
2439
/*
 * Pipeline stage for a dedup write.
 *
 * Looks up the dedup-table entry for zio's block pointer and then takes one
 * of the following paths:
 *   - on a verified collision, or when a ditto write is needed but only an
 *     override bp is available, the zio is converted back to an ordinary
 *     write pipeline (possibly restarted from ZIO_STAGE_OPEN);
 *   - if the data is already on disk or being written by a lead zio, the
 *     existing copy is referenced;
 *   - if an override bp was supplied, it is adopted into the entry;
 *   - otherwise a child write is created and becomes the entry's lead zio.
 *
 * Child zios created here are only issued after the ddt lock is dropped.
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_ddt_write(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        blkptr_t *bp = zio->io_bp;
        uint64_t txg = zio->io_txg;
        zio_prop_t *zp = &zio->io_prop;
        int p = zp->zp_copies;          /* index of the physical slot used */
        int ditto_copies;
        zio_t *cio = NULL;              /* lead child write, if created */
        zio_t *dio = NULL;              /* ditto child write, if created */
        ddt_t *ddt = ddt_select(spa, bp);
        ddt_entry_t *dde;
        ddt_phys_t *ddp;

        ASSERT(BP_GET_DEDUP(bp));
        ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
        ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

        ddt_enter(ddt);
        /* NOTE(review): B_TRUE presumably asks ddt_lookup() to add a missing entry -- confirm. */
        dde = ddt_lookup(ddt, bp, B_TRUE);
        ddp = &dde->dde_phys[p];

        if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
                /*
                 * If we're using a weak checksum, upgrade to a strong checksum
                 * and try again.  If we're already using a strong checksum,
                 * we can't resolve it, so just convert to an ordinary write.
                 * (And automatically e-mail a paper to Nature?)
                 */
                if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
                    ZCHECKSUM_FLAG_DEDUP)) {
                        zp->zp_checksum = spa_dedup_checksum(spa);
                        zio_pop_transforms(zio);
                        zio->io_stage = ZIO_STAGE_OPEN;
                        BP_ZERO(bp);
                } else {
                        zp->zp_dedup = B_FALSE;
                }
                zio->io_pipeline = ZIO_WRITE_PIPELINE;
                ddt_exit(ddt);
                return (ZIO_PIPELINE_CONTINUE);
        }

        ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
        ASSERT(ditto_copies < SPA_DVAS_PER_BP);

        /*
         * Start a ditto write only if more ditto copies are needed than are
         * present and no ditto write is already in flight for this entry.
         */
        if (ditto_copies > ddt_ditto_copies_present(dde) &&
            dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
                zio_prop_t czp = *zp;

                czp.zp_copies = ditto_copies;

                /*
                 * If we arrived here with an override bp, we won't have run
                 * the transform stack, so we won't have the data we need to
                 * generate a child i/o.  So, toss the override bp and restart.
                 * This is safe, because using the override bp is just an
                 * optimization; and it's rare, so the cost doesn't matter.
                 */
                if (zio->io_bp_override) {
                        zio_pop_transforms(zio);
                        zio->io_stage = ZIO_STAGE_OPEN;
                        zio->io_pipeline = ZIO_WRITE_PIPELINE;
                        zio->io_bp_override = NULL;
                        BP_ZERO(bp);
                        ddt_exit(ddt);
                        return (ZIO_PIPELINE_CONTINUE);
                }

                dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
                    zio->io_orig_size, &czp, NULL, NULL,
                    zio_ddt_ditto_write_done, dde, zio->io_priority,
                    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

                /* Hand the child the parent's current (transformed) buffer. */
                zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
                dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
        }

        if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
                /*
                 * The block is already on disk and/or a lead zio is writing
                 * it: fill our bp from the stored copy if there is one, then
                 * either wait on the lead zio or take a reference directly.
                 */
                if (ddp->ddp_phys_birth != 0)
                        ddt_bp_fill(ddp, bp, txg);
                if (dde->dde_lead_zio[p] != NULL)
                        zio_add_child(zio, dde->dde_lead_zio[p]);
                else
                        ddt_phys_addref(ddp);
        } else if (zio->io_bp_override) {
                /* Adopt the caller-supplied block pointer as the stored copy. */
                ASSERT(bp->blk_birth == txg);
                ASSERT(BP_EQUAL(bp, zio->io_bp_override));
                ddt_phys_fill(ddp, bp);
                ddt_phys_addref(ddp);
        } else {
                /* First writer of this data: create the lead child write. */
                cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
                    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
                    zio_ddt_child_write_done, dde, zio->io_priority,
                    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

                zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
                dde->dde_lead_zio[p] = cio;
        }

        ddt_exit(ddt);

        /* Issue any children only after the ddt lock has been released. */
        if (cio)
                zio_nowait(cio);
        if (dio)
                zio_nowait(dio);

        return (ZIO_PIPELINE_CONTINUE);
}
2550
2551 ddt_entry_t *freedde; /* for debugging */
2552
2553 static int
2554 zio_ddt_free(zio_t *zio)
2555 {
2556         spa_t *spa = zio->io_spa;
2557         blkptr_t *bp = zio->io_bp;
2558         ddt_t *ddt = ddt_select(spa, bp);
2559         ddt_entry_t *dde;
2560         ddt_phys_t *ddp;
2561
2562         ASSERT(BP_GET_DEDUP(bp));
2563         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2564
2565         ddt_enter(ddt);
2566         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2567         ddp = ddt_phys_select(dde, bp);
2568         ddt_phys_decref(ddp);
2569         ddt_exit(ddt);
2570
2571         return (ZIO_PIPELINE_CONTINUE);
2572 }
2573
2574 /*
2575  * ==========================================================================
2576  * Allocate and free blocks
2577  * ==========================================================================
2578  */
2579 static int
2580 zio_dva_allocate(zio_t *zio)
2581 {
2582         spa_t *spa = zio->io_spa;
2583         metaslab_class_t *mc = spa_normal_class(spa);
2584         blkptr_t *bp = zio->io_bp;
2585         int error;
2586         int flags = 0;
2587
2588         if (zio->io_gang_leader == NULL) {
2589                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2590                 zio->io_gang_leader = zio;
2591         }
2592
2593         ASSERT(BP_IS_HOLE(bp));
2594         ASSERT0(BP_GET_NDVAS(bp));
2595         ASSERT3U(zio->io_prop.zp_copies, >, 0);
2596         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2597         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2598
2599         /*
2600          * The dump device does not support gang blocks so allocation on
2601          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2602          * the "fast" gang feature.
2603          */
2604         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2605         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2606             METASLAB_GANG_CHILD : 0;
2607         error = metaslab_alloc(spa, mc, zio->io_size, bp,
2608             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2609
2610         if (error) {
2611                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2612                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2613                     error);
2614                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2615                         return (zio_write_gang_block(zio));
2616                 zio->io_error = error;
2617         }
2618
2619         return (ZIO_PIPELINE_CONTINUE);
2620 }
2621
2622 static int
2623 zio_dva_free(zio_t *zio)
2624 {
2625         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2626
2627         return (ZIO_PIPELINE_CONTINUE);
2628 }
2629
2630 static int
2631 zio_dva_claim(zio_t *zio)
2632 {
2633         int error;
2634
2635         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2636         if (error)
2637                 zio->io_error = error;
2638
2639         return (ZIO_PIPELINE_CONTINUE);
2640 }
2641
2642 /*
2643  * Undo an allocation.  This is used by zio_done() when an I/O fails
2644  * and we want to give back the block we just allocated.
2645  * This handles both normal blocks and gang blocks.
2646  */
2647 static void
2648 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2649 {
2650         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2651         ASSERT(zio->io_bp_override == NULL);
2652
2653         if (!BP_IS_HOLE(bp))
2654                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2655
2656         if (gn != NULL) {
2657                 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2658                         zio_dva_unallocate(zio, gn->gn_child[g],
2659                             &gn->gn_gbh->zg_blkptr[g]);
2660                 }
2661         }
2662 }
2663
2664 /*
2665  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2666  */
2667 int
2668 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2669     uint64_t size, boolean_t use_slog)
2670 {
2671         int error = 1;
2672
2673         ASSERT(txg > spa_syncing_txg(spa));
2674
2675         /*
2676          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2677          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2678          * when allocating them.
2679          */
2680         if (use_slog) {
2681                 error = metaslab_alloc(spa, spa_log_class(spa), size,
2682                     new_bp, 1, txg, old_bp,
2683                     METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2684         }
2685
2686         if (error) {
2687                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2688                     new_bp, 1, txg, old_bp,
2689                     METASLAB_HINTBP_AVOID);
2690         }
2691
2692         if (error == 0) {
2693                 BP_SET_LSIZE(new_bp, size);
2694                 BP_SET_PSIZE(new_bp, size);
2695                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2696                 BP_SET_CHECKSUM(new_bp,
2697                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2698                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2699                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2700                 BP_SET_LEVEL(new_bp, 0);
2701                 BP_SET_DEDUP(new_bp, 0);
2702                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2703         }
2704
2705         return (error);
2706 }
2707
2708 /*
2709  * Free an intent log block.
2710  */
2711 void
2712 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2713 {
2714         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2715         ASSERT(!BP_IS_GANG(bp));
2716
2717         zio_free(spa, txg, bp);
2718 }
2719
2720 /*
2721  * ==========================================================================
2722  * Read, write and delete to physical devices
2723  * ==========================================================================
2724  */
2725
2726
2727 /*
2728  * Issue an I/O to the underlying vdev. Typically the issue pipeline
2729  * stops after this stage and will resume upon I/O completion.
2730  * However, there are instances where the vdev layer may need to
2731  * continue the pipeline when an I/O was not issued. Since the I/O
2732  * that was sent to the vdev layer might be different than the one
2733  * currently active in the pipeline (see vdev_queue_io()), we explicitly
2734  * force the underlying vdev layers to call either zio_execute() or
2735  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2736  */
2737 static int
2738 zio_vdev_io_start(zio_t *zio)
2739 {
2740         vdev_t *vd = zio->io_vd;
2741         uint64_t align;
2742         spa_t *spa = zio->io_spa;
2743         int ret;
2744
2745         ASSERT(zio->io_error == 0);
2746         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2747
2748         if (vd == NULL) {
2749                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2750                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2751
2752                 /*
2753                  * The mirror_ops handle multiple DVAs in a single BP.
2754                  */
2755                 vdev_mirror_ops.vdev_op_io_start(zio);
2756                 return (ZIO_PIPELINE_STOP);
2757         }
2758
2759         if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
2760             zio->io_priority == ZIO_PRIORITY_NOW) {
2761                 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
2762                 return (ZIO_PIPELINE_CONTINUE);
2763         }
2764
2765         /*
2766          * We keep track of time-sensitive I/Os so that the scan thread
2767          * can quickly react to certain workloads.  In particular, we care
2768          * about non-scrubbing, top-level reads and writes with the following
2769          * characteristics:
2770          *      - synchronous writes of user data to non-slog devices
2771          *      - any reads of user data
2772          * When these conditions are met, adjust the timestamp of spa_last_io
2773          * which allows the scan thread to adjust its workload accordingly.
2774          */
2775         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2776             vd == vd->vdev_top && !vd->vdev_islog &&
2777             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2778             zio->io_txg != spa_syncing_txg(spa)) {
2779                 uint64_t old = spa->spa_last_io;
2780                 uint64_t new = ddi_get_lbolt64();
2781                 if (old != new)
2782                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
2783         }
2784
2785         align = 1ULL << vd->vdev_top->vdev_ashift;
2786
2787         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2788             P2PHASE(zio->io_size, align) != 0) {
2789                 /* Transform logical writes to be a full physical block size. */
2790                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2791                 char *abuf = NULL;
2792                 if (zio->io_type == ZIO_TYPE_READ ||
2793                     zio->io_type == ZIO_TYPE_WRITE)
2794                         abuf = zio_buf_alloc(asize);
2795                 ASSERT(vd == vd->vdev_top);
2796                 if (zio->io_type == ZIO_TYPE_WRITE) {
2797                         bcopy(zio->io_data, abuf, zio->io_size);
2798                         bzero(abuf + zio->io_size, asize - zio->io_size);
2799                 }
2800                 zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
2801                     zio_subblock);
2802         }
2803
2804         /*
2805          * If this is not a physical io, make sure that it is properly aligned
2806          * before proceeding.
2807          */
2808         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2809                 ASSERT0(P2PHASE(zio->io_offset, align));
2810                 ASSERT0(P2PHASE(zio->io_size, align));
2811         } else {
2812                 /*
2813                  * For the physical io we allow alignment
2814                  * to a logical block size.
2815                  */
2816                 uint64_t log_align =
2817                     1ULL << vd->vdev_top->vdev_logical_ashift;
2818                 ASSERT0(P2PHASE(zio->io_offset, log_align));
2819                 ASSERT0(P2PHASE(zio->io_size, log_align));
2820         }
2821
2822         VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
2823
2824         /*
2825          * If this is a repair I/O, and there's no self-healing involved --
2826          * that is, we're just resilvering what we expect to resilver --
2827          * then don't do the I/O unless zio's txg is actually in vd's DTL.
2828          * This prevents spurious resilvering with nested replication.
2829          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2830          * A is out of date, we'll read from C+D, then use the data to
2831          * resilver A+B -- but we don't actually want to resilver B, just A.
2832          * The top-level mirror has no way to know this, so instead we just
2833          * discard unnecessary repairs as we work our way down the vdev tree.
2834          * The same logic applies to any form of nested replication:
2835          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2836          */
2837         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2838             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2839             zio->io_txg != 0 && /* not a delegated i/o */
2840             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2841                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2842                 zio_vdev_io_bypass(zio);
2843                 return (ZIO_PIPELINE_CONTINUE);
2844         }
2845
2846         if (vd->vdev_ops->vdev_op_leaf) {
2847                 switch (zio->io_type) {
2848                 case ZIO_TYPE_READ:
2849                         if (vdev_cache_read(zio))
2850                                 return (ZIO_PIPELINE_CONTINUE);
2851                         /* FALLTHROUGH */
2852                 case ZIO_TYPE_WRITE:
2853                 case ZIO_TYPE_FREE:
2854                         if ((zio = vdev_queue_io(zio)) == NULL)
2855                                 return (ZIO_PIPELINE_STOP);
2856
2857                         if (!vdev_accessible(vd, zio)) {
2858                                 zio->io_error = SET_ERROR(ENXIO);
2859                                 zio_interrupt(zio);
2860                                 return (ZIO_PIPELINE_STOP);
2861                         }
2862                         break;
2863                 }
2864                 /*
2865                  * Note that we ignore repair writes for TRIM because they can
2866                  * conflict with normal writes. This isn't an issue because, by
2867                  * definition, we only repair blocks that aren't freed.
2868                  */
2869                 if (zio->io_type == ZIO_TYPE_WRITE &&
2870                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2871                     !trim_map_write_start(zio))
2872                         return (ZIO_PIPELINE_STOP);
2873         }
2874
2875         vd->vdev_ops->vdev_op_io_start(zio);
2876         return (ZIO_PIPELINE_STOP);
2877 }
2878
2879 static int
2880 zio_vdev_io_done(zio_t *zio)
2881 {
2882         vdev_t *vd = zio->io_vd;
2883         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2884         boolean_t unexpected_error = B_FALSE;
2885
2886         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2887                 return (ZIO_PIPELINE_STOP);
2888
2889         ASSERT(zio->io_type == ZIO_TYPE_READ ||
2890             zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
2891
2892         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2893             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
2894             zio->io_type == ZIO_TYPE_FREE)) {
2895
2896                 if (zio->io_type == ZIO_TYPE_WRITE &&
2897                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
2898                         trim_map_write_done(zio);
2899
2900                 vdev_queue_io_done(zio);
2901
2902                 if (zio->io_type == ZIO_TYPE_WRITE)
2903                         vdev_cache_write(zio);
2904
2905                 if (zio_injection_enabled && zio->io_error == 0)
2906                         zio->io_error = zio_handle_device_injection(vd,
2907                             zio, EIO);
2908
2909                 if (zio_injection_enabled && zio->io_error == 0)
2910                         zio->io_error = zio_handle_label_injection(zio, EIO);
2911
2912                 if (zio->io_error) {
2913                         if (zio->io_error == ENOTSUP &&
2914                             zio->io_type == ZIO_TYPE_FREE) {
2915                                 /* Not all devices support TRIM. */
2916                         } else if (!vdev_accessible(vd, zio)) {
2917                                 zio->io_error = SET_ERROR(ENXIO);
2918                         } else {
2919                                 unexpected_error = B_TRUE;
2920                         }
2921                 }
2922         }
2923
2924         ops->vdev_op_io_done(zio);
2925
2926         if (unexpected_error)
2927                 VERIFY(vdev_probe(vd, zio) == NULL);
2928
2929         return (ZIO_PIPELINE_CONTINUE);
2930 }
2931
2932 /*
2933  * For non-raidz ZIOs, we can just copy aside the bad data read from the
2934  * disk, and use that to finish the checksum ereport later.
2935  */
2936 static void
2937 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2938     const void *good_buf)
2939 {
2940         /* no processing needed */
2941         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2942 }
2943
2944 /*ARGSUSED*/
2945 void
2946 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2947 {
2948         void *buf = zio_buf_alloc(zio->io_size);
2949
2950         bcopy(zio->io_data, buf, zio->io_size);
2951
2952         zcr->zcr_cbinfo = zio->io_size;
2953         zcr->zcr_cbdata = buf;
2954         zcr->zcr_finish = zio_vsd_default_cksum_finish;
2955         zcr->zcr_free = zio_buf_free;
2956 }
2957
/*
 * Pipeline stage that evaluates the outcome of a vdev I/O.
 *
 * After all vdev children are done it releases the SCL_ZIO config lock
 * (taken in zio_vdev_io_start() under the same vd == NULL condition),
 * frees any vdev-specific data, applies fault injection, accounts TRIM
 * statistics, and decides whether a failed I/O is retried.
 *
 * Returns ZIO_PIPELINE_STOP while children are outstanding or when the I/O
 * has been requeued for a retry; otherwise ZIO_PIPELINE_CONTINUE.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;

        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
                spa_config_exit(zio->io_spa, SCL_ZIO, zio);

        /* Release vdev-specific data through its registered free routine. */
        if (zio->io_vsd != NULL) {
                zio->io_vsd_ops->vsd_free(zio);
                zio->io_vsd = NULL;
        }

        if (zio_injection_enabled && zio->io_error == 0)
                zio->io_error = zio_handle_fault_injection(zio, EIO);

        /*
         * Account the result of frees that were not ZIO_PRIORITY_NOW in the
         * TRIM statistics.
         */
        if (zio->io_type == ZIO_TYPE_FREE &&
            zio->io_priority != ZIO_PRIORITY_NOW) {
                switch (zio->io_error) {
                case 0:
                        ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
                        ZIO_TRIM_STAT_BUMP(success);
                        break;
                case EOPNOTSUPP:
                        ZIO_TRIM_STAT_BUMP(unsupported);
                        break;
                default:
                        ZIO_TRIM_STAT_BUMP(failed);
                        break;
                }
        }

        /*
         * If the I/O failed, determine whether we should attempt to retry it.
         *
         * On retry, we cut in line in the issue queue, since we don't want
         * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
         */
        if (zio->io_error && vd == NULL &&
            !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
                ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
                ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
                zio->io_error = 0;
                zio->io_flags |= ZIO_FLAG_IO_RETRY |
                    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
                /*
                 * Rewind to the stage just before VDEV_IO_START so that
                 * stage is the next one executed.
                 */
                zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
                zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
                    zio_requeue_io_start_cut_in_line);
                return (ZIO_PIPELINE_STOP);
        }

        /*
         * If we got an error on a leaf device, convert it to ENXIO
         * if the device is not accessible at all.
         */
        if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
            !vdev_accessible(vd, zio))
                zio->io_error = SET_ERROR(ENXIO);

        /*
         * If we can't write to an interior vdev (mirror or RAID-Z),
         * set vdev_cant_write so that we stop trying to allocate from it.
         */
        if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
            vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
                vd->vdev_cant_write = B_TRUE;
        }

        /* On error, replace the remaining stages with the interlock pipeline. */
        if (zio->io_error)
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        /* Invoke the physdone callback on the logical zio for leaf I/Os. */
        if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
            zio->io_physdone != NULL) {
                ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
                ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
                zio->io_physdone(zio->io_logical);
        }

        return (ZIO_PIPELINE_CONTINUE);
}
3041
3042 void
3043 zio_vdev_io_reissue(zio_t *zio)
3044 {
3045         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
3046         ASSERT(zio->io_error == 0);
3047
3048         zio->io_stage >>= 1;
3049 }
3050
3051 void
3052 zio_vdev_io_redone(zio_t *zio)
3053 {
3054         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
3055
3056         zio->io_stage >>= 1;
3057 }
3058
3059 void
3060 zio_vdev_io_bypass(zio_t *zio)
3061 {
3062         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
3063         ASSERT(zio->io_error == 0);
3064
3065         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
3066         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
3067 }
3068
3069 /*
3070  * ==========================================================================
3071  * Generate and verify checksums
3072  * ==========================================================================
3073  */
3074 static int
3075 zio_checksum_generate(zio_t *zio)
3076 {
3077         blkptr_t *bp = zio->io_bp;
3078         enum zio_checksum checksum;
3079
3080         if (bp == NULL) {
3081                 /*
3082                  * This is zio_write_phys().
3083                  * We're either generating a label checksum, or none at all.
3084                  */
3085                 checksum = zio->io_prop.zp_checksum;
3086
3087                 if (checksum == ZIO_CHECKSUM_OFF)
3088                         return (ZIO_PIPELINE_CONTINUE);
3089
3090                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
3091         } else {
3092                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
3093                         ASSERT(!IO_IS_ALLOCATING(zio));
3094                         checksum = ZIO_CHECKSUM_GANG_HEADER;
3095                 } else {
3096                         checksum = BP_GET_CHECKSUM(bp);
3097                 }
3098         }
3099
3100         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
3101
3102         return (ZIO_PIPELINE_CONTINUE);
3103 }
3104
3105 static int
3106 zio_checksum_verify(zio_t *zio)
3107 {
3108         zio_bad_cksum_t info;
3109         blkptr_t *bp = zio->io_bp;
3110         int error;
3111
3112         ASSERT(zio->io_vd != NULL);
3113
3114         if (bp == NULL) {
3115                 /*
3116                  * This is zio_read_phys().
3117                  * We're either verifying a label checksum, or nothing at all.
3118                  */
3119                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
3120                         return (ZIO_PIPELINE_CONTINUE);
3121
3122                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
3123         }
3124
3125         if ((error = zio_checksum_error(zio, &info)) != 0) {
3126                 zio->io_error = error;
3127                 if (error == ECKSUM &&
3128                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3129                         zfs_ereport_start_checksum(zio->io_spa,
3130                             zio->io_vd, zio, zio->io_offset,
3131                             zio->io_size, NULL, &info);
3132                 }
3133         }
3134
3135         return (ZIO_PIPELINE_CONTINUE);
3136 }
3137
3138 /*
3139  * Called by RAID-Z to ensure we don't compute the checksum twice.
3140  */
3141 void
3142 zio_checksum_verified(zio_t *zio)
3143 {
3144         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
3145 }
3146
/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 *
 * zio_worst_error() returns whichever of e1 and e2 ranks higher; when the
 * two rank equally, e2 is returned.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
        static const int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
        const int nranks =
            (int)(sizeof (zio_error_rank) / sizeof (zio_error_rank[0]));
        int rank1 = nranks;     /* "other" unless found in the table */
        int rank2 = nranks;

        /*
         * Scan from the back so that the lowest matching index is the one
         * left in place, exactly as a forward search that stops at the
         * first match would give.
         */
        for (int i = nranks - 1; i >= 0; i--) {
                if (zio_error_rank[i] == e1)
                        rank1 = i;
                if (zio_error_rank[i] == e2)
                        rank2 = i;
        }

        return (rank1 > rank2 ? e1 : e2);
}
3172
3173 /*
3174  * ==========================================================================
3175  * I/O completion
3176  * ==========================================================================
3177  */
3178 static int
3179 zio_ready(zio_t *zio)
3180 {
3181         blkptr_t *bp = zio->io_bp;
3182         zio_t *pio, *pio_next;
3183
3184         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3185             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3186                 return (ZIO_PIPELINE_STOP);
3187
3188         if (zio->io_ready) {
3189                 ASSERT(IO_IS_ALLOCATING(zio));
3190                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3191                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
3192                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3193
3194                 zio->io_ready(zio);
3195         }
3196
3197         if (bp != NULL && bp != &zio->io_bp_copy)
3198                 zio->io_bp_copy = *bp;
3199
3200         if (zio->io_error)
3201                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3202
3203         mutex_enter(&zio->io_lock);
3204         zio->io_state[ZIO_WAIT_READY] = 1;
3205         pio = zio_walk_parents(zio);
3206         mutex_exit(&zio->io_lock);
3207
3208         /*
3209          * As we notify zio's parents, new parents could be added.
3210          * New parents go to the head of zio's io_parent_list, however,
3211          * so we will (correctly) not notify them.  The remainder of zio's
3212          * io_parent_list, from 'pio_next' onward, cannot change because
3213          * all parents must wait for us to be done before they can be done.
3214          */
3215         for (; pio != NULL; pio = pio_next) {
3216                 pio_next = zio_walk_parents(zio);
3217                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3218         }
3219
3220         if (zio->io_flags & ZIO_FLAG_NODATA) {
3221                 if (BP_IS_GANG(bp)) {
3222                         zio->io_flags &= ~ZIO_FLAG_NODATA;
3223                 } else {
3224                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3225                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3226                 }
3227         }
3228
3229         if (zio_injection_enabled &&
3230             zio->io_spa->spa_syncing_txg == zio->io_txg)
3231                 zio_handle_ignored_writes(zio);
3232
3233         return (ZIO_PIPELINE_CONTINUE);
3234 }
3235
/*
 * Final pipeline stage for a zio.
 *
 * In order, this function:
 *   1. waits for every class of child (vdev, gang, ddt, logical) to be done;
 *   2. inherits vdev/gang/ddt child errors;
 *   3. finishes pending checksum reports while the transformed data is
 *      still available (only when there is no error so far);
 *   4. pops transforms, updates vdev stats and posts ereports on error;
 *   5. decides whether a failed logical zio must be reexecuted or suspended;
 *   6. inherits logical child errors and undoes allocations on failure;
 *   7. either hands the zio off for reexecution/suspension, or completes
 *      it: runs io_done, detaches from and notifies all parents, and then
 *      wakes the waiter or destroys the zio.
 *
 * Every return path yields ZIO_PIPELINE_STOP.
 */
static int
zio_done(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        zio_t *lio = zio->io_logical;
        blkptr_t *bp = zio->io_bp;
        vdev_t *vd = zio->io_vd;
        /* Used below for checksum-report padding and vdev_stat_update(). */
        uint64_t psize = zio->io_size;
        zio_t *pio, *pio_next;

        /*
         * If our children haven't all completed,
         * wait for them and then repeat this pipeline stage.
         */
        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        /* Past this point every child counter is expected to be zero. */
        for (int c = 0; c < ZIO_CHILD_TYPES; c++)
                for (int w = 0; w < ZIO_WAIT_TYPES; w++)
                        ASSERT(zio->io_children[c][w] == 0);

        /*
         * Sanity checks on a non-embedded block pointer: the padding words
         * are zero, and the bp either still matches the copy kept in
         * io_bp_copy or is the unique parent's bp.
         */
        if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
                ASSERT(bp->blk_pad[0] == 0);
                ASSERT(bp->blk_pad[1] == 0);
                ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
                    (bp == zio_unique_parent(zio)->io_bp));
                if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
                    zio->io_bp_override == NULL &&
                    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
                        ASSERT(!BP_SHOULD_BYTESWAP(bp));
                        ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
                        ASSERT(BP_COUNT_GANG(bp) == 0 ||
                            (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
                }
                /* A nop-write must leave the bp equal to io_bp_orig. */
                if (zio->io_flags & ZIO_FLAG_NOPWRITE)
                        VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
        }

        /*
         * If there were child vdev/gang/ddt errors, they apply to us now.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
        zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
        zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

        /*
         * If the I/O on the transformed data was successful, generate any
         * checksum reports now while we still have the transformed data.
         */
        if (zio->io_error == 0) {
                while (zio->io_cksum_report != NULL) {
                        zio_cksum_report_t *zcr = zio->io_cksum_report;
                        uint64_t align = zcr->zcr_align;
                        uint64_t asize = P2ROUNDUP(psize, align);
                        char *abuf = zio->io_data;

                        /*
                         * When psize is not already a multiple of the
                         * report's alignment, hand the report a temporary
                         * copy of the data padded with zeroes to asize.
                         */
                        if (asize != psize) {
                                abuf = zio_buf_alloc(asize);
                                bcopy(zio->io_data, abuf, psize);
                                bzero(abuf + psize, asize - psize);
                        }

                        /* Unlink the report before finishing and freeing it. */
                        zio->io_cksum_report = zcr->zcr_next;
                        zcr->zcr_next = NULL;
                        zcr->zcr_finish(zcr, abuf);
                        zfs_ereport_free_checksum(zcr);

                        /* Release the padded copy, if one was made. */
                        if (asize != psize)
                                zio_buf_free(abuf, asize);
                }
        }

        zio_pop_transforms(zio);        /* note: may set zio->io_error */

        vdev_stat_update(zio, psize);

        if (zio->io_error) {
                /*
                 * If this I/O is attached to a particular vdev,
                 * generate an error message describing the I/O failure
                 * at the block level.  We ignore these errors if the
                 * device is currently unavailable.
                 */
                if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
                        zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

                /*
                 * Only the logical zio itself (zio == lio) is logged here,
                 * and only if the error is EIO or the zio is neither
                 * speculative nor flagged DONT_PROPAGATE.
                 */
                if ((zio->io_error == EIO || !(zio->io_flags &
                    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
                    zio == lio) {
                        /*
                         * For logical I/O requests, tell the SPA to log the
                         * error and generate a logical data ereport.
                         */
                        spa_log_error(spa, zio);
                        zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
                            0, 0);
                }
        }

        if (zio->io_error && zio == lio) {
                /*
                 * Determine whether zio should be reexecuted.  This will
                 * propagate all the way to the root via zio_notify_parent().
                 */
                ASSERT(vd == NULL && bp != NULL);
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

                /*
                 * An allocating zio that is not allowed to fail is retried
                 * immediately, except for ENOSPC, which suspends instead.
                 */
                if (IO_IS_ALLOCATING(zio) &&
                    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
                        if (zio->io_error != ENOSPC)
                                zio->io_reexecute |= ZIO_REEXECUTE_NOW;
                        else
                                zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
                }

                /*
                 * A read or free outside the scan thread that failed with
                 * ENXIO suspends, provided the pool is not loading and its
                 * failure mode is not "continue".
                 */
                if ((zio->io_type == ZIO_TYPE_READ ||
                    zio->io_type == ZIO_TYPE_FREE) &&
                    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
                    zio->io_error == ENXIO &&
                    spa_load_state(spa) == SPA_LOAD_NONE &&
                    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

                /*
                 * Catch-all: a zio that may not fail and has not been
                 * marked for reexecution by the checks above suspends.
                 */
                if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

                /*
                 * Here is a possibly good place to attempt to do
                 * either combinatorial reconstruction or error correction
                 * based on checksums.  It also might be a good place
                 * to send out preliminary ereports before we suspend
                 * processing.
                 */
        }

        /*
         * If there were logical child errors, they apply to us now.
         * We defer this until now to avoid conflating logical child
         * errors with errors that happened to the zio itself when
         * updating vdev stats and reporting FMA events above.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

        /*
         * A failed (or to-be-reexecuted) allocating zio that leads its own
         * gang tree gives back what it allocated, unless it was a rewrite
         * or a nop-write.
         */
        if ((zio->io_error || zio->io_reexecute) &&
            IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
            !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
                zio_dva_unallocate(zio, zio->io_gang_tree, bp);

        zio_gang_tree_free(&zio->io_gang_tree);

        /*
         * Godfather I/Os should never suspend.
         */
        if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
            (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
                zio->io_reexecute = 0;

        if (zio->io_reexecute) {
                /*
                 * This is a logical I/O that wants to reexecute.
                 *
                 * Reexecute is top-down.  When an i/o fails, if it's not
                 * the root, it simply notifies its parent and sticks around.
                 * The parent, seeing that it still has children in zio_done(),
                 * does the same.  This percolates all the way up to the root.
                 * The root i/o will reexecute or suspend the entire tree.
                 *
                 * This approach ensures that zio_reexecute() honors
                 * all the original i/o dependency relationships, e.g.
                 * parents not executing until children are ready.
                 */
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

                zio->io_gang_leader = NULL;

                mutex_enter(&zio->io_lock);
                zio->io_state[ZIO_WAIT_DONE] = 1;
                mutex_exit(&zio->io_lock);

                /*
                 * "The Godfather" I/O monitors its children but is
                 * not a true parent to them. It will track them through
                 * the pipeline but severs its ties whenever they get into
                 * trouble (e.g. suspended). This allows "The Godfather"
                 * I/O to return status without blocking.
                 */
                for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
                        /*
                         * NOTE(review): io_walk_link is read before the
                         * walk advances; presumably it is the link for
                         * 'pio' at this point -- confirm against
                         * zio_walk_parents().
                         */
                        zio_link_t *zl = zio->io_walk_link;
                        pio_next = zio_walk_parents(zio);

                        if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
                            (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
                                zio_remove_child(pio, zio, zl);
                                zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
                        }
                }

                if ((pio = zio_unique_parent(zio)) != NULL) {
                        /*
                         * We're not a root i/o, so there's nothing to do
                         * but notify our parent.  Don't propagate errors
                         * upward since we haven't permanently failed yet.
                         */
                        ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
                        zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
                        zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
                } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
                        /*
                         * We'd fail again if we reexecuted now, so suspend
                         * until conditions improve (e.g. device comes online).
                         */
                        zio_suspend(spa, zio);
                } else {
                        /*
                         * Reexecution is potentially a huge amount of work.
                         * Hand it off to the otherwise-unused claim taskq.
                         */
                        /* The embedded taskq entry must not be in use. */
#if defined(illumos) || !defined(_KERNEL)
                        ASSERT(zio->io_tqent.tqent_next == NULL);
#else
                        ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
                        spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
                            ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
                            0, &zio->io_tqent);
                }
                /* The zio stays allocated; it is not completed here. */
                return (ZIO_PIPELINE_STOP);
        }

        /*
         * Normal completion.  No children and no pending reexecute remain,
         * and an error is only acceptable when the zio is allowed to fail.
         */
        ASSERT(zio->io_child_count == 0);
        ASSERT(zio->io_reexecute == 0);
        ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

        /*
         * Report any checksum errors, since the I/O is complete.
         * Reports that were not finished earlier are completed here with
         * a NULL data buffer.
         */
        while (zio->io_cksum_report != NULL) {
                zio_cksum_report_t *zcr = zio->io_cksum_report;
                zio->io_cksum_report = zcr->zcr_next;
                zcr->zcr_next = NULL;
                zcr->zcr_finish(zcr, NULL);
                zfs_ereport_free_checksum(zcr);
        }

        /*
         * It is the responsibility of the done callback to ensure that this
         * particular zio is no longer discoverable for adoption, and as
         * such, cannot acquire any new parents.
         */
        if (zio->io_done)
                zio->io_done(zio);

        mutex_enter(&zio->io_lock);
        zio->io_state[ZIO_WAIT_DONE] = 1;
        mutex_exit(&zio->io_lock);

        /* Detach from every parent and tell each one that we are done. */
        for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
                zio_link_t *zl = zio->io_walk_link;
                pio_next = zio_walk_parents(zio);
                zio_remove_child(pio, zio, zl);
                zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
        }

        /*
         * If io_waiter is set, clear io_executor and broadcast on io_cv
         * under io_lock, leaving the zio in place.  Otherwise nobody is
         * waiting and the zio is destroyed right here.
         */
        if (zio->io_waiter != NULL) {
                mutex_enter(&zio->io_lock);
                zio->io_executor = NULL;
                cv_broadcast(&zio->io_cv);
                mutex_exit(&zio->io_lock);
        } else {
                zio_destroy(zio);
        }

        return (ZIO_PIPELINE_STOP);
}
3513
/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */

/*
 * Table of pipeline stage functions, from the bp-init stages through
 * zio_done.  Slot 0 holds NULL rather than a stage function.
 *
 * NOTE(review): the entries are presumably looked up by stage number, so
 * their order has to stay in step with the ZIO_STAGE_* definitions --
 * confirm against the stage enum before adding, removing or reordering
 * entries.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
        NULL,
        zio_read_bp_init,
        zio_free_bp_init,
        zio_issue_async,
        zio_write_bp_init,
        zio_checksum_generate,
        zio_nop_write,
        zio_ddt_read_start,
        zio_ddt_read_done,
        zio_ddt_write,
        zio_ddt_free,
        zio_gang_assemble,
        zio_gang_issue,
        zio_dva_allocate,
        zio_dva_free,
        zio_dva_claim,
        zio_ready,
        zio_vdev_io_start,
        zio_vdev_io_done,
        zio_vdev_io_assess,
        zio_checksum_verify,
        zio_done
};
3543
3544
3545
3546
3547 /*
3548  * Compare two zbookmark_phys_t's to see which we would reach first in a
3549  * pre-order traversal of the object tree.
3550  *
3551  * This is simple in every case aside from the meta-dnode object. For all other
3552  * objects, we traverse them in order (object 1 before object 2, and so on).
3553  * However, all of these objects are traversed while traversing object 0, since
3554  * the data it points to is the list of objects.  Thus, we need to convert to a
3555  * canonical representation so we can compare meta-dnode bookmarks to
3556  * non-meta-dnode bookmarks.
3557  *
3558  * We do this by calculating "equivalents" for each field of the zbookmark.
3559  * zbookmarks outside of the meta-dnode use their own object and level, and
3560  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
3561  * blocks this bookmark refers to) by multiplying their blkid by their span
3562  * (the number of L0 blocks contained within one block at their level).
3563  * zbookmarks inside the meta-dnode calculate their object equivalent
3564  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
3565  * level + 1<<31 (any value larger than a level could ever be) for their level.
3566  * This causes them to always compare before a bookmark in their object
3567  * equivalent, compare appropriately to bookmarks in other objects, and to
3568  * compare appropriately to other bookmarks in the meta-dnode.
3569  */
3570 int
3571 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
3572     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
3573 {
3574         /*
3575          * These variables represent the "equivalent" values for the zbookmark,
3576          * after converting zbookmarks inside the meta dnode to their
3577          * normal-object equivalents.
3578          */
3579         uint64_t zb1obj, zb2obj;
3580         uint64_t zb1L0, zb2L0;
3581         uint64_t zb1level, zb2level;
3582
3583         if (zb1->zb_object == zb2->zb_object &&
3584             zb1->zb_level == zb2->zb_level &&
3585             zb1->zb_blkid == zb2->zb_blkid)
3586                 return (0);
3587
3588         /*
3589          * BP_SPANB calculates the span in blocks.
3590          */
3591         zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
3592         zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
3593
3594         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3595                 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
3596                 zb1L0 = 0;
3597                 zb1level = zb1->zb_level + COMPARE_META_LEVEL;
3598         } else {
3599                 zb1obj = zb1->zb_object;
3600                 zb1level = zb1->zb_level;
3601         }
3602
3603         if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
3604                 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
3605                 zb2L0 = 0;
3606                 zb2level = zb2->zb_level + COMPARE_META_LEVEL;
3607         } else {
3608                 zb2obj = zb2->zb_object;
3609                 zb2level = zb2->zb_level;
3610         }
3611
3612         /* Now that we have a canonical representation, do the comparison. */
3613         if (zb1obj != zb2obj)
3614                 return (zb1obj < zb2obj ? -1 : 1);
3615         else if (zb1L0 != zb2L0)
3616                 return (zb1L0 < zb2L0 ? -1 : 1);
3617         else if (zb1level != zb2level)
3618                 return (zb1level > zb2level ? -1 : 1);
3619         /*
3620          * This can (theoretically) happen if the bookmarks have the same object
3621          * and level, but different blkids, if the block sizes are not the same.
3622          * There is presently no way to change the indirect block sizes
3623          */
3624         return (0);
3625 }
3626
3627 /*
3628  *  This function checks the following: given that last_block is the place that
3629  *  our traversal stopped last time, does that guarantee that we've visited
3630  *  every node under subtree_root?  Therefore, we can't just use the raw output
3631  *  of zbookmark_compare.  We have to pass in a modified version of
3632  *  subtree_root; by incrementing the block id, and then checking whether
3633  *  last_block is before or equal to that, we can tell whether or not having
3634  *  visited last_block implies that all of subtree_root's children have been
3635  *  visited.
3636  */
3637 boolean_t
3638 zbookmark_subtree_completed(const dnode_phys_t *dnp,
3639     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
3640 {
3641         zbookmark_phys_t mod_zb = *subtree_root;
3642         mod_zb.zb_blkid++;
3643         ASSERT(last_block->zb_level == 0);
3644
3645         /* The objset_phys_t isn't before anything. */
3646         if (dnp == NULL)
3647                 return (B_FALSE);
3648
3649         /*
3650          * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
3651          * data block size in sectors, because that variable is only used if
3652          * the bookmark refers to a block in the meta-dnode.  Since we don't
3653          * know without examining it what object it refers to, and there's no
3654          * harm in passing in this value in other cases, we always pass it in.
3655          *
3656          * We pass in 0 for the indirect block size shift because zb2 must be
3657          * level 0.  The indirect block size is only used to calculate the span
3658          * of the bookmark, but since the bookmark must be level 0, the span is
3659          * always 1, so the math works out.
3660          *
3661          * If you make changes to how the zbookmark_compare code works, be sure
3662          * to make sure that this code still works afterwards.
3663          */
3664         return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
3665             1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
3666             last_block) <= 0);
3667 }