sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2017 by Delphix. All rights reserved.
  24  */
  25
  26 /*
  27  * Storage Pool Checkpoint
  28  *
  29  * A storage pool checkpoint can be thought of as a pool-wide snapshot or
  30  * a stable version of extreme rewind that guarantees no blocks from the
  31  * checkpointed state will have been overwritten. It remembers the entire
  32  * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
  33  * point that it was taken and the user can rewind back to that point even if
  34  * they applied destructive operations on their datasets or even enabled new
  35  * zpool on-disk features. If a pool has a checkpoint that is no longer
  36  * needed, the user can discard it.
  37  *
  38  * == On disk data structures used ==
  39  *
  40  * - The pool has a new feature flag and a new entry in the MOS. The feature
  41  *   flag is set to active when we create the checkpoint and remains active
  42  *   until the checkpoint is fully discarded. The entry in the MOS config
  43  *   (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
  44  *   references the state of the pool when we take the checkpoint. The entry
  45  *   remains populated until we start discarding the checkpoint or we rewind
  46  *   back to it.
  47  *
  48  * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
  49  *   which persists until the checkpoint is fully discarded. The space map
  50  *   contains entries that have been freed in the current state of the pool
  51  *   but we want to keep around in case we decide to rewind to the checkpoint.
  52  *   [see vdev_checkpoint_sm]
  53  *
  54  * - Each metaslab's ms_sm space map behaves the same as without the
  55  *   checkpoint, with the only exception being the scenario when we free
  56  *   blocks that belong to the checkpoint. In this case, these blocks remain
  57  *   ALLOCATED in the metaslab's space map and they are added as FREE in the
  58  *   vdev's checkpoint space map.
  59  *
  60  * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
  61  *   the uberblock was checkpointed. For normal uberblocks this field is 0.
  62  *
  63  * == Overview of operations ==
  64  *
  65  * - To create a checkpoint, we first wait for the current TXG to be synced,
  66  *   so we can use the most recently synced uberblock (spa_ubsync) as the
  67  *   checkpointed uberblock. Then we use an early synctask to place that
  68  *   uberblock in MOS config, increment the feature flag for the checkpoint
  69  *   (marking it active), and set spa_checkpoint_txg (see its use below)
  70  *   to the TXG of the checkpointed uberblock. We use an early synctask for
  71  *   the aforementioned operations to ensure that no blocks were dirtied
  72  *   between the current TXG and the TXG of the checkpointed uberblock
  73  *   (i.e. the previous txg).
  74  *
  75  * - When a checkpoint exists, we need to ensure that the blocks that
  76  *   belong to the checkpoint are freed but never reused. This means that
  77  *   these blocks should never end up in the ms_allocatable or the ms_freeing
  78  *   trees of a metaslab. Therefore, whenever there is a checkpoint the new
  79  *   ms_checkpointing tree is used in addition to the aforementioned ones.
  80  *
  81  *   Whenever a block is freed and we find out that it is referenced by the
  82  *   checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
  83  *   we place it in the ms_checkpointing tree instead of the ms_freeing tree.
  84  *   This way, we divide the blocks that are being freed into checkpointed
  85  *   and not-checkpointed blocks.
  86  *
  87  *   In order to persist these frees, we write the extents from the
  88  *   ms_freeing tree to the ms_sm as usual, and the extents from the
  89  *   ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
  90  *   checkpointed extents will remain allocated in the metaslab's ms_sm space
  91  *   map, and therefore won't be reused [see metaslab_sync()]. In addition,
  92  *   when we discard the checkpoint, we can find the entries that have
  93  *   actually been freed in vdev_checkpoint_sm.
  94  *   [see spa_checkpoint_discard_thread_sync()]
  95  *
  96  * - To discard the checkpoint we use an early synctask to delete the
  97  *   checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
  98  *   and wakeup the discarding zthr thread (an open-context async thread).
  99  *   We use an early synctask to ensure that the operation happens before any
 100  *   new data end up in the checkpoint's data structures.
 101  *
 102  *   Once the synctask is done and the discarding zthr is awake, we discard
 103  *   the checkpointed data over multiple TXGs by having the zthr prefetching
 104  *   entries from vdev_checkpoint_sm and then starting a synctask that places
 105  *   them as free blocks into their respective ms_allocatable and ms_sm
 106  *   structures.
 107  *   [see spa_checkpoint_discard_thread()]
 108  *
 109  *   When there are no entries left in the vdev_checkpoint_sm of all
 110  *   top-level vdevs, a final synctask runs that decrements the feature flag.
 111  *
 112  * - To rewind to the checkpoint, we first use the current uberblock and
 113  *   open the MOS so we can access the checkpointed uberblock from the MOS
 114  *   config. After we retrieve the checkpointed uberblock, we use it as the
 115  *   current uberblock for the pool by writing it to disk with an updated
 116  *   TXG, opening its version of the MOS, and moving on as usual from there.
 117  *   [see spa_ld_checkpoint_rewind()]
 118  *
 119  *   An important note on rewinding to the checkpoint has to do with how we
 120  *   handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
 121  *   blocks that have not been claimed by the time we took the checkpoint
 122  *   as they should no longer be valid.
 123  *   [see comment in zil_claim()]
 124  *
 125  * == Miscellaneous information ==
 126  *
 127  * - In the hypothetical event that we take a checkpoint, remove a vdev,
 128  *   and attempt to rewind, the rewind would fail as the checkpointed
 129  *   uberblock would reference data in the removed device. For this reason
 130  *   and others of similar nature, we disallow the following operations that
 131  *   can change the config:
 132  *      vdev removal and attach/detach, mirror splitting, and pool reguid.
 133  *
 134  * - As most of the checkpoint logic is implemented in the SPA and doesn't
 135  *   distinguish datasets when it comes to space accounting, having a
 136  *   checkpoint can potentially break the boundaries set by dataset
 137  *   reservations.
 138  */
 139
 140 #include <sys/dmu_tx.h>
 141 #include <sys/dsl_dir.h>
 142 #include <sys/dsl_synctask.h>
 143 #include <sys/metaslab_impl.h>
 144 #include <sys/spa.h>
 145 #include <sys/spa_impl.h>
 146 #include <sys/spa_checkpoint.h>
 147 #include <sys/vdev_impl.h>
 148 #include <sys/zap.h>
 149 #include <sys/zfeature.h>
 150
 151 /*
 152  * The following parameter limits the amount of memory to be used for the
 153  * prefetching of the checkpoint space map done on each vdev while
 154  * discarding the checkpoint.
 155  *
 156  * The reason it exists is because top-level vdevs with long checkpoint
 157  * space maps can potentially take up a lot of memory depending on the
 158  * amount of checkpointed data that has been freed within them while
 159  * the pool had a checkpoint.
 160  */
 161 uint64_t        zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
 162
 163 int
 164 spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
 165 {
 166         if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 167                 return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
 168
 169         bzero(pcs, sizeof (pool_checkpoint_stat_t));
 170
 171         int error = zap_contains(spa_meta_objset(spa),
 172             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
 173         ASSERT(error == 0 || error == ENOENT);
 174
 175         if (error == ENOENT)
 176                 pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
 177         else
 178                 pcs->pcs_state = CS_CHECKPOINT_EXISTS;
 179
 180         pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
 181         pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
 182
 183         return (0);
 184 }
 185
 186 static void
 187 spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
 188 {
 189         spa_t *spa = arg;
 190
 191         spa->spa_checkpoint_info.sci_timestamp = 0;
 192
 193         spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
 194
 195         spa_history_log_internal(spa, "spa discard checkpoint", tx,
 196             "finished discarding checkpointed state from the pool");
 197 }
 198
 199 typedef struct spa_checkpoint_discard_sync_callback_arg {
 200         vdev_t *sdc_vd;
 201         uint64_t sdc_txg;
 202         uint64_t sdc_entry_limit;
 203 } spa_checkpoint_discard_sync_callback_arg_t;
 204
 205 static int
 206 spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
 207     uint64_t size, void *arg)
 208 {
 209         spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
 210         vdev_t *vd = sdc->sdc_vd;
 211         metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 212         uint64_t end = offset + size;
 213
 214         if (sdc->sdc_entry_limit == 0)
 215                 return (EINTR);
 216
 217         /*
 218          * Since the space map is not condensed, we know that
 219          * none of its entries is crossing the boundaries of
 220          * its respective metaslab.
 221          *
 222          * That said, there is no fundamental requirement that
 223          * the checkpoint's space map entries should not cross
 224          * metaslab boundaries. So if needed we could add code
 225          * that handles metaslab-crossing segments in the future.
 226          */
 227         VERIFY3U(type, ==, SM_FREE);
 228         VERIFY3U(offset, >=, ms->ms_start);
 229         VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 230
 231         /*
 232          * At this point we should not be processing any
 233          * other frees concurrently, so the lock is technically
 234          * unnecessary. We use the lock anyway though to
 235          * potentially save ourselves from future headaches.
 236          */
 237         mutex_enter(&ms->ms_lock);
 238         if (range_tree_is_empty(ms->ms_freeing))
 239                 vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
 240         range_tree_add(ms->ms_freeing, offset, size);
 241         mutex_exit(&ms->ms_lock);
 242
 243         ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
 244         ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
 245
 246         vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
 247         vd->vdev_stat.vs_checkpoint_space -= size;
 248         sdc->sdc_entry_limit--;
 249
 250         return (0);
 251 }
 252
 253 static void
 254 spa_checkpoint_accounting_verify(spa_t *spa)
 255 {
 256         vdev_t *rvd = spa->spa_root_vdev;
 257         uint64_t ckpoint_sm_space_sum = 0;
 258         uint64_t vs_ckpoint_space_sum = 0;
 259
 260         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 261                 vdev_t *vd = rvd->vdev_child[c];
 262
 263                 if (vd->vdev_checkpoint_sm != NULL) {
 264                         ckpoint_sm_space_sum +=
 265                             -vd->vdev_checkpoint_sm->sm_alloc;
 266                         vs_ckpoint_space_sum +=
 267                             vd->vdev_stat.vs_checkpoint_space;
 268                         ASSERT3U(ckpoint_sm_space_sum, ==,
 269                             vs_ckpoint_space_sum);
 270                 } else {
 271                         ASSERT0(vd->vdev_stat.vs_checkpoint_space);
 272                 }
 273         }
 274         ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
 275 }
 276
 277 static void
 278 spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 279 {
 280         vdev_t *vd = arg;
 281         int error;
 282
 283         /*
 284          * The space map callback is applied only to non-debug entries.
 285          * Because the number of debug entries is less or equal to the
 286          * number of non-debug entries, we want to ensure that we only
 287          * read what we prefetched from open-context.
 288          *
 289          * Thus, we set the maximum entries that the space map callback
 290          * will be applied to be half the entries that could fit in the
 291          * imposed memory limit.
 292          */
 293         uint64_t max_entry_limit =
 294             (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
 295
 296         uint64_t entries_in_sm =
 297             space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 298
 299         /*
 300          * Iterate from the end of the space map towards the beginning,
 301          * placing its entries on ms_freeing and removing them from the
 302          * space map. The iteration stops if one of the following
 303          * conditions is true:
 304          *
 305          * 1] We reached the beginning of the space map. At this point
 306          *    the space map should be completely empty and
 307          *    space_map_incremental_destroy should have returned 0.
 308          *    The next step would be to free and close the space map
 309          *    and remove its entry from its vdev's top zap. This allows
 310          *    spa_checkpoint_discard_thread() to move on to the next vdev.
 311          *
 312          * 2] We reached the memory limit (amount of memory used to hold
 313          *    space map entries in memory) and space_map_incremental_destroy
 314          *    returned EINTR. This means that there are entries remaining
 315          *    in the space map that will be cleared in a future invocation
 316          *    of this function by spa_checkpoint_discard_thread().
 317          */
 318         spa_checkpoint_discard_sync_callback_arg_t sdc;
 319         sdc.sdc_vd = vd;
 320         sdc.sdc_txg = tx->tx_txg;
 321         sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
 322
 323         uint64_t entries_before = entries_in_sm;
 324
 325         error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
 326             spa_checkpoint_discard_sync_callback, &sdc, tx);
 327
 328         uint64_t entries_after =
 329             space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 330
 331 #ifdef DEBUG
 332         spa_checkpoint_accounting_verify(vd->vdev_spa);
 333 #endif
 334
 335         zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
 336             "deleted %llu entries - %llu entries are left",
 337             tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
 338             entries_after);
 339
 340         if (error != EINTR) {
 341                 if (error != 0) {
 342                         zfs_panic_recover("zfs: error %d was returned "
 343                             "while incrementally destroying the checkpoint "
 344                             "space map of vdev %llu\n",
 345                             error, vd->vdev_id);
 346                 }
 347                 ASSERT0(entries_after);
 348                 ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
 349                 ASSERT0(vd->vdev_checkpoint_sm->sm_length);
 350
 351                 space_map_free(vd->vdev_checkpoint_sm, tx);
 352                 space_map_close(vd->vdev_checkpoint_sm);
 353                 vd->vdev_checkpoint_sm = NULL;
 354
 355                 VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
 356                     vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
 357         }
 358 }
 359
 360 static boolean_t
 361 spa_checkpoint_discard_is_done(spa_t *spa)
 362 {
 363         vdev_t *rvd = spa->spa_root_vdev;
 364
 365         ASSERT(!spa_has_checkpoint(spa));
 366         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
 367
 368         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 369                 if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
 370                         return (B_FALSE);
 371                 ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
 372         }
 373
 374         return (B_TRUE);
 375 }
 376
 377 /* ARGSUSED */
 378 boolean_t
 379 spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
 380 {
 381         spa_t *spa = arg;
 382
 383         if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 384                 return (B_FALSE);
 385
 386         if (spa_has_checkpoint(spa))
 387                 return (B_FALSE);
 388
 389         return (B_TRUE);
 390 }
 391
 392 int
 393 spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
 394 {
 395         spa_t *spa = arg;
 396         vdev_t *rvd = spa->spa_root_vdev;
 397
 398         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 399                 vdev_t *vd = rvd->vdev_child[c];
 400
 401                 while (vd->vdev_checkpoint_sm != NULL) {
 402                         space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
 403                         int numbufs;
 404                         dmu_buf_t **dbp;
 405
 406                         if (zthr_iscancelled(zthr))
 407                                 return (0);
 408
 409                         ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
 410
 411                         uint64_t size = MIN(space_map_length(checkpoint_sm),
 412                             zfs_spa_discard_memory_limit);
 413                         uint64_t offset =
 414                             space_map_length(checkpoint_sm) - size;
 415
 416                         /*
 417                          * Ensure that the part of the space map that will
 418                          * be destroyed by the synctask, is prefetched in
 419                          * memory before the synctask runs.
 420                          */
 421                         int error = dmu_buf_hold_array_by_bonus(
 422                             checkpoint_sm->sm_dbuf, offset, size,
 423                             B_TRUE, FTAG, &numbufs, &dbp);
 424                         if (error != 0) {
 425                                 zfs_panic_recover("zfs: error %d was returned "
 426                                     "while prefetching checkpoint space map "
 427                                     "entries of vdev %llu\n",
 428                                     error, vd->vdev_id);
 429                         }
 430
 431                         VERIFY0(dsl_sync_task(spa->spa_name, NULL,
 432                             spa_checkpoint_discard_thread_sync, vd,
 433                             0, ZFS_SPACE_CHECK_NONE));
 434
 435                         dmu_buf_rele_array(dbp, numbufs, FTAG);
 436                 }
 437         }
 438
 439         VERIFY(spa_checkpoint_discard_is_done(spa));
 440         VERIFY0(spa->spa_checkpoint_info.sci_dspace);
 441         VERIFY0(dsl_sync_task(spa->spa_name, NULL,
 442             spa_checkpoint_discard_complete_sync, spa,
 443             0, ZFS_SPACE_CHECK_NONE));
 444
 445         return (0);
 446 }
 447
 448
 449 /* ARGSUSED */
 450 static int
 451 spa_checkpoint_check(void *arg, dmu_tx_t *tx)
 452 {
 453         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 454
 455         if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
 456                 return (SET_ERROR(ENOTSUP));
 457
 458         if (!spa_top_vdevs_spacemap_addressable(spa))
 459                 return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
 460
 461         if (spa->spa_vdev_removal != NULL)
 462                 return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
 463
 464         if (spa->spa_checkpoint_txg != 0)
 465                 return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
 466
 467         if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 468                 return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
 469
 470         return (0);
 471 }
 472
 473 /* ARGSUSED */
 474 static void
 475 spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
 476 {
 477         dsl_pool_t *dp = dmu_tx_pool(tx);
 478         spa_t *spa = dp->dp_spa;
 479         uberblock_t checkpoint = spa->spa_ubsync;
 480
 481         /*
 482          * At this point, there should not be a checkpoint in the MOS.
 483          */
 484         ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 485             DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
 486
 487         ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
 488         ASSERT0(spa->spa_checkpoint_info.sci_dspace);
 489
 490         /*
 491          * Since the checkpointed uberblock is the one that just got synced
 492          * (we use spa_ubsync), its txg must be equal to the txg number of
 493          * the txg we are syncing, minus 1.
 494          */
 495         ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
 496
 497         /*
 498          * Once the checkpoint is in place, we need to ensure that none of
 499          * its blocks will be marked for reuse after it has been freed.
 500          * When there is a checkpoint and a block is freed, we compare its
 501          * birth txg to the txg of the checkpointed uberblock to see if the
 502          * block is part of the checkpoint or not. Therefore, we have to set
 503          * spa_checkpoint_txg before any frees happen in this txg (which is
 504          * why this is done as an early_synctask as explained in the comment
 505          * in spa_checkpoint()).
 506          */
 507         spa->spa_checkpoint_txg = checkpoint.ub_txg;
 508         spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 509
 510         checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
 511         VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
 512             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
 513             sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
 514             &checkpoint, tx));
 515
 516         /*
 517          * Increment the feature refcount and thus activate the feature.
 518          * Note that the feature will be deactivated when we've
 519          * completely discarded all checkpointed state (both vdev
 520          * space maps and uberblock).
 521          */
 522         spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
 523
 524         spa_history_log_internal(spa, "spa checkpoint", tx,
 525             "checkpointed uberblock txg=%llu", checkpoint.ub_txg);
 526 }
 527
 528 /*
 529  * Create a checkpoint for the pool.
 530  */
 531 int
 532 spa_checkpoint(const char *pool)
 533 {
 534         int error;
 535         spa_t *spa;
 536
 537         error = spa_open(pool, &spa, FTAG);
 538         if (error != 0)
 539                 return (error);
 540
 541         mutex_enter(&spa->spa_vdev_top_lock);
 542
 543         /*
 544          * Wait for current syncing txg to finish so the latest synced
 545          * uberblock (spa_ubsync) has all the changes that we expect
 546          * to see if we were to revert later to the checkpoint. In other
 547          * words we want the checkpointed uberblock to include/reference
 548          * all the changes that were pending at the time that we issued
 549          * the checkpoint command.
 550          */
 551         txg_wait_synced(spa_get_dsl(spa), 0);
 552
 553         /*
 554          * As the checkpointed uberblock references blocks from the previous
 555          * txg (spa_ubsync) we want to ensure that are not freeing any of
 556          * these blocks in the same txg that the following synctask will
 557          * run. Thus, we run it as an early synctask, so the dirty changes
 558          * that are synced to disk afterwards during zios and other synctasks
 559          * do not reuse checkpointed blocks.
 560          */
 561         error = dsl_early_sync_task(pool, spa_checkpoint_check,
 562             spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
 563
 564         mutex_exit(&spa->spa_vdev_top_lock);
 565
 566         spa_close(spa, FTAG);
 567         return (error);
 568 }
 569
 570 /* ARGSUSED */
 571 static int
 572 spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
 573 {
 574         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 575
 576         if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 577                 return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
 578
 579         if (spa->spa_checkpoint_txg == 0)
 580                 return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
 581
 582         VERIFY0(zap_contains(spa_meta_objset(spa),
 583             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
 584
 585         return (0);
 586 }
 587
 588 /* ARGSUSED */
 589 static void
 590 spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
 591 {
 592         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 593
 594         VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 595             DMU_POOL_ZPOOL_CHECKPOINT, tx));
 596
 597         spa->spa_checkpoint_txg = 0;
 598
 599         zthr_wakeup(spa->spa_checkpoint_discard_zthr);
 600
 601         spa_history_log_internal(spa, "spa discard checkpoint", tx,
 602             "started discarding checkpointed state from the pool");
 603 }
 604
 605 /*
 606  * Discard the checkpoint from a pool.
 607  */
 608 int
 609 spa_checkpoint_discard(const char *pool)
 610 {
 611         /*
 612          * Similarly to spa_checkpoint(), we want our synctask to run
 613          * before any pending dirty data are written to disk so they
 614          * won't end up in the checkpoint's data structures (e.g.
 615          * ms_checkpointing and vdev_checkpoint_sm) and re-create any
 616          * space maps that the discarding open-context thread has
 617          * deleted.
 618          * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread]
 619          */
 620         return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
 621             spa_checkpoint_discard_sync, NULL, 0,
 622             ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
 623 }