sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  */
  26
  27 #include <sys/zfs_context.h>
  28 #include <sys/txg_impl.h>
  29 #include <sys/dmu_impl.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dsl_pool.h>
  32 #include <sys/dsl_scan.h>
  33 #include <sys/zil.h>
  34 #include <sys/callb.h>
  35
  36 /*
  37  * ZFS Transaction Groups
  38  * ----------------------
  39  *
  40  * ZFS transaction groups are, as the name implies, groups of transactions
  41  * that act on persistent state. ZFS asserts consistency at the granularity of
  42  * these transaction groups. Each successive transaction group (txg) is
  43  * assigned a 64-bit consecutive identifier. There are three active
  44  * transaction group states: open, quiescing, or syncing. At any given time,
  45  * there may be an active txg associated with each state; each active txg may
  46  * either be processing, or blocked waiting to enter the next state. There may
  47  * be up to three active txgs, and there is always a txg in the open state
  48  * (though it may be blocked waiting to enter the quiescing state). In broad
  49  * strokes, transactions -- operations that change in-memory structures -- are
  50  * accepted into the txg in the open state, and are completed while the txg is
  51  * in the open or quiescing states. The accumulated changes are written to
  52  * disk in the syncing state.
  53  *
  54  * Open
  55  *
  56  * When a new txg becomes active, it first enters the open state. New
  57  * transactions -- updates to in-memory structures -- are assigned to the
  58  * currently open txg. There is always a txg in the open state so that ZFS can
  59  * accept new changes (though the txg may refuse new changes if it has hit
  60  * some limit). ZFS advances the open txg to the next state for a variety of
  61  * reasons such as it hitting a time or size threshold, or the execution of an
  62  * administrative action that must be completed in the syncing state.
  63  *
  64  * Quiescing
  65  *
  66  * After a txg exits the open state, it enters the quiescing state. The
  67  * quiescing state is intended to provide a buffer between accepting new
  68  * transactions in the open state and writing them out to stable storage in
  69  * the syncing state. While quiescing, transactions can continue their
  70  * operation without delaying either of the other states. Typically, a txg is
  71  * in the quiescing state very briefly since the operations are bounded by
  72  * software latencies rather than, say, slower I/O latencies. After all
  73  * transactions complete, the txg is ready to enter the next state.
  74  *
  75  * Syncing
  76  *
  77  * In the syncing state, the in-memory state built up during the open and (to
  78  * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
  80  * example when we write new blocks, we need to allocate space for them; those
  81  * allocations modify metadata (space maps)... which themselves must be
  82  * written to stable storage. During the sync state, ZFS iterates, writing out
  83  * data until it converges and all in-memory changes have been written out.
  84  * The first such pass is the largest as it encompasses all the modified user
  85  * data (as opposed to filesystem metadata). Subsequent passes typically have
  86  * far less data to write as they consist exclusively of filesystem metadata.
  87  *
  88  * To ensure convergence, after a certain number of passes ZFS begins
  89  * overwriting locations on stable storage that had been allocated earlier in
  90  * the syncing state (and subsequently freed). ZFS usually allocates new
  91  * blocks to optimize for large, continuous, writes. For the syncing state to
  92  * converge however it must complete a pass where no new blocks are allocated
  93  * since each allocation requires a modification of persistent metadata.
  94  * Further, to hasten convergence, after a prescribed number of passes, ZFS
  95  * also defers frees, and stops compressing.
  96  *
  97  * In addition to writing out user data, we must also execute synctasks during
  98  * the syncing context. A synctask is the mechanism by which some
  99  * administrative activities work such as creating and destroying snapshots or
 100  * datasets. Note that when a synctask is initiated it enters the open txg,
 101  * and ZFS then pushes that txg as quickly as possible to completion of the
 102  * syncing state in order to reduce the latency of the administrative
 103  * activity. To complete the syncing state, ZFS writes out a new uberblock,
 104  * the root of the tree of blocks that comprise all state stored on the ZFS
 105  * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 106  * now transition to the syncing state.
 107  */
 108
/* Entry points of the two worker threads started by txg_sync_start(). */
static void txg_sync_thread(void *arg);
static void txg_quiesce_thread(void *arg);

/*
 * Idle timeout of the sync thread, in seconds.  txg_sync_thread() multiplies
 * it by hz to get the number of ticks it may wait before syncing anyway.
 */
int zfs_txg_timeout = 5;        /* max seconds worth of delta per txg */

/*
 * Publish the timeout under the "txg" node of _vfs_zfs.  CTLFLAG_RWTUN is
 * presumably "read/write and settable as a loader tunable" -- confirm
 * against sysctl(9).
 */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
    "Maximum seconds worth of delta per txg");
 118
/*
 * Prepare the txg subsystem.
 *
 * Zeroes the tx_state_t embedded in 'dp', allocates one tx_cpu_t per
 * possible CPU (max_ncpus) and initializes every mutex, condition variable
 * and callback list it contains.  'txg' is recorded as the open txg.
 * The allocation uses KM_SLEEP.
 */
void
txg_init(dsl_pool_t *dp, uint64_t txg)
{
        tx_state_t *tx = &dp->dp_tx;
        int c;
        bzero(tx, sizeof (tx_state_t));

        tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);

        /* Per-CPU state: two locks, plus a cv and a list per txg slot. */
        for (c = 0; c < max_ncpus; c++) {
                int i;

                mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
                    NULL);
                for (i = 0; i < TXG_SIZE; i++) {
                        cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
                            NULL);
                        list_create(&tx->tx_cpu[c].tc_callbacks[i],
                            sizeof (dmu_tx_callback_t),
                            offsetof(dmu_tx_callback_t, dcb_node));
                }
        }

        /* Pool-wide lock and the condition variables that pair with it. */
        mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);

        cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);

        tx->tx_open_txg = txg;
}
 156
 157 /*
 158  * Close down the txg subsystem.
 159  */
 160 void
 161 txg_fini(dsl_pool_t *dp)
 162 {
 163         tx_state_t *tx = &dp->dp_tx;
 164         int c;
 165
 166         ASSERT0(tx->tx_threads);
 167
 168         mutex_destroy(&tx->tx_sync_lock);
 169
 170         cv_destroy(&tx->tx_sync_more_cv);
 171         cv_destroy(&tx->tx_sync_done_cv);
 172         cv_destroy(&tx->tx_quiesce_more_cv);
 173         cv_destroy(&tx->tx_quiesce_done_cv);
 174         cv_destroy(&tx->tx_exit_cv);
 175
 176         for (c = 0; c < max_ncpus; c++) {
 177                 int i;
 178
 179                 mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
 180                 mutex_destroy(&tx->tx_cpu[c].tc_lock);
 181                 for (i = 0; i < TXG_SIZE; i++) {
 182                         cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
 183                         list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
 184                 }
 185         }
 186
 187         if (tx->tx_commit_cb_taskq != NULL)
 188                 taskq_destroy(tx->tx_commit_cb_taskq);
 189
 190         kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
 191
 192         bzero(tx, sizeof (tx_state_t));
 193 }
 194
/*
 * Start syncing transaction groups.
 *
 * Creates the quiesce thread and the sync thread for 'dp' while holding
 * tx_sync_lock.  No worker threads may be running yet (tx_threads is
 * asserted to be zero); tx_threads is set to 2 before either thread is
 * created.
 */
void
txg_sync_start(dsl_pool_t *dp)
{
        tx_state_t *tx = &dp->dp_tx;

        mutex_enter(&tx->tx_sync_lock);

        dprintf("pool %p\n", dp);

        ASSERT0(tx->tx_threads);

        tx->tx_threads = 2;

        tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
            dp, 0, &p0, TS_RUN, minclsyspri);

        /*
         * The sync thread can need a larger-than-default stack size on
         * 32-bit x86.  This is due in part to nested pools and
         * scrub_visitbp() recursion.
         */
        tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
            dp, 0, &p0, TS_RUN, minclsyspri);

        mutex_exit(&tx->tx_sync_lock);
}
 224
/*
 * Common prologue of the worker threads: set up 'cpr' with tx_sync_lock as
 * its associated lock, then acquire tx_sync_lock.  Returns with the lock
 * held.
 */
static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
        CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
        mutex_enter(&tx->tx_sync_lock);
}
 231
/*
 * Common epilogue of the worker threads.
 *
 * Clears the thread pointer '*tpp', decrements tx_threads and wakes anyone
 * waiting on tx_exit_cv (txg_sync_stop() waits there for tx_threads to
 * reach zero).  CALLB_CPR_EXIT() then releases tx_sync_lock and the thread
 * terminates via thread_exit().
 */
static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
        ASSERT(*tpp != NULL);
        *tpp = NULL;
        tx->tx_threads--;
        cv_broadcast(&tx->tx_exit_cv);
        CALLB_CPR_EXIT(cpr);            /* drops &tx->tx_sync_lock */
        thread_exit();
}
 242
 243 static void
 244 txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
 245 {
 246         CALLB_CPR_SAFE_BEGIN(cpr);
 247
 248         if (time)
 249                 (void) cv_timedwait(cv, &tx->tx_sync_lock, time);
 250         else
 251                 cv_wait(cv, &tx->tx_sync_lock);
 252
 253         CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
 254 }
 255
/*
 * Stop syncing transaction groups.
 *
 * Waits for the txg at tx_open_txg + TXG_DEFER_SIZE to be synced, then sets
 * tx_exiting, wakes both worker threads and blocks on tx_exit_cv until
 * tx_threads has dropped to zero.  tx_exiting is cleared again before
 * returning.  Both worker threads must be running on entry.
 */
void
txg_sync_stop(dsl_pool_t *dp)
{
        tx_state_t *tx = &dp->dp_tx;

        dprintf("pool %p\n", dp);
        /*
         * Finish off any work in progress.
         */
        ASSERT3U(tx->tx_threads, ==, 2);

        /*
         * We need to ensure that we've vacated the deferred space_maps.
         */
        txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

        /*
         * Wake all sync threads and wait for them to die.
         */
        mutex_enter(&tx->tx_sync_lock);

        ASSERT3U(tx->tx_threads, ==, 2);

        tx->tx_exiting = 1;

        /* Kick every cv a worker thread may be sleeping on. */
        cv_broadcast(&tx->tx_quiesce_more_cv);
        cv_broadcast(&tx->tx_quiesce_done_cv);
        cv_broadcast(&tx->tx_sync_more_cv);

        /* txg_thread_exit() decrements tx_threads and signals tx_exit_cv. */
        while (tx->tx_threads != 0)
                cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

        tx->tx_exiting = 0;

        mutex_exit(&tx->tx_sync_lock);
}
 295
 296 uint64_t
 297 txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 298 {
 299         tx_state_t *tx = &dp->dp_tx;
 300         tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
 301         uint64_t txg;
 302
 303         mutex_enter(&tc->tc_open_lock);
 304         txg = tx->tx_open_txg;
 305
 306         mutex_enter(&tc->tc_lock);
 307         tc->tc_count[txg & TXG_MASK]++;
 308         mutex_exit(&tc->tc_lock);
 309
 310         th->th_cpu = tc;
 311         th->th_txg = txg;
 312
 313         return (txg);
 314 }
 315
/*
 * Drop the tc_open_lock of the CPU slot recorded in 'th' (the lock that
 * txg_hold_open() leaves held).  The caller must not be holding that slot's
 * tc_lock.  The hold count taken by txg_hold_open() is not touched here.
 */
void
txg_rele_to_quiesce(txg_handle_t *th)
{
        tx_cpu_t *tc = th->th_cpu;

        ASSERT(!MUTEX_HELD(&tc->tc_lock));
        mutex_exit(&tc->tc_open_lock);
}
 324
 325 void
 326 txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
 327 {
 328         tx_cpu_t *tc = th->th_cpu;
 329         int g = th->th_txg & TXG_MASK;
 330
 331         mutex_enter(&tc->tc_lock);
 332         list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
 333         mutex_exit(&tc->tc_lock);
 334 }
 335
 336 void
 337 txg_rele_to_sync(txg_handle_t *th)
 338 {
 339         tx_cpu_t *tc = th->th_cpu;
 340         int g = th->th_txg & TXG_MASK;
 341
 342         mutex_enter(&tc->tc_lock);
 343         ASSERT(tc->tc_count[g] != 0);
 344         if (--tc->tc_count[g] == 0)
 345                 cv_broadcast(&tc->tc_cv[g]);
 346         mutex_exit(&tc->tc_lock);
 347
 348         th->th_cpu = NULL;      /* defensive */
 349 }
 350
/*
 * Blocks until all transactions in the group are committed.
 *
 * On return, the transaction group has reached a stable state in which it can
 * then be passed off to the syncing context.
 *
 * 'txg' must be the currently open txg.  The function advances tx_open_txg
 * while holding every CPU's tc_open_lock, then waits for each CPU's hold
 * count on 'txg' to fall to zero.
 */
static __noinline void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
        tx_state_t *tx = &dp->dp_tx;
        int g = txg & TXG_MASK;
        int c;

        /*
         * Grab all tc_open_locks so nobody else can get into this txg.
         */
        for (c = 0; c < max_ncpus; c++)
                mutex_enter(&tx->tx_cpu[c].tc_open_lock);

        ASSERT(txg == tx->tx_open_txg);
        tx->tx_open_txg++;
        tx->tx_open_time = gethrtime();

        DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
        DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);

        /*
         * Now that we've incremented tx_open_txg, we can let threads
         * enter the next transaction group.
         */
        for (c = 0; c < max_ncpus; c++)
                mutex_exit(&tx->tx_cpu[c].tc_open_lock);

        /*
         * Quiesce the transaction group by waiting for every hold on it
         * to be dropped, i.e. for tc_count[g] to reach zero on each CPU.
         * txg_rele_to_sync() does the decrement and the cv_broadcast().
         */
        for (c = 0; c < max_ncpus; c++) {
                tx_cpu_t *tc = &tx->tx_cpu[c];
                mutex_enter(&tc->tc_lock);
                while (tc->tc_count[g] != 0)
                        cv_wait(&tc->tc_cv[g], &tc->tc_lock);
                mutex_exit(&tc->tc_lock);
        }
}
 395
 396 static void
 397 txg_do_callbacks(void *arg)
 398 {
 399         list_t *cb_list = arg;
 400
 401         dmu_tx_do_callbacks(cb_list, 0);
 402
 403         list_destroy(cb_list);
 404
 405         kmem_free(cb_list, sizeof (list_t));
 406 }
 407
 408 /*
 409  * Dispatch the commit callbacks registered on this txg to worker threads.
 410  *
 411  * If no callbacks are registered for a given TXG, nothing happens.
 412  * This function creates a taskq for the associated pool, if needed.
 413  */
 414 static void
 415 txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 416 {
 417         int c;
 418         tx_state_t *tx = &dp->dp_tx;
 419         list_t *cb_list;
 420
 421         for (c = 0; c < max_ncpus; c++) {
 422                 tx_cpu_t *tc = &tx->tx_cpu[c];
 423                 /*
 424                  * No need to lock tx_cpu_t at this point, since this can
 425                  * only be called once a txg has been synced.
 426                  */
 427
 428                 int g = txg & TXG_MASK;
 429
 430                 if (list_is_empty(&tc->tc_callbacks[g]))
 431                         continue;
 432
 433                 if (tx->tx_commit_cb_taskq == NULL) {
 434                         /*
 435                          * Commit callback taskq hasn't been created yet.
 436                          */
 437                         tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
 438                             max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
 439                             TASKQ_PREPOPULATE);
 440                 }
 441
 442                 cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 443                 list_create(cb_list, sizeof (dmu_tx_callback_t),
 444                     offsetof(dmu_tx_callback_t, dcb_node));
 445
 446                 list_move_tail(cb_list, &tc->tc_callbacks[g]);
 447
 448                 (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
 449                     txg_do_callbacks, cb_list, TQ_SLEEP);
 450         }
 451 }
 452
 453 static boolean_t
 454 txg_is_syncing(dsl_pool_t *dp)
 455 {
 456         tx_state_t *tx = &dp->dp_tx;
 457         ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
 458         return (tx->tx_syncing_txg != 0);
 459 }
 460
 461 static boolean_t
 462 txg_is_quiescing(dsl_pool_t *dp)
 463 {
 464         tx_state_t *tx = &dp->dp_tx;
 465         ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
 466         return (tx->tx_quiescing_txg != 0);
 467 }
 468
 469 static boolean_t
 470 txg_has_quiesced_to_sync(dsl_pool_t *dp)
 471 {
 472         tx_state_t *tx = &dp->dp_tx;
 473         ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
 474         return (tx->tx_quiesced_txg != 0);
 475 }
 476
/*
 * Body of the sync thread created by txg_sync_start(); 'arg' is the
 * dsl_pool_t.
 *
 * Each pass of the loop (1) idles until there is a reason to sync,
 * (2) waits for the quiesce thread to hand over a quiesced txg, prompting
 * it if necessary, (3) runs spa_sync() on that txg with tx_sync_lock
 * dropped, and (4) records the txg as synced, wakes waiters and dispatches
 * its commit callbacks.  The thread leaves through txg_thread_exit() once
 * tx_exiting is set.
 */
static void
txg_sync_thread(void *arg)
{
        dsl_pool_t *dp = arg;
        spa_t *spa = dp->dp_spa;
        tx_state_t *tx = &dp->dp_tx;
        callb_cpr_t cpr;
        uint64_t start, delta;

        /* Returns with tx_sync_lock held. */
        txg_thread_enter(tx, &cpr);

        /*
         * 'start' is the lbolt value sampled just before the last
         * spa_sync(); 'delta' is the number of ticks elapsed since then.
         */
        start = delta = 0;
        for (;;) {
                uint64_t timeout = zfs_txg_timeout * hz;
                uint64_t timer;
                uint64_t txg;

                /*
                 * We sync when we're scanning, there's someone waiting
                 * on us, or the quiesce thread has handed off a txg to
                 * us, or we have reached our timeout.
                 */
                timer = (delta >= timeout ? 0 : timeout - delta);
                while (!dsl_scan_active(dp->dp_scan) &&
                    !tx->tx_exiting && timer > 0 &&
                    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
                    !txg_has_quiesced_to_sync(dp) &&
                    dp->dp_dirty_total < zfs_dirty_data_sync) {
                        dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
                            tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
                        txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
                        /* Recompute how much of the timeout is left. */
                        delta = ddi_get_lbolt() - start;
                        timer = (delta > timeout ? 0 : timeout - delta);
                }

                /*
                 * Wait until the quiesce thread hands off a txg to us,
                 * prompting it to do so if necessary.
                 */
                while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
                        if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
                                tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
                        cv_broadcast(&tx->tx_quiesce_more_cv);
                        txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
                }

                /* txg_thread_exit() ends in thread_exit(). */
                if (tx->tx_exiting)
                        txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

                /*
                 * Consume the quiesced txg which has been handed off to
                 * us.  This may cause the quiescing thread to now be
                 * able to quiesce another txg, so we must signal it.
                 */
                ASSERT(tx->tx_quiesced_txg != 0);
                txg = tx->tx_quiesced_txg;
                tx->tx_quiesced_txg = 0;
                tx->tx_syncing_txg = txg;
                DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
                cv_broadcast(&tx->tx_quiesce_more_cv);

                dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
                    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
                mutex_exit(&tx->tx_sync_lock);

                /* The sync itself runs without tx_sync_lock. */
                start = ddi_get_lbolt();
                spa_sync(spa, txg);
                delta = ddi_get_lbolt() - start;

                /* Publish the result and wake txg_wait_synced() callers. */
                mutex_enter(&tx->tx_sync_lock);
                tx->tx_synced_txg = txg;
                tx->tx_syncing_txg = 0;
                DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
                cv_broadcast(&tx->tx_sync_done_cv);

                /*
                 * Dispatch commit callbacks to worker threads.
                 */
                txg_dispatch_callbacks(dp, txg);
        }
}
 558
/*
 * Body of the quiesce thread created by txg_sync_start(); 'arg' is the
 * dsl_pool_t.
 *
 * Each pass of the loop waits until a quiesce has been requested
 * (tx_quiesce_txg_waiting is ahead of tx_open_txg) and no quiesced txg is
 * still awaiting the sync thread, quiesces the open txg with tx_sync_lock
 * dropped, and then publishes it in tx_quiesced_txg for the sync thread.
 * The thread leaves through txg_thread_exit() once tx_exiting is set.
 */
static void
txg_quiesce_thread(void *arg)
{
        dsl_pool_t *dp = arg;
        tx_state_t *tx = &dp->dp_tx;
        callb_cpr_t cpr;

        /* Returns with tx_sync_lock held. */
        txg_thread_enter(tx, &cpr);

        for (;;) {
                uint64_t txg;

                /*
                 * We quiesce when there's someone waiting on us.
                 * However, we can only have one txg in "quiescing" or
                 * "quiesced, waiting to sync" state.  So we wait until
                 * the "quiesced, waiting to sync" txg has been consumed
                 * by the sync thread.
                 */
                while (!tx->tx_exiting &&
                    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
                    txg_has_quiesced_to_sync(dp)))
                        txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

                /* txg_thread_exit() ends in thread_exit(). */
                if (tx->tx_exiting)
                        txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

                txg = tx->tx_open_txg;
                dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
                    txg, tx->tx_quiesce_txg_waiting,
                    tx->tx_sync_txg_waiting);
                tx->tx_quiescing_txg = txg;

                /* txg_quiesce() blocks, so run it without tx_sync_lock. */
                mutex_exit(&tx->tx_sync_lock);
                txg_quiesce(dp, txg);
                mutex_enter(&tx->tx_sync_lock);

                /*
                 * Hand this txg off to the sync thread.
                 */
                dprintf("quiesce done, handing off txg %llu\n", txg);
                tx->tx_quiescing_txg = 0;
                tx->tx_quiesced_txg = txg;
                DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
                cv_broadcast(&tx->tx_sync_more_cv);
                cv_broadcast(&tx->tx_quiesce_done_cv);
        }
}
 607
/*
 * Delay this thread by delay nanoseconds if we are still in the open
 * transaction group and there is already a waiting txg quiescing or quiesced.
 * Abort the delay if this txg stalls or enters the quiescing state.
 *
 * 'resolution' is passed through to cv_timedwait_hires() unchanged.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
        tx_state_t *tx = &dp->dp_tx;
        hrtime_t start = gethrtime();

        /* don't delay if this txg could transition to quiescing immediately */
        /* (this first check is made without tx_sync_lock) */
        if (tx->tx_open_txg > txg ||
            tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
                return;

        /* Re-check under the lock before committing to a wait. */
        mutex_enter(&tx->tx_sync_lock);
        if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
                mutex_exit(&tx->tx_sync_lock);
                return;
        }

        /*
         * NOTE(review): every pass hands the full 'delay' to
         * cv_timedwait_hires() rather than the time remaining.  If that
         * timeout is relative, a wakeup that leaves the loop condition
         * true can stretch the total wait beyond 'delay' -- confirm.
         */
        while (gethrtime() - start < delay &&
            tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
                (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
                    &tx->tx_sync_lock, delay, resolution, 0);
        }

        mutex_exit(&tx->tx_sync_lock);
}
 638
 639 void
 640 txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
 641 {
 642         tx_state_t *tx = &dp->dp_tx;
 643
 644         ASSERT(!dsl_pool_config_held(dp));
 645
 646         mutex_enter(&tx->tx_sync_lock);
 647         ASSERT3U(tx->tx_threads, ==, 2);
 648         if (txg == 0)
 649                 txg = tx->tx_open_txg + TXG_DEFER_SIZE;
 650         if (tx->tx_sync_txg_waiting < txg)
 651                 tx->tx_sync_txg_waiting = txg;
 652         dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 653             txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 654         while (tx->tx_synced_txg < txg) {
 655                 dprintf("broadcasting sync more "
 656                     "tx_synced=%llu waiting=%llu dp=%p\n",
 657                     tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 658                 cv_broadcast(&tx->tx_sync_more_cv);
 659                 cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
 660         }
 661         mutex_exit(&tx->tx_sync_lock);
 662 }
 663
 664 void
 665 txg_wait_open(dsl_pool_t *dp, uint64_t txg)
 666 {
 667         tx_state_t *tx = &dp->dp_tx;
 668
 669         ASSERT(!dsl_pool_config_held(dp));
 670
 671         mutex_enter(&tx->tx_sync_lock);
 672         ASSERT3U(tx->tx_threads, ==, 2);
 673         if (txg == 0)
 674                 txg = tx->tx_open_txg + 1;
 675         if (tx->tx_quiesce_txg_waiting < txg)
 676                 tx->tx_quiesce_txg_waiting = txg;
 677         dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 678             txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 679         while (tx->tx_open_txg < txg) {
 680                 cv_broadcast(&tx->tx_quiesce_more_cv);
 681                 cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
 682         }
 683         mutex_exit(&tx->tx_sync_lock);
 684 }
 685
 686 /*
 687  * If there isn't a txg syncing or in the pipeline, push another txg through
 688  * the pipeline by queiscing the open txg.
 689  */
 690 void
 691 txg_kick(dsl_pool_t *dp)
 692 {
 693         tx_state_t *tx = &dp->dp_tx;
 694
 695         ASSERT(!dsl_pool_config_held(dp));
 696
 697         mutex_enter(&tx->tx_sync_lock);
 698         if (!txg_is_syncing(dp) &&
 699             !txg_is_quiescing(dp) &&
 700             tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
 701             tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
 702             tx->tx_quiesced_txg <= tx->tx_synced_txg) {
 703                 tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
 704                 cv_broadcast(&tx->tx_quiesce_more_cv);
 705         }
 706         mutex_exit(&tx->tx_sync_lock);
 707 }
 708
 709 boolean_t
 710 txg_stalled(dsl_pool_t *dp)
 711 {
 712         tx_state_t *tx = &dp->dp_tx;
 713         return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
 714 }
 715
 716 boolean_t
 717 txg_sync_waiting(dsl_pool_t *dp)
 718 {
 719         tx_state_t *tx = &dp->dp_tx;
 720
 721         return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
 722             tx->tx_quiesced_txg != 0);
 723 }
 724
 725 /*
 726  * Verify that this txg is active (open, quiescing, syncing).  Non-active
 727  * txg's should not be manipulated.
 728  */
 729 void
 730 txg_verify(spa_t *spa, uint64_t txg)
 731 {
 732         dsl_pool_t *dp = spa_get_dsl(spa);
 733         if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
 734                 return;
 735         ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
 736         ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
 737         ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
 738 }
 739
 740 /*
 741  * Per-txg object lists.
 742  */
 743 void
 744 txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
 745 {
 746         int t;
 747
 748         mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
 749
 750         tl->tl_offset = offset;
 751         tl->tl_spa = spa;
 752
 753         for (t = 0; t < TXG_SIZE; t++)
 754                 tl->tl_head[t] = NULL;
 755 }
 756
 757 void
 758 txg_list_destroy(txg_list_t *tl)
 759 {
 760         int t;
 761
 762         for (t = 0; t < TXG_SIZE; t++)
 763                 ASSERT(txg_list_empty(tl, t));
 764
 765         mutex_destroy(&tl->tl_lock);
 766 }
 767
 768 boolean_t
 769 txg_list_empty(txg_list_t *tl, uint64_t txg)
 770 {
 771         txg_verify(tl->tl_spa, txg);
 772         return (tl->tl_head[txg & TXG_MASK] == NULL);
 773 }
 774
 775 /*
 776  * Returns true if all txg lists are empty.
 777  *
 778  * Warning: this is inherently racy (an item could be added immediately
 779  * after this function returns). We don't bother with the lock because
 780  * it wouldn't change the semantics.
 781  */
 782 boolean_t
 783 txg_all_lists_empty(txg_list_t *tl)
 784 {
 785         for (int i = 0; i < TXG_SIZE; i++) {
 786                 if (!txg_list_empty(tl, i)) {
 787                         return (B_FALSE);
 788                 }
 789         }
 790         return (B_TRUE);
 791 }
 792
 793 /*
 794  * Add an entry to the list (unless it's already on the list).
 795  * Returns B_TRUE if it was actually added.
 796  */
 797 boolean_t
 798 txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
 799 {
 800         int t = txg & TXG_MASK;
 801         txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 802         boolean_t add;
 803
 804         txg_verify(tl->tl_spa, txg);
 805         mutex_enter(&tl->tl_lock);
 806         add = (tn->tn_member[t] == 0);
 807         if (add) {
 808                 tn->tn_member[t] = 1;
 809                 tn->tn_next[t] = tl->tl_head[t];
 810                 tl->tl_head[t] = tn;
 811         }
 812         mutex_exit(&tl->tl_lock);
 813
 814         return (add);
 815 }
 816
 817 /*
 818  * Add an entry to the end of the list, unless it's already on the list.
 819  * (walks list to find end)
 820  * Returns B_TRUE if it was actually added.
 821  */
 822 boolean_t
 823 txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
 824 {
 825         int t = txg & TXG_MASK;
 826         txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 827         boolean_t add;
 828
 829         txg_verify(tl->tl_spa, txg);
 830         mutex_enter(&tl->tl_lock);
 831         add = (tn->tn_member[t] == 0);
 832         if (add) {
 833                 txg_node_t **tp;
 834
 835                 for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
 836                         continue;
 837
 838                 tn->tn_member[t] = 1;
 839                 tn->tn_next[t] = NULL;
 840                 *tp = tn;
 841         }
 842         mutex_exit(&tl->tl_lock);
 843
 844         return (add);
 845 }
 846
 847 /*
 848  * Remove the head of the list and return it.
 849  */
 850 void *
 851 txg_list_remove(txg_list_t *tl, uint64_t txg)
 852 {
 853         int t = txg & TXG_MASK;
 854         txg_node_t *tn;
 855         void *p = NULL;
 856
 857         txg_verify(tl->tl_spa, txg);
 858         mutex_enter(&tl->tl_lock);
 859         if ((tn = tl->tl_head[t]) != NULL) {
 860                 ASSERT(tn->tn_member[t]);
 861                 ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
 862                 p = (char *)tn - tl->tl_offset;
 863                 tl->tl_head[t] = tn->tn_next[t];
 864                 tn->tn_next[t] = NULL;
 865                 tn->tn_member[t] = 0;
 866         }
 867         mutex_exit(&tl->tl_lock);
 868
 869         return (p);
 870 }
 871
 872 /*
 873  * Remove a specific item from the list and return it.
 874  */
 875 void *
 876 txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
 877 {
 878         int t = txg & TXG_MASK;
 879         txg_node_t *tn, **tp;
 880
 881         txg_verify(tl->tl_spa, txg);
 882         mutex_enter(&tl->tl_lock);
 883
 884         for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
 885                 if ((char *)tn - tl->tl_offset == p) {
 886                         *tp = tn->tn_next[t];
 887                         tn->tn_next[t] = NULL;
 888                         tn->tn_member[t] = 0;
 889                         mutex_exit(&tl->tl_lock);
 890                         return (p);
 891                 }
 892         }
 893
 894         mutex_exit(&tl->tl_lock);
 895
 896         return (NULL);
 897 }
 898
 899 boolean_t
 900 txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
 901 {
 902         int t = txg & TXG_MASK;
 903         txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 904
 905         txg_verify(tl->tl_spa, txg);
 906         return (tn->tn_member[t] != 0);
 907 }
 908
 909 /*
 910  * Walk a txg list -- only safe if you know it's not changing.
 911  */
 912 void *
 913 txg_list_head(txg_list_t *tl, uint64_t txg)
 914 {
 915         int t = txg & TXG_MASK;
 916         txg_node_t *tn = tl->tl_head[t];
 917
 918         txg_verify(tl->tl_spa, txg);
 919         return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
 920 }
 921
 922 void *
 923 txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
 924 {
 925         int t = txg & TXG_MASK;
 926         txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 927
 928         txg_verify(tl->tl_spa, txg);
 929         tn = tn->tn_next[t];
 930
 931         return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
 932 }