/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
/*
 * ZFS Transaction Groups
 * ----------------------
 *
 * ZFS transaction groups are, as the name implies, groups of transactions
 * that act on persistent state. ZFS asserts consistency at the granularity of
 * these transaction groups. Each successive transaction group (txg) is
 * assigned a 64-bit consecutive identifier. There are three active
 * transaction group states: open, quiescing, or syncing. At any given time,
 * there may be an active txg associated with each state; each active txg may
 * either be processing, or blocked waiting to enter the next state. There may
 * be up to three active txgs, and there is always a txg in the open state
 * (though it may be blocked waiting to enter the quiescing state). In broad
 * strokes, transactions -- operations that change in-memory structures -- are
 * accepted into the txg in the open state, and are completed while the txg is
 * in the open or quiescing states. The accumulated changes are written to
 * disk in the syncing state.
 *
 * Open
 *
 * When a new txg becomes active, it first enters the open state. New
 * transactions -- updates to in-memory structures -- are assigned to the
 * currently open txg. There is always a txg in the open state so that ZFS can
 * accept new changes (though the txg may refuse new changes if it has hit
 * some limit). ZFS advances the open txg to the next state for a variety of
 * reasons such as it hitting a time or size threshold, or the execution of an
 * administrative action that must be completed in the syncing state.
 *
 * Quiesce
 *
 * After a txg exits the open state, it enters the quiescing state. The
 * quiescing state is intended to provide a buffer between accepting new
 * transactions in the open state and writing them out to stable storage in
 * the syncing state. While quiescing, transactions can continue their
 * operation without delaying either of the other states. Typically, a txg is
 * in the quiescing state very briefly since the operations are bounded by
 * software latencies rather than, say, slower I/O latencies. After all
 * transactions complete, the txg is ready to enter the next state.
 *
 * Sync
 *
 * In the syncing state, the in-memory state built up during the open and (to
 * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
 * example, when we write new blocks, we need to allocate space for them; those
 * allocations modify metadata (space maps)... which themselves must be
 * written to stable storage. During the sync state, ZFS iterates, writing out
 * data until it converges and all in-memory changes have been written out.
 * The first such pass is the largest as it encompasses all the modified user
 * data (as opposed to filesystem metadata). Subsequent passes typically have
 * far less data to write as they consist exclusively of filesystem metadata.
 *
 * To ensure convergence, after a certain number of passes ZFS begins
 * overwriting locations on stable storage that had been allocated earlier in
 * the syncing state (and subsequently freed). ZFS usually allocates new
 * blocks to optimize for large, continuous writes. For the syncing state to
 * converge, however, it must complete a pass where no new blocks are
 * allocated, since each allocation requires a modification of persistent
 * metadata. Further, to hasten convergence, after a prescribed number of
 * passes, ZFS also defers frees, and stops compressing.
 *
 * In addition to writing out user data, we must also execute synctasks during
 * the syncing context. A synctask is the mechanism by which some
 * administrative activities work such as creating and destroying snapshots or
 * datasets. Note that when a synctask is initiated, it enters the open txg,
 * and ZFS then pushes that txg as quickly as possible to completion of the
 * syncing state in order to reduce the latency of the administrative
 * activity. To complete the syncing state, ZFS writes out a new uberblock,
 * the root of the tree of blocks that comprise all state stored on the ZFS
 * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 * now transition to the syncing state.
 */
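/*
 * Illustrative sketch (not part of this file's logic): a DMU consumer
 * normally drives a transaction through the open txg via the dmu_tx
 * interfaces rather than calling into this file directly. The objset "os"
 * and the object/offset/size/buf values below are hypothetical:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	int error = dmu_tx_assign(tx, TXG_WAIT);  (joins the open txg)
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, offset, size, buf, tx);
 *	dmu_tx_commit(tx);  (drops the hold; the txg may now quiesce)
 */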
static void txg_sync_thread(void *arg);
static void txg_quiesce_thread(void *arg);

int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
    "Maximum seconds worth of delta per txg");
/*
 * Prepare the txg subsystem.
 */
void
txg_init(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	bzero(tx, sizeof (tx_state_t));

	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
		    NULL);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
			    NULL);
			list_create(&tx->tx_cpu[c].tc_callbacks[i],
			    sizeof (dmu_tx_callback_t),
			    offsetof(dmu_tx_callback_t, dcb_node));
		}
	}

	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);

	tx->tx_open_txg = txg;
}
/*
 * Close down the txg subsystem.
 */
void
txg_fini(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	ASSERT(tx->tx_threads == 0);

	mutex_destroy(&tx->tx_sync_lock);

	cv_destroy(&tx->tx_sync_more_cv);
	cv_destroy(&tx->tx_sync_done_cv);
	cv_destroy(&tx->tx_quiesce_more_cv);
	cv_destroy(&tx->tx_quiesce_done_cv);
	cv_destroy(&tx->tx_exit_cv);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
		mutex_destroy(&tx->tx_cpu[c].tc_lock);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
		}
	}

	if (tx->tx_commit_cb_taskq != NULL)
		taskq_destroy(tx->tx_commit_cb_taskq);

	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));

	bzero(tx, sizeof (tx_state_t));
}
/*
 * Start syncing transaction groups.
 */
void
txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	ASSERT(tx->tx_threads == 0);

	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	/*
	 * The sync thread can need a larger-than-default stack size on
	 * 32-bit x86.  This is due in part to nested pools and
	 * scrub_visitbp() recursion.
	 */
	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	mutex_exit(&tx->tx_sync_lock);
}
static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
	mutex_enter(&tx->tx_sync_lock);
}
static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
	ASSERT(*tpp != NULL);
	*tpp = NULL;
	tx->tx_threads--;
	cv_broadcast(&tx->tx_exit_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
	thread_exit();
}
static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
	CALLB_CPR_SAFE_BEGIN(cpr);

	if (time)
		(void) cv_timedwait(cv, &tx->tx_sync_lock, time);
	else
		cv_wait(cv, &tx->tx_sync_lock);

	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}
/*
 * Stop syncing transaction groups.
 */
void
txg_sync_stop(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);

	/*
	 * Finish off any work in progress.
	 */
	ASSERT(tx->tx_threads == 2);

	/*
	 * We need to ensure that we've vacated the deferred space_maps.
	 */
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	 * Wake all sync threads and wait for them to die.
	 */
	mutex_enter(&tx->tx_sync_lock);

	ASSERT(tx->tx_threads == 2);

	tx->tx_exiting = 1;

	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	while (tx->tx_threads != 0)
		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
}
uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
	tx_state_t *tx = &dp->dp_tx;
	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
	uint64_t txg;

	mutex_enter(&tc->tc_open_lock);
	txg = tx->tx_open_txg;

	mutex_enter(&tc->tc_lock);
	tc->tc_count[txg & TXG_MASK]++;
	mutex_exit(&tc->tc_lock);

	th->th_cpu = tc;
	th->th_txg = txg;

	return (txg);
}
void
txg_rele_to_quiesce(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;

	ASSERT(!MUTEX_HELD(&tc->tc_lock));
	mutex_exit(&tc->tc_open_lock);
}
void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
	mutex_exit(&tc->tc_lock);
}
void
txg_rele_to_sync(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	ASSERT(tc->tc_count[g] != 0);
	if (--tc->tc_count[g] == 0)
		cv_broadcast(&tc->tc_cv[g]);
	mutex_exit(&tc->tc_lock);

	th->th_cpu = NULL;	/* defensive */
}
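/*
 * Hedged sketch of the hold/release protocol above (normally exercised via
 * dmu_tx_assign()/dmu_tx_commit(), not called directly):
 *
 *	txg_handle_t th;
 *	uint64_t txg = txg_hold_open(dp, &th);	(join open txg, count++)
 *	txg_rele_to_quiesce(&th);		(drop tc_open_lock early)
 *	... record in-memory changes against txg ...
 *	txg_rele_to_sync(&th);			(count--; txg may quiesce)
 */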
/*
 * Blocks until all transactions in the group are committed.
 *
 * On return, the transaction group has reached a stable state in which it can
 * then be passed off to the syncing context.
 */
static __noinline void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int g = txg & TXG_MASK;
	int c;

	/*
	 * Grab all tc_open_locks so nobody else can get into this txg.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_enter(&tx->tx_cpu[c].tc_open_lock);

	ASSERT(txg == tx->tx_open_txg);
	tx->tx_open_txg++;
	tx->tx_open_time = gethrtime();

	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);

	/*
	 * Now that we've incremented tx_open_txg, we can let threads
	 * enter the next transaction group.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_exit(&tx->tx_cpu[c].tc_open_lock);

	/*
	 * Quiesce the transaction group by waiting for everyone to txg_exit().
	 */
	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		mutex_enter(&tc->tc_lock);
		while (tc->tc_count[g] != 0)
			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
		mutex_exit(&tc->tc_lock);
	}
}
static void
txg_do_callbacks(void *arg)
{
	list_t *cb_list = arg;

	dmu_tx_do_callbacks(cb_list, 0);

	list_destroy(cb_list);

	kmem_free(cb_list, sizeof (list_t));
}
/*
 * Dispatch the commit callbacks registered on this txg to worker threads.
 *
 * If no callbacks are registered for a given TXG, nothing happens.
 * This function creates a taskq for the associated pool, if needed.
 */
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;
	list_t *cb_list;

	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		/*
		 * No need to lock tx_cpu_t at this point, since this can
		 * only be called once a txg has been synced.
		 */

		int g = txg & TXG_MASK;

		if (list_is_empty(&tc->tc_callbacks[g]))
			continue;

		if (tx->tx_commit_cb_taskq == NULL) {
			/*
			 * Commit callback taskq hasn't been created yet.
			 */
			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
			    max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
			    TASKQ_PREPOPULATE);
		}

		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(cb_list, sizeof (dmu_tx_callback_t),
		    offsetof(dmu_tx_callback_t, dcb_node));

		list_move_tail(cb_list, &tc->tc_callbacks[g]);

		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
		    txg_do_callbacks, cb_list, TQ_SLEEP);
	}
}
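/*
 * Hedged example of how a commit callback reaches this path: a consumer
 * registers it on an assigned tx via dmu_tx_callback_register(); "my_done"
 * and "my_state" are hypothetical:
 *
 *	void my_done(void *arg, int error);	(runs after the txg syncs)
 *	...
 *	dmu_tx_callback_register(tx, my_done, my_state);
 *	dmu_tx_commit(tx);
 */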
static void
txg_sync_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timeout = zfs_txg_timeout * hz;
		uint64_t timer;
		uint64_t txg;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0 &&
		    dp->dp_dirty_total < zfs_dirty_data_sync) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}
static void
txg_quiesce_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state.  So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    tx->tx_quiesced_txg != 0))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting,
		    tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n", txg);
		tx->tx_quiesced_txg = txg;
		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}
/*
 * Delay this thread by delay nanoseconds if we are still in the open
 * transaction group and there is already a waiting txg quiescing or quiesced.
 * Abort the delay if this txg stalls or enters the quiescing state.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
	tx_state_t *tx = &dp->dp_tx;
	hrtime_t start = gethrtime();

	/* don't delay if this txg could transition to quiescing immediately */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
		return;

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	while (gethrtime() - start < delay &&
	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
		    &tx->tx_sync_lock, delay, resolution, 0);
	}

	mutex_exit(&tx->tx_sync_lock);
}
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT(tx->tx_threads == 2);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%p\n",
		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}
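/*
 * Hedged example: a caller needing durability typically records the txg its
 * tx was assigned to, commits, then waits for that txg to reach stable
 * storage (passing txg == 0 instead waits for all currently open changes):
 *
 *	uint64_t txg = dmu_tx_get_txg(tx);
 *	dmu_tx_commit(tx);
 *	txg_wait_synced(dp, txg);	(returns once txg is on disk)
 */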
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT(tx->tx_threads == 2);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	if (tx->tx_quiesce_txg_waiting < txg)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		cv_broadcast(&tx->tx_quiesce_more_cv);
		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}
/*
 * If there isn't a txg syncing or in the pipeline, push another txg through
 * the pipeline by quiescing the open txg.
 */
void
txg_kick(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_syncing_txg == 0 &&
	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
		cv_broadcast(&tx->tx_quiesce_more_cv);
	}
	mutex_exit(&tx->tx_sync_lock);
}
boolean_t
txg_stalled(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
}
boolean_t
txg_sync_waiting(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
	    tx->tx_quiesced_txg != 0);
}
/*
 * Verify that this txg is active (open, quiescing, syncing).  Non-active
 * txgs should not be manipulated.
 */
void
txg_verify(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa_get_dsl(spa);

	if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
		return;
	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
}
/*
 * Per-txg object lists.
 */
void
txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
{
	int t;

	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);

	tl->tl_offset = offset;
	tl->tl_spa = spa;

	for (t = 0; t < TXG_SIZE; t++)
		tl->tl_head[t] = NULL;
}
void
txg_list_destroy(txg_list_t *tl)
{
	int t;

	for (t = 0; t < TXG_SIZE; t++)
		ASSERT(txg_list_empty(tl, t));

	mutex_destroy(&tl->tl_lock);
}
boolean_t
txg_list_empty(txg_list_t *tl, uint64_t txg)
{
	txg_verify(tl->tl_spa, txg);
	return (tl->tl_head[txg & TXG_MASK] == NULL);
}
/*
 * Returns true if all txg lists are empty.
 *
 * Warning: this is inherently racy (an item could be added immediately
 * after this function returns).  We don't bother with the lock because
 * it wouldn't change the semantics.
 */
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
	for (int i = 0; i < TXG_SIZE; i++) {
		if (!txg_list_empty(tl, i)) {
			return (B_FALSE);
		}
	}
	return (B_TRUE);
}
/*
 * Add an entry to the list (unless it's already on the list).
 * Returns B_TRUE if it was actually added.
 */
boolean_t
txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
	boolean_t add;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);
	add = (tn->tn_member[t] == 0);
	if (add) {
		tn->tn_member[t] = 1;
		tn->tn_next[t] = tl->tl_head[t];
		tl->tl_head[t] = tn;
	}
	mutex_exit(&tl->tl_lock);

	return (add);
}
/*
 * Add an entry to the end of the list, unless it's already on the list.
 * (walks list to find end)
 * Returns B_TRUE if it was actually added.
 */
boolean_t
txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
	boolean_t add;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);
	add = (tn->tn_member[t] == 0);
	if (add) {
		txg_node_t **tp;

		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
			continue;

		tn->tn_member[t] = 1;
		tn->tn_next[t] = NULL;
		*tp = tn;
	}
	mutex_exit(&tl->tl_lock);

	return (add);
}
/*
 * Remove the head of the list and return it.
 */
void *
txg_list_remove(txg_list_t *tl, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn;
	void *p = NULL;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);
	if ((tn = tl->tl_head[t]) != NULL) {
		p = (char *)tn - tl->tl_offset;
		tl->tl_head[t] = tn->tn_next[t];
		tn->tn_next[t] = NULL;
		tn->tn_member[t] = 0;
	}
	mutex_exit(&tl->tl_lock);

	return (p);
}
/*
 * Remove a specific item from the list and return it.
 */
void *
txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn, **tp;

	txg_verify(tl->tl_spa, txg);
	mutex_enter(&tl->tl_lock);

	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
		if ((char *)tn - tl->tl_offset == p) {
			*tp = tn->tn_next[t];
			tn->tn_next[t] = NULL;
			tn->tn_member[t] = 0;
			mutex_exit(&tl->tl_lock);
			return (p);
		}
	}

	mutex_exit(&tl->tl_lock);

	return (NULL);
}
boolean_t
txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

	txg_verify(tl->tl_spa, txg);
	return (tn->tn_member[t] != 0);
}
/*
 * Walk a txg list -- only safe if you know it's not changing.
 */
void *
txg_list_head(txg_list_t *tl, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = tl->tl_head[t];

	txg_verify(tl->tl_spa, txg);
	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}

void *
txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
{
	int t = txg & TXG_MASK;
	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);

	txg_verify(tl->tl_spa, txg);
	tn = tn->tn_next[t];

	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
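/*
 * Hedged walk sketch for the accessors above (only safe while the list is
 * known not to change); "process" is a hypothetical per-object handler:
 *
 *	void *p;
 *	for (p = txg_list_head(tl, txg); p != NULL;
 *	    p = txg_list_next(tl, p, txg))
 *		process(p);
 */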