sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25
  26 #include <sys/dsl_pool.h>
  27 #include <sys/dsl_dataset.h>
  28 #include <sys/dsl_prop.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_synctask.h>
  31 #include <sys/dsl_scan.h>
  32 #include <sys/dnode.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/dmu_objset.h>
  35 #include <sys/arc.h>
  36 #include <sys/zap.h>
  37 #include <sys/zio.h>
  38 #include <sys/zfs_context.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/spa_impl.h>
  42 #include <sys/dsl_deadlist.h>
  43 #include <sys/bptree.h>
  44 #include <sys/zfeature.h>
  45 #include <sys/zil_impl.h>
  46
  47 int zfs_no_write_throttle = 0;
  48 int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  49 int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  50
  51 uint64_t zfs_write_limit_min = 32 << 20;        /* min write limit is 32MB */
  52 uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
  53 uint64_t zfs_write_limit_inflated = 0;
  54 uint64_t zfs_write_limit_override = 0;
  55
  56 kmutex_t zfs_write_limit_lock;
  57
  58 static pgcnt_t old_physmem = 0;
  59
  60 SYSCTL_DECL(_vfs_zfs);
  61 TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle);
  62 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN,
  63     &zfs_no_write_throttle, 0, "");
  64 TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift);
  65 SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN,
  66     &zfs_write_limit_shift, 0, "2^N of physical memory");
  67 SYSCTL_DECL(_vfs_zfs_txg);
  68 TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms);
  69 SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN,
  70     &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg");
  71
  72 TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min);
  73 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN,
  74     &zfs_write_limit_min, 0, "Minimum write limit");
  75 TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max);
  76 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN,
  77     &zfs_write_limit_max, 0, "Maximum data payload per txg");
  78 TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated);
  79 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN,
  80     &zfs_write_limit_inflated, 0, "");
  81 TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override);
  82 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
  83     &zfs_write_limit_override, 0, "");
  84
  85 int
  86 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  87 {
  88         uint64_t obj;
  89         int err;
  90
  91         err = zap_lookup(dp->dp_meta_objset,
  92             dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
  93             name, sizeof (obj), 1, &obj);
  94         if (err)
  95                 return (err);
  96
  97         return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
  98 }
  99
 100 static dsl_pool_t *
 101 dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 102 {
 103         dsl_pool_t *dp;
 104         blkptr_t *bp = spa_get_rootblkptr(spa);
 105
 106         dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 107         dp->dp_spa = spa;
 108         dp->dp_meta_rootbp = *bp;
 109         rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
 110         dp->dp_write_limit = zfs_write_limit_min;
 111         txg_init(dp, txg);
 112
 113         txg_list_create(&dp->dp_dirty_datasets,
 114             offsetof(dsl_dataset_t, ds_dirty_link));
 115         txg_list_create(&dp->dp_dirty_zilogs,
 116             offsetof(zilog_t, zl_dirty_link));
 117         txg_list_create(&dp->dp_dirty_dirs,
 118             offsetof(dsl_dir_t, dd_dirty_link));
 119         txg_list_create(&dp->dp_sync_tasks,
 120             offsetof(dsl_sync_task_group_t, dstg_node));
 121
 122         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 123
 124         dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 125             1, 4, 0);
 126
 127         return (dp);
 128 }
 129
 130 int
 131 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 132 {
 133         int err;
 134         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 135
 136         err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 137             &dp->dp_meta_objset);
 138         if (err != 0)
 139                 dsl_pool_close(dp);
 140         else
 141                 *dpp = dp;
 142
 143         return (err);
 144 }
 145
 146 int
 147 dsl_pool_open(dsl_pool_t *dp)
 148 {
 149         int err;
 150         dsl_dir_t *dd;
 151         dsl_dataset_t *ds;
 152         uint64_t obj;
 153
 154         ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset));
 155
 156         rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 157         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 158             DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 159             &dp->dp_root_dir_obj);
 160         if (err)
 161                 goto out;
 162
 163         err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 164             NULL, dp, &dp->dp_root_dir);
 165         if (err)
 166                 goto out;
 167
 168         err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 169         if (err)
 170                 goto out;
 171
 172         if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 173                 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 174                 if (err)
 175                         goto out;
 176                 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 177                     FTAG, &ds);
 178                 if (err == 0) {
 179                         err = dsl_dataset_hold_obj(dp,
 180                             ds->ds_phys->ds_prev_snap_obj, dp,
 181                             &dp->dp_origin_snap);
 182                         dsl_dataset_rele(ds, FTAG);
 183                 }
 184                 dsl_dir_close(dd, dp);
 185                 if (err)
 186                         goto out;
 187         }
 188
 189         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 190                 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 191                     &dp->dp_free_dir);
 192                 if (err)
 193                         goto out;
 194
 195                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 196                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 197                 if (err)
 198                         goto out;
 199                 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
 200                     dp->dp_meta_objset, obj));
 201         }
 202
 203         if (spa_feature_is_active(dp->dp_spa,
 204             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 205                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 206                     DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 207                     &dp->dp_bptree_obj);
 208                 if (err != 0)
 209                         goto out;
 210         }
 211
 212         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 213             DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 214             &dp->dp_tmp_userrefs_obj);
 215         if (err == ENOENT)
 216                 err = 0;
 217         if (err)
 218                 goto out;
 219
 220         err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 221
 222 out:
 223         rw_exit(&dp->dp_config_rwlock);
 224         return (err);
 225 }
 226
 227 void
 228 dsl_pool_close(dsl_pool_t *dp)
 229 {
 230         /* drop our references from dsl_pool_open() */
 231
 232         /*
 233          * Since we held the origin_snap from "syncing" context (which
 234          * includes pool-opening context), it actually only got a "ref"
 235          * and not a hold, so just drop that here.
 236          */
 237         if (dp->dp_origin_snap)
 238                 dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 239         if (dp->dp_mos_dir)
 240                 dsl_dir_close(dp->dp_mos_dir, dp);
 241         if (dp->dp_free_dir)
 242                 dsl_dir_close(dp->dp_free_dir, dp);
 243         if (dp->dp_root_dir)
 244                 dsl_dir_close(dp->dp_root_dir, dp);
 245
 246         bpobj_close(&dp->dp_free_bpobj);
 247
 248         /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 249         if (dp->dp_meta_objset)
 250                 dmu_objset_evict(dp->dp_meta_objset);
 251
 252         txg_list_destroy(&dp->dp_dirty_datasets);
 253         txg_list_destroy(&dp->dp_dirty_zilogs);
 254         txg_list_destroy(&dp->dp_sync_tasks);
 255         txg_list_destroy(&dp->dp_dirty_dirs);
 256
 257         arc_flush(dp->dp_spa);
 258         txg_fini(dp);
 259         dsl_scan_fini(dp);
 260         rw_destroy(&dp->dp_config_rwlock);
 261         mutex_destroy(&dp->dp_lock);
 262         taskq_destroy(dp->dp_vnrele_taskq);
 263         if (dp->dp_blkstats)
 264                 kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 265         kmem_free(dp, sizeof (dsl_pool_t));
 266 }
 267
 268 dsl_pool_t *
 269 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 270 {
 271         int err;
 272         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 273         dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 274         objset_t *os;
 275         dsl_dataset_t *ds;
 276         uint64_t obj;
 277
 278         /* create and open the MOS (meta-objset) */
 279         dp->dp_meta_objset = dmu_objset_create_impl(spa,
 280             NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 281
 282         /* create the pool directory */
 283         err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 284             DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 285         ASSERT3U(err, ==, 0);
 286
 287         /* Initialize scan structures */
 288         VERIFY3U(0, ==, dsl_scan_init(dp, txg));
 289
 290         /* create and open the root dir */
 291         dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 292         VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 293             NULL, dp, &dp->dp_root_dir));
 294
 295         /* create and open the meta-objset dir */
 296         (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 297         VERIFY(0 == dsl_pool_open_special_dir(dp,
 298             MOS_DIR_NAME, &dp->dp_mos_dir));
 299
 300         if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 301                 /* create and open the free dir */
 302                 (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 303                     FREE_DIR_NAME, tx);
 304                 VERIFY(0 == dsl_pool_open_special_dir(dp,
 305                     FREE_DIR_NAME, &dp->dp_free_dir));
 306
 307                 /* create and open the free_bplist */
 308                 obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
 309                 VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 310                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 311                 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
 312                     dp->dp_meta_objset, obj));
 313         }
 314
 315         if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 316                 dsl_pool_create_origin(dp, tx);
 317
 318         /* create the root dataset */
 319         obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 320
 321         /* create the root objset */
 322         VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 323         os = dmu_objset_create_impl(dp->dp_spa, ds,
 324             dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 325 #ifdef _KERNEL
 326         zfs_create_fs(os, kcred, zplprops, tx);
 327 #endif
 328         dsl_dataset_rele(ds, FTAG);
 329
 330         dmu_tx_commit(tx);
 331
 332         return (dp);
 333 }
 334
 335 /*
 336  * Account for the meta-objset space in its placeholder dsl_dir.
 337  */
 338 void
 339 dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 340     int64_t used, int64_t comp, int64_t uncomp)
 341 {
 342         ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 343         mutex_enter(&dp->dp_lock);
 344         dp->dp_mos_used_delta += used;
 345         dp->dp_mos_compressed_delta += comp;
 346         dp->dp_mos_uncompressed_delta += uncomp;
 347         mutex_exit(&dp->dp_lock);
 348 }
 349
 350 static int
 351 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 352 {
 353         dsl_deadlist_t *dl = arg;
 354         dsl_pool_t *dp = dmu_objset_pool(dl->dl_os);
 355         rw_enter(&dp->dp_config_rwlock, RW_READER);
 356         dsl_deadlist_insert(dl, bp, tx);
 357         rw_exit(&dp->dp_config_rwlock);
 358         return (0);
 359 }
 360
 361 void
 362 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 363 {
 364         zio_t *zio;
 365         dmu_tx_t *tx;
 366         dsl_dir_t *dd;
 367         dsl_dataset_t *ds;
 368         objset_t *mos = dp->dp_meta_objset;
 369         hrtime_t start, write_time;
 370         uint64_t data_written;
 371         int err;
 372         list_t synced_datasets;
 373
 374         list_create(&synced_datasets, sizeof (dsl_dataset_t),
 375             offsetof(dsl_dataset_t, ds_synced_link));
 376
 377         /*
 378          * We need to copy dp_space_towrite() before doing
 379          * dsl_sync_task_group_sync(), because
 380          * dsl_dataset_snapshot_reserve_space() will increase
 381          * dp_space_towrite but not actually write anything.
 382          */
 383         data_written = dp->dp_space_towrite[txg & TXG_MASK];
 384
 385         tx = dmu_tx_create_assigned(dp, txg);
 386
 387         dp->dp_read_overhead = 0;
 388         start = gethrtime();
 389
 390         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 391         while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 392                 /*
 393                  * We must not sync any non-MOS datasets twice, because
 394                  * we may have taken a snapshot of them.  However, we
 395                  * may sync newly-created datasets on pass 2.
 396                  */
 397                 ASSERT(!list_link_active(&ds->ds_synced_link));
 398                 list_insert_tail(&synced_datasets, ds);
 399                 dsl_dataset_sync(ds, zio, tx);
 400         }
 401         DTRACE_PROBE(pool_sync__1setup);
 402         err = zio_wait(zio);
 403
 404         write_time = gethrtime() - start;
 405         ASSERT(err == 0);
 406         DTRACE_PROBE(pool_sync__2rootzio);
 407
 408         /*
 409          * After the data blocks have been written (ensured by the zio_wait()
 410          * above), update the user/group space accounting.
 411          */
 412         for (ds = list_head(&synced_datasets); ds;
 413             ds = list_next(&synced_datasets, ds))
 414                 dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 415
 416         /*
 417          * Sync the datasets again to push out the changes due to
 418          * userspace updates.  This must be done before we process the
 419          * sync tasks, so that any snapshots will have the correct
 420          * user accounting information (and we won't get confused
 421          * about which blocks are part of the snapshot).
 422          */
 423         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 424         while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 425                 ASSERT(list_link_active(&ds->ds_synced_link));
 426                 dmu_buf_rele(ds->ds_dbuf, ds);
 427                 dsl_dataset_sync(ds, zio, tx);
 428         }
 429         err = zio_wait(zio);
 430
 431         /*
 432          * Now that the datasets have been completely synced, we can
 433          * clean up our in-memory structures accumulated while syncing:
 434          *
 435          *  - move dead blocks from the pending deadlist to the on-disk deadlist
 436          *  - clean up zil records
 437          *  - release hold from dsl_dataset_dirty()
 438          */
 439         while (ds = list_remove_head(&synced_datasets)) {
 440                 objset_t *os = ds->ds_objset;
 441                 bplist_iterate(&ds->ds_pending_deadlist,
 442                     deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 443                 ASSERT(!dmu_objset_is_dirty(os, txg));
 444                 dmu_buf_rele(ds->ds_dbuf, ds);
 445         }
 446
 447         start = gethrtime();
 448         while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
 449                 dsl_dir_sync(dd, tx);
 450         write_time += gethrtime() - start;
 451
 452         /*
 453          * The MOS's space is accounted for in the pool/$MOS
 454          * (dp_mos_dir).  We can't modify the mos while we're syncing
 455          * it, so we remember the deltas and apply them here.
 456          */
 457         if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 458             dp->dp_mos_uncompressed_delta != 0) {
 459                 dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 460                     dp->dp_mos_used_delta,
 461                     dp->dp_mos_compressed_delta,
 462                     dp->dp_mos_uncompressed_delta, tx);
 463                 dp->dp_mos_used_delta = 0;
 464                 dp->dp_mos_compressed_delta = 0;
 465                 dp->dp_mos_uncompressed_delta = 0;
 466         }
 467
 468         start = gethrtime();
 469         if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 470             list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 471                 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 472                 dmu_objset_sync(mos, zio, tx);
 473                 err = zio_wait(zio);
 474                 ASSERT(err == 0);
 475                 dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 476                 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 477         }
 478         write_time += gethrtime() - start;
 479         DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
 480             hrtime_t, dp->dp_read_overhead);
 481         write_time -= dp->dp_read_overhead;
 482
 483         /*
 484          * If we modify a dataset in the same txg that we want to destroy it,
 485          * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 486          * dsl_dir_destroy_check() will fail if there are unexpected holds.
 487          * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 488          * and clearing the hold on it) before we process the sync_tasks.
 489          * The MOS data dirtied by the sync_tasks will be synced on the next
 490          * pass.
 491          */
 492         DTRACE_PROBE(pool_sync__3task);
 493         if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 494                 dsl_sync_task_group_t *dstg;
 495                 /*
 496                  * No more sync tasks should have been added while we
 497                  * were syncing.
 498                  */
 499                 ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 500                 while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
 501                         dsl_sync_task_group_sync(dstg, tx);
 502         }
 503
 504         dmu_tx_commit(tx);
 505
 506         dp->dp_space_towrite[txg & TXG_MASK] = 0;
 507         ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 508
 509         /*
 510          * If the write limit max has not been explicitly set, set it
 511          * to a fraction of available physical memory (default 1/8th).
 512          * Note that we must inflate the limit because the spa
 513          * inflates write sizes to account for data replication.
 514          * Check this each sync phase to catch changing memory size.
 515          */
 516         if (physmem != old_physmem && zfs_write_limit_shift) {
 517                 mutex_enter(&zfs_write_limit_lock);
 518                 old_physmem = physmem;
 519                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 520                 zfs_write_limit_inflated = MAX(zfs_write_limit_min,
 521                     spa_get_asize(dp->dp_spa, zfs_write_limit_max));
 522                 mutex_exit(&zfs_write_limit_lock);
 523         }
 524
 525         /*
 526          * Attempt to keep the sync time consistent by adjusting the
 527          * amount of write traffic allowed into each transaction group.
 528          * Weight the throughput calculation towards the current value:
 529          *      thru = 3/4 old_thru + 1/4 new_thru
 530          *
 531          * Note: write_time is in nanosecs, so write_time/MICROSEC
 532          * yields millisecs
 533          */
 534         ASSERT(zfs_write_limit_min > 0);
 535         if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
 536                 uint64_t throughput = data_written / (write_time / MICROSEC);
 537
 538                 if (dp->dp_throughput)
 539                         dp->dp_throughput = throughput / 4 +
 540                             3 * dp->dp_throughput / 4;
 541                 else
 542                         dp->dp_throughput = throughput;
 543                 dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 544                     MAX(zfs_write_limit_min,
 545                     dp->dp_throughput * zfs_txg_synctime_ms));
 546         }
 547 }
 548
 549 void
 550 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 551 {
 552         zilog_t *zilog;
 553         dsl_dataset_t *ds;
 554
 555         while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
 556                 ds = dmu_objset_ds(zilog->zl_os);
 557                 zil_clean(zilog, txg);
 558                 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 559                 dmu_buf_rele(ds->ds_dbuf, zilog);
 560         }
 561         ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 562 }
 563
 564 /*
 565  * TRUE if the current thread is the tx_sync_thread or if we
 566  * are being called from SPA context during pool initialization.
 567  */
 568 int
 569 dsl_pool_sync_context(dsl_pool_t *dp)
 570 {
 571         return (curthread == dp->dp_tx.tx_sync_thread ||
 572             spa_is_initializing(dp->dp_spa));
 573 }
 574
 575 uint64_t
 576 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 577 {
 578         uint64_t space, resv;
 579
 580         /*
 581          * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 582          * efficiency.
 583          * XXX The intent log is not accounted for, so it must fit
 584          * within this slop.
 585          *
 586          * If we're trying to assess whether it's OK to do a free,
 587          * cut the reservation in half to allow forward progress
 588          * (e.g. make it possible to rm(1) files from a full pool).
 589          */
 590         space = spa_get_dspace(dp->dp_spa);
 591         resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 592         if (netfree)
 593                 resv >>= 1;
 594
 595         return (space - resv);
 596 }
 597
 598 int
 599 dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
 600 {
 601         uint64_t reserved = 0;
 602         uint64_t write_limit = (zfs_write_limit_override ?
 603             zfs_write_limit_override : dp->dp_write_limit);
 604
 605         if (zfs_no_write_throttle) {
 606                 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
 607                     space);
 608                 return (0);
 609         }
 610
 611         /*
 612          * Check to see if we have exceeded the maximum allowed IO for
 613          * this transaction group.  We can do this without locks since
 614          * a little slop here is ok.  Note that we do the reserved check
 615          * with only half the requested reserve: this is because the
 616          * reserve requests are worst-case, and we really don't want to
 617          * throttle based off of worst-case estimates.
 618          */
 619         if (write_limit > 0) {
 620                 reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
 621                     + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
 622
 623                 if (reserved && reserved > write_limit)
 624                         return (ERESTART);
 625         }
 626
 627         atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
 628
 629         /*
 630          * If this transaction group is over 7/8ths capacity, delay
 631          * the caller 1 clock tick.  This will slow down the "fill"
 632          * rate until the sync process can catch up with us.
 633          */
 634         if (reserved && reserved > (write_limit - (write_limit >> 3)))
 635                 txg_delay(dp, tx->tx_txg, 1);
 636
 637         return (0);
 638 }
 639
 640 void
 641 dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 642 {
 643         ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
 644         atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
 645 }
 646
 647 void
 648 dsl_pool_memory_pressure(dsl_pool_t *dp)
 649 {
 650         uint64_t space_inuse = 0;
 651         int i;
 652
 653         if (dp->dp_write_limit == zfs_write_limit_min)
 654                 return;
 655
 656         for (i = 0; i < TXG_SIZE; i++) {
 657                 space_inuse += dp->dp_space_towrite[i];
 658                 space_inuse += dp->dp_tempreserved[i];
 659         }
 660         dp->dp_write_limit = MAX(zfs_write_limit_min,
 661             MIN(dp->dp_write_limit, space_inuse / 4));
 662 }
 663
 664 void
 665 dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 666 {
 667         if (space > 0) {
 668                 mutex_enter(&dp->dp_lock);
 669                 dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
 670                 mutex_exit(&dp->dp_lock);
 671         }
 672 }
 673
 674 /* ARGSUSED */
 675 static int
 676 upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 677 {
 678         dmu_tx_t *tx = arg;
 679         dsl_dataset_t *ds, *prev = NULL;
 680         int err;
 681         dsl_pool_t *dp = spa_get_dsl(spa);
 682
 683         err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
 684         if (err)
 685                 return (err);
 686
 687         while (ds->ds_phys->ds_prev_snap_obj != 0) {
 688                 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 689                     FTAG, &prev);
 690                 if (err) {
 691                         dsl_dataset_rele(ds, FTAG);
 692                         return (err);
 693                 }
 694
 695                 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
 696                         break;
 697                 dsl_dataset_rele(ds, FTAG);
 698                 ds = prev;
 699                 prev = NULL;
 700         }
 701
 702         if (prev == NULL) {
 703                 prev = dp->dp_origin_snap;
 704
 705                 /*
 706                  * The $ORIGIN can't have any data, or the accounting
 707                  * will be wrong.
 708                  */
 709                 ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
 710
 711                 /* The origin doesn't get attached to itself */
 712                 if (ds->ds_object == prev->ds_object) {
 713                         dsl_dataset_rele(ds, FTAG);
 714                         return (0);
 715                 }
 716
 717                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
 718                 ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
 719                 ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
 720
 721                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 722                 ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
 723
 724                 dmu_buf_will_dirty(prev->ds_dbuf, tx);
 725                 prev->ds_phys->ds_num_children++;
 726
 727                 if (ds->ds_phys->ds_next_snap_obj == 0) {
 728                         ASSERT(ds->ds_prev == NULL);
 729                         VERIFY(0 == dsl_dataset_hold_obj(dp,
 730                             ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 731                 }
 732         }
 733
 734         ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
 735         ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
 736
 737         if (prev->ds_phys->ds_next_clones_obj == 0) {
 738                 dmu_buf_will_dirty(prev->ds_dbuf, tx);
 739                 prev->ds_phys->ds_next_clones_obj =
 740                     zap_create(dp->dp_meta_objset,
 741                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 742         }
 743         VERIFY(0 == zap_add_int(dp->dp_meta_objset,
 744             prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 745
 746         dsl_dataset_rele(ds, FTAG);
 747         if (prev != dp->dp_origin_snap)
 748                 dsl_dataset_rele(prev, FTAG);
 749         return (0);
 750 }
 751
 752 void
 753 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 754 {
 755         ASSERT(dmu_tx_is_syncing(tx));
 756         ASSERT(dp->dp_origin_snap != NULL);
 757
 758         VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
 759             tx, DS_FIND_CHILDREN));
 760 }
 761
 762 /* ARGSUSED */
 763 static int
 764 upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 765 {
 766         dmu_tx_t *tx = arg;
 767         dsl_dataset_t *ds;
 768         dsl_pool_t *dp = spa_get_dsl(spa);
 769         objset_t *mos = dp->dp_meta_objset;
 770
 771         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 772
 773         if (ds->ds_dir->dd_phys->dd_origin_obj) {
 774                 dsl_dataset_t *origin;
 775
 776                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 777                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 778
 779                 if (origin->ds_dir->dd_phys->dd_clones == 0) {
 780                         dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 781                         origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
 782                             DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 783                 }
 784
 785                 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
 786                     origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 787
 788                 dsl_dataset_rele(origin, FTAG);
 789         }
 790
 791         dsl_dataset_rele(ds, FTAG);
 792         return (0);
 793 }
 794
 795 void
 796 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 797 {
 798         ASSERT(dmu_tx_is_syncing(tx));
 799         uint64_t obj;
 800
 801         (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 802         VERIFY(0 == dsl_pool_open_special_dir(dp,
 803             FREE_DIR_NAME, &dp->dp_free_dir));
 804
 805         /*
 806          * We can't use bpobj_alloc(), because spa_version() still
 807          * returns the old version, and we need a new-version bpobj with
 808          * subobj support.  So call dmu_object_alloc() directly.
 809          */
 810         obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 811             SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 812         VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 813             DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 814         VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
 815             dp->dp_meta_objset, obj));
 816
 817         VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
 818             upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 819 }
 820
 821 void
 822 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 823 {
 824         uint64_t dsobj;
 825         dsl_dataset_t *ds;
 826
 827         ASSERT(dmu_tx_is_syncing(tx));
 828         ASSERT(dp->dp_origin_snap == NULL);
 829
 830         /* create the origin dir, ds, & snap-ds */
 831         rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 832         dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 833             NULL, 0, kcred, tx);
 834         VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 835         dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
 836         VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 837             dp, &dp->dp_origin_snap));
 838         dsl_dataset_rele(ds, FTAG);
 839         rw_exit(&dp->dp_config_rwlock);
 840 }
 841
 842 taskq_t *
 843 dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 844 {
 845         return (dp->dp_vnrele_taskq);
 846 }
 847
 848 /*
 849  * Walk through the pool-wide zap object of temporary snapshot user holds
 850  * and release them.
 851  */
 852 void
 853 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 854 {
 855         zap_attribute_t za;
 856         zap_cursor_t zc;
 857         objset_t *mos = dp->dp_meta_objset;
 858         uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 859
 860         if (zapobj == 0)
 861                 return;
 862         ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 863
 864         for (zap_cursor_init(&zc, mos, zapobj);
 865             zap_cursor_retrieve(&zc, &za) == 0;
 866             zap_cursor_advance(&zc)) {
 867                 char *htag;
 868                 uint64_t dsobj;
 869
 870                 htag = strchr(za.za_name, '-');
 871                 *htag = '\0';
 872                 ++htag;
 873                 dsobj = strtonum(za.za_name, NULL);
 874                 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
 875         }
 876         zap_cursor_fini(&zc);
 877 }
 878
 879 /*
 880  * Create the pool-wide zap object for storing temporary snapshot holds.
 881  */
 882 void
 883 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 884 {
 885         objset_t *mos = dp->dp_meta_objset;
 886
 887         ASSERT(dp->dp_tmp_userrefs_obj == 0);
 888         ASSERT(dmu_tx_is_syncing(tx));
 889
 890         dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 891             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 892 }
 893
 894 static int
 895 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 896     const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
 897 {
 898         objset_t *mos = dp->dp_meta_objset;
 899         uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 900         char *name;
 901         int error;
 902
 903         ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 904         ASSERT(dmu_tx_is_syncing(tx));
 905
 906         /*
 907          * If the pool was created prior to SPA_VERSION_USERREFS, the
 908          * zap object for temporary holds might not exist yet.
 909          */
 910         if (zapobj == 0) {
 911                 if (holding) {
 912                         dsl_pool_user_hold_create_obj(dp, tx);
 913                         zapobj = dp->dp_tmp_userrefs_obj;
 914                 } else {
 915                         return (ENOENT);
 916                 }
 917         }
 918
 919         name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 920         if (holding)
 921                 error = zap_add(mos, zapobj, name, 8, 1, now, tx);
 922         else
 923                 error = zap_remove(mos, zapobj, name, tx);
 924         strfree(name);
 925
 926         return (error);
 927 }
 928
 929 /*
 930  * Add a temporary hold for the given dataset object and tag.
 931  */
 932 int
 933 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 934     uint64_t *now, dmu_tx_t *tx)
 935 {
 936         return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 937 }
 938
 939 /*
 940  * Release a temporary hold for the given dataset object and tag.
 941  */
 942 int
 943 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 944     dmu_tx_t *tx)
 945 {
 946         return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
 947             tx, B_FALSE));
 948 }