sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/dmu.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/space_map.h>
  32 #include <sys/metaslab_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio.h>
  35 #include <sys/spa_impl.h>
  36 #include <sys/zfeature.h>
  37 #include <sys/vdev_indirect_mapping.h>
  38 #include <sys/zap.h>
  39
  40 SYSCTL_DECL(_vfs_zfs);
  41 SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
  42
  43 #define GANG_ALLOCATION(flags) \
  44         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  45
  46 uint64_t metaslab_aliquot = 512ULL << 10;
  47 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
  48 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN,
  49     &metaslab_force_ganging, 0,
  50     "Force gang block allocation for blocks larger than or equal to this value");
  51
  52 /*
  53  * Since we can touch multiple metaslabs (and their respective space maps)
  54  * with each transaction group, we benefit from having a smaller space map
  55  * block size since it allows us to issue more I/O operations scattered
  56  * around the disk.
  57  */
  58 int zfs_metaslab_sm_blksz = (1 << 12);
  59 SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN,
  60     &zfs_metaslab_sm_blksz, 0,
  61     "Block size for metaslab DTL space map.  Power of 2 and greater than 4096.");
  62
  63 /*
  64  * The in-core space map representation is more compact than its on-disk form.
  65  * The zfs_condense_pct determines how much more compact the in-core
  66  * space map representation must be before we compact it on-disk.
  67  * Values should be greater than or equal to 100.
  68  */
  69 int zfs_condense_pct = 200;
  70 SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
  71     &zfs_condense_pct, 0,
  72     "Condense on-disk spacemap when it is more than this many percents"
  73     " of in-memory counterpart");
  74
  75 /*
  76  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  77  * space used on disk. In particular, a space map uses data in increments of
  78  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
  79  * same number of blocks after condensing. Since the goal of condensing is to
  80  * reduce the number of IOPs required to read the space map, we only want to
  81  * condense when we can be sure we will reduce the number of blocks used by the
  82  * space map. Unfortunately, we cannot precisely compute whether or not this is
  83  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  84  * we apply the following heuristic: do not condense a spacemap unless the
  85  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  86  * blocks.
  87  */
  88 int zfs_metaslab_condense_block_threshold = 4;
  89
  90 /*
  91  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  92  * be eligible for allocation. The value is defined as a percentage of
  93  * free space. Metaslab groups that have more free space than
  94  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  95  * a metaslab group's free space is less than or equal to the
  96  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  97  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  98  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  99  * groups are allowed to accept allocations. Gang blocks are always
 100  * eligible to allocate on any metaslab group. The default value of 0 means
 101  * no metaslab group will be excluded based on this criterion.
 102  */
 103 int zfs_mg_noalloc_threshold = 0;
 104 SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
 105     &zfs_mg_noalloc_threshold, 0,
 106     "Percentage of metaslab group size that should be free"
 107     " to make it eligible for allocation");
 108
 109 /*
 110  * Metaslab groups are considered eligible for allocations if their
 111  * fragmenation metric (measured as a percentage) is less than or equal to
 112  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 113  * then it will be skipped unless all metaslab groups within the metaslab
 114  * class have also crossed this threshold.
 115  */
 116 int zfs_mg_fragmentation_threshold = 85;
 117 SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
 118     &zfs_mg_fragmentation_threshold, 0,
 119     "Percentage of metaslab group size that should be considered "
 120     "eligible for allocations unless all metaslab groups within the metaslab class "
 121     "have also crossed this threshold");
 122
 123 /*
 124  * Allow metaslabs to keep their active state as long as their fragmentation
 125  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 126  * active metaslab that exceeds this threshold will no longer keep its active
 127  * status allowing better metaslabs to be selected.
 128  */
 129 int zfs_metaslab_fragmentation_threshold = 70;
 130 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
 131     &zfs_metaslab_fragmentation_threshold, 0,
 132     "Maximum percentage of metaslab fragmentation level to keep their active state");
 133
 134 /*
 135  * When set will load all metaslabs when pool is first opened.
 136  */
 137 int metaslab_debug_load = 0;
 138 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
 139     &metaslab_debug_load, 0,
 140     "Load all metaslabs when pool is first opened");
 141
 142 /*
 143  * When set will prevent metaslabs from being unloaded.
 144  */
 145 int metaslab_debug_unload = 0;
 146 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
 147     &metaslab_debug_unload, 0,
 148     "Prevent metaslabs from being unloaded");
 149
 150 /*
 151  * Minimum size which forces the dynamic allocator to change
 152  * it's allocation strategy.  Once the space map cannot satisfy
 153  * an allocation of this size then it switches to using more
 154  * aggressive strategy (i.e search by size rather than offset).
 155  */
 156 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 157 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
 158     &metaslab_df_alloc_threshold, 0,
 159     "Minimum size which forces the dynamic allocator to change it's allocation strategy");
 160
 161 /*
 162  * The minimum free space, in percent, which must be available
 163  * in a space map to continue allocations in a first-fit fashion.
 164  * Once the space map's free space drops below this level we dynamically
 165  * switch to using best-fit allocations.
 166  */
 167 int metaslab_df_free_pct = 4;
 168 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
 169     &metaslab_df_free_pct, 0,
 170     "The minimum free space, in percent, which must be available in a "
 171     "space map to continue allocations in a first-fit fashion");
 172
 173 /*
 174  * A metaslab is considered "free" if it contains a contiguous
 175  * segment which is greater than metaslab_min_alloc_size.
 176  */
 177 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 178 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
 179     &metaslab_min_alloc_size, 0,
 180     "A metaslab is considered \"free\" if it contains a contiguous "
 181     "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
 182
 183 /*
 184  * Percentage of all cpus that can be used by the metaslab taskq.
 185  */
 186 int metaslab_load_pct = 50;
 187 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
 188     &metaslab_load_pct, 0,
 189     "Percentage of cpus that can be used by the metaslab taskq");
 190
 191 /*
 192  * Determines how many txgs a metaslab may remain loaded without having any
 193  * allocations from it. As long as a metaslab continues to be used we will
 194  * keep it loaded.
 195  */
 196 int metaslab_unload_delay = TXG_SIZE * 2;
 197 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
 198     &metaslab_unload_delay, 0,
 199     "Number of TXGs that an unused metaslab can be kept in memory");
 200
 201 /*
 202  * Max number of metaslabs per group to preload.
 203  */
 204 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 205 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
 206     &metaslab_preload_limit, 0,
 207     "Max number of metaslabs per group to preload");
 208
 209 /*
 210  * Enable/disable preloading of metaslab.
 211  */
 212 boolean_t metaslab_preload_enabled = B_TRUE;
 213 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
 214     &metaslab_preload_enabled, 0,
 215     "Max number of metaslabs per group to preload");
 216
 217 /*
 218  * Enable/disable fragmentation weighting on metaslabs.
 219  */
 220 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
 221 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
 222     &metaslab_fragmentation_factor_enabled, 0,
 223     "Enable fragmentation weighting on metaslabs");
 224
 225 /*
 226  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 227  */
 228 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
 229 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
 230     &metaslab_lba_weighting_enabled, 0,
 231     "Enable LBA weighting (i.e. outer tracks are given preference)");
 232
 233 /*
 234  * Enable/disable metaslab group biasing.
 235  */
 236 boolean_t metaslab_bias_enabled = B_TRUE;
 237 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
 238     &metaslab_bias_enabled, 0,
 239     "Enable metaslab group biasing");
 240
 241 /*
 242  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 243  */
 244 boolean_t zfs_remap_blkptr_enable = B_TRUE;
 245
 246 /*
 247  * Enable/disable segment-based metaslab selection.
 248  */
 249 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
 250
 251 /*
 252  * When using segment-based metaslab selection, we will continue
 253  * allocating from the active metaslab until we have exhausted
 254  * zfs_metaslab_switch_threshold of its buckets.
 255  */
 256 int zfs_metaslab_switch_threshold = 2;
 257
 258 /*
 259  * Internal switch to enable/disable the metaslab allocation tracing
 260  * facility.
 261  */
 262 boolean_t metaslab_trace_enabled = B_TRUE;
 263
 264 /*
 265  * Maximum entries that the metaslab allocation tracing facility will keep
 266  * in a given list when running in non-debug mode. We limit the number
 267  * of entries in non-debug mode to prevent us from using up too much memory.
 268  * The limit should be sufficiently large that we don't expect any allocation
 269  * to every exceed this value. In debug mode, the system will panic if this
 270  * limit is ever reached allowing for further investigation.
 271  */
 272 uint64_t metaslab_trace_max_entries = 5000;
 273
 274 static uint64_t metaslab_weight(metaslab_t *);
 275 static void metaslab_set_fragmentation(metaslab_t *);
 276 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 277 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 278 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 279 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 280
 281 kmem_cache_t *metaslab_alloc_trace_cache;
 282
 283 /*
 284  * ==========================================================================
 285  * Metaslab classes
 286  * ==========================================================================
 287  */
 288 metaslab_class_t *
 289 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 290 {
 291         metaslab_class_t *mc;
 292
 293         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 294
 295         mc->mc_spa = spa;
 296         mc->mc_rotor = NULL;
 297         mc->mc_ops = ops;
 298         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 299         mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
 300             sizeof (refcount_t), KM_SLEEP);
 301         mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
 302             sizeof (uint64_t), KM_SLEEP);
 303         for (int i = 0; i < spa->spa_alloc_count; i++)
 304                 refcount_create_tracked(&mc->mc_alloc_slots[i]);
 305
 306         return (mc);
 307 }
 308
 309 void
 310 metaslab_class_destroy(metaslab_class_t *mc)
 311 {
 312         ASSERT(mc->mc_rotor == NULL);
 313         ASSERT(mc->mc_alloc == 0);
 314         ASSERT(mc->mc_deferred == 0);
 315         ASSERT(mc->mc_space == 0);
 316         ASSERT(mc->mc_dspace == 0);
 317
 318         for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
 319                 refcount_destroy(&mc->mc_alloc_slots[i]);
 320         kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
 321             sizeof (refcount_t));
 322         kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
 323             sizeof (uint64_t));
 324         mutex_destroy(&mc->mc_lock);
 325         kmem_free(mc, sizeof (metaslab_class_t));
 326 }
 327
 328 int
 329 metaslab_class_validate(metaslab_class_t *mc)
 330 {
 331         metaslab_group_t *mg;
 332         vdev_t *vd;
 333
 334         /*
 335          * Must hold one of the spa_config locks.
 336          */
 337         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 338             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 339
 340         if ((mg = mc->mc_rotor) == NULL)
 341                 return (0);
 342
 343         do {
 344                 vd = mg->mg_vd;
 345                 ASSERT(vd->vdev_mg != NULL);
 346                 ASSERT3P(vd->vdev_top, ==, vd);
 347                 ASSERT3P(mg->mg_class, ==, mc);
 348                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 349         } while ((mg = mg->mg_next) != mc->mc_rotor);
 350
 351         return (0);
 352 }
 353
 354 void
 355 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 356     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 357 {
 358         atomic_add_64(&mc->mc_alloc, alloc_delta);
 359         atomic_add_64(&mc->mc_deferred, defer_delta);
 360         atomic_add_64(&mc->mc_space, space_delta);
 361         atomic_add_64(&mc->mc_dspace, dspace_delta);
 362 }
 363
 364 void
 365 metaslab_class_minblocksize_update(metaslab_class_t *mc)
 366 {
 367         metaslab_group_t *mg;
 368         vdev_t *vd;
 369         uint64_t minashift = UINT64_MAX;
 370
 371         if ((mg = mc->mc_rotor) == NULL) {
 372                 mc->mc_minblocksize = SPA_MINBLOCKSIZE;
 373                 return;
 374         }
 375
 376         do {
 377                 vd = mg->mg_vd;
 378                 if (vd->vdev_ashift < minashift)
 379                         minashift = vd->vdev_ashift;
 380         } while ((mg = mg->mg_next) != mc->mc_rotor);
 381
 382         mc->mc_minblocksize = 1ULL << minashift;
 383 }
 384
 385 uint64_t
 386 metaslab_class_get_alloc(metaslab_class_t *mc)
 387 {
 388         return (mc->mc_alloc);
 389 }
 390
 391 uint64_t
 392 metaslab_class_get_deferred(metaslab_class_t *mc)
 393 {
 394         return (mc->mc_deferred);
 395 }
 396
 397 uint64_t
 398 metaslab_class_get_space(metaslab_class_t *mc)
 399 {
 400         return (mc->mc_space);
 401 }
 402
 403 uint64_t
 404 metaslab_class_get_dspace(metaslab_class_t *mc)
 405 {
 406         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 407 }
 408
 409 uint64_t
 410 metaslab_class_get_minblocksize(metaslab_class_t *mc)
 411 {
 412         return (mc->mc_minblocksize);
 413 }
 414
 415 void
 416 metaslab_class_histogram_verify(metaslab_class_t *mc)
 417 {
 418         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 419         uint64_t *mc_hist;
 420         int i;
 421
 422         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 423                 return;
 424
 425         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 426             KM_SLEEP);
 427
 428         for (int c = 0; c < rvd->vdev_children; c++) {
 429                 vdev_t *tvd = rvd->vdev_child[c];
 430                 metaslab_group_t *mg = tvd->vdev_mg;
 431
 432                 /*
 433                  * Skip any holes, uninitialized top-levels, or
 434                  * vdevs that are not in this metalab class.
 435                  */
 436                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 437                     mg->mg_class != mc) {
 438                         continue;
 439                 }
 440
 441                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 442                         mc_hist[i] += mg->mg_histogram[i];
 443         }
 444
 445         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 446                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 447
 448         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 449 }
 450
 451 /*
 452  * Calculate the metaslab class's fragmentation metric. The metric
 453  * is weighted based on the space contribution of each metaslab group.
 454  * The return value will be a number between 0 and 100 (inclusive), or
 455  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 456  * zfs_frag_table for more information about the metric.
 457  */
 458 uint64_t
 459 metaslab_class_fragmentation(metaslab_class_t *mc)
 460 {
 461         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 462         uint64_t fragmentation = 0;
 463
 464         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 465
 466         for (int c = 0; c < rvd->vdev_children; c++) {
 467                 vdev_t *tvd = rvd->vdev_child[c];
 468                 metaslab_group_t *mg = tvd->vdev_mg;
 469
 470                 /*
 471                  * Skip any holes, uninitialized top-levels,
 472                  * or vdevs that are not in this metalab class.
 473                  */
 474                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 475                     mg->mg_class != mc) {
 476                         continue;
 477                 }
 478
 479                 /*
 480                  * If a metaslab group does not contain a fragmentation
 481                  * metric then just bail out.
 482                  */
 483                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 484                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 485                         return (ZFS_FRAG_INVALID);
 486                 }
 487
 488                 /*
 489                  * Determine how much this metaslab_group is contributing
 490                  * to the overall pool fragmentation metric.
 491                  */
 492                 fragmentation += mg->mg_fragmentation *
 493                     metaslab_group_get_space(mg);
 494         }
 495         fragmentation /= metaslab_class_get_space(mc);
 496
 497         ASSERT3U(fragmentation, <=, 100);
 498         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 499         return (fragmentation);
 500 }
 501
 502 /*
 503  * Calculate the amount of expandable space that is available in
 504  * this metaslab class. If a device is expanded then its expandable
 505  * space will be the amount of allocatable space that is currently not
 506  * part of this metaslab class.
 507  */
 508 uint64_t
 509 metaslab_class_expandable_space(metaslab_class_t *mc)
 510 {
 511         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 512         uint64_t space = 0;
 513
 514         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 515         for (int c = 0; c < rvd->vdev_children; c++) {
 516                 uint64_t tspace;
 517                 vdev_t *tvd = rvd->vdev_child[c];
 518                 metaslab_group_t *mg = tvd->vdev_mg;
 519
 520                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 521                     mg->mg_class != mc) {
 522                         continue;
 523                 }
 524
 525                 /*
 526                  * Calculate if we have enough space to add additional
 527                  * metaslabs. We report the expandable space in terms
 528                  * of the metaslab size since that's the unit of expansion.
 529                  * Adjust by efi system partition size.
 530                  */
 531                 tspace = tvd->vdev_max_asize - tvd->vdev_asize;
 532                 if (tspace > mc->mc_spa->spa_bootsize) {
 533                         tspace -= mc->mc_spa->spa_bootsize;
 534                 }
 535                 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
 536         }
 537         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 538         return (space);
 539 }
 540
 541 static int
 542 metaslab_compare(const void *x1, const void *x2)
 543 {
 544         const metaslab_t *m1 = x1;
 545         const metaslab_t *m2 = x2;
 546
 547         int sort1 = 0;
 548         int sort2 = 0;
 549         if (m1->ms_allocator != -1 && m1->ms_primary)
 550                 sort1 = 1;
 551         else if (m1->ms_allocator != -1 && !m1->ms_primary)
 552                 sort1 = 2;
 553         if (m2->ms_allocator != -1 && m2->ms_primary)
 554                 sort2 = 1;
 555         else if (m2->ms_allocator != -1 && !m2->ms_primary)
 556                 sort2 = 2;
 557
 558         /*
 559          * Sort inactive metaslabs first, then primaries, then secondaries. When
 560          * selecting a metaslab to allocate from, an allocator first tries its
 561          * primary, then secondary active metaslab. If it doesn't have active
 562          * metaslabs, or can't allocate from them, it searches for an inactive
 563          * metaslab to activate. If it can't find a suitable one, it will steal
 564          * a primary or secondary metaslab from another allocator.
 565          */
 566         if (sort1 < sort2)
 567                 return (-1);
 568         if (sort1 > sort2)
 569                 return (1);
 570
 571         if (m1->ms_weight < m2->ms_weight)
 572                 return (1);
 573         if (m1->ms_weight > m2->ms_weight)
 574                 return (-1);
 575
 576         /*
 577          * If the weights are identical, use the offset to force uniqueness.
 578          */
 579         if (m1->ms_start < m2->ms_start)
 580                 return (-1);
 581         if (m1->ms_start > m2->ms_start)
 582                 return (1);
 583
 584         ASSERT3P(m1, ==, m2);
 585
 586         return (0);
 587 }
 588
 589 /*
 590  * Verify that the space accounting on disk matches the in-core range_trees.
 591  */
 592 void
 593 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 594 {
 595         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 596         uint64_t allocated = 0;
 597         uint64_t sm_free_space, msp_free_space;
 598
 599         ASSERT(MUTEX_HELD(&msp->ms_lock));
 600
 601         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 602                 return;
 603
 604         /*
 605          * We can only verify the metaslab space when we're called
 606          * from syncing context with a loaded metaslab that has an allocated
 607          * space map. Calling this in non-syncing context does not
 608          * provide a consistent view of the metaslab since we're performing
 609          * allocations in the future.
 610          */
 611         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 612             !msp->ms_loaded)
 613                 return;
 614
 615         sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
 616             space_map_alloc_delta(msp->ms_sm);
 617
 618         /*
 619          * Account for future allocations since we would have already
 620          * deducted that space from the ms_freetree.
 621          */
 622         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 623                 allocated +=
 624                     range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
 625         }
 626
 627         msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
 628             msp->ms_deferspace + range_tree_space(msp->ms_freed);
 629
 630         VERIFY3U(sm_free_space, ==, msp_free_space);
 631 }
 632
 633 /*
 634  * ==========================================================================
 635  * Metaslab groups
 636  * ==========================================================================
 637  */
 638 /*
 639  * Update the allocatable flag and the metaslab group's capacity.
 640  * The allocatable flag is set to true if the capacity is below
 641  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 642  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 643  * transitions from allocatable to non-allocatable or vice versa then the
 644  * metaslab group's class is updated to reflect the transition.
 645  */
 646 static void
 647 metaslab_group_alloc_update(metaslab_group_t *mg)
 648 {
 649         vdev_t *vd = mg->mg_vd;
 650         metaslab_class_t *mc = mg->mg_class;
 651         vdev_stat_t *vs = &vd->vdev_stat;
 652         boolean_t was_allocatable;
 653         boolean_t was_initialized;
 654
 655         ASSERT(vd == vd->vdev_top);
 656         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 657             SCL_ALLOC);
 658
 659         mutex_enter(&mg->mg_lock);
 660         was_allocatable = mg->mg_allocatable;
 661         was_initialized = mg->mg_initialized;
 662
 663         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 664             (vs->vs_space + 1);
 665
 666         mutex_enter(&mc->mc_lock);
 667
 668         /*
 669          * If the metaslab group was just added then it won't
 670          * have any space until we finish syncing out this txg.
 671          * At that point we will consider it initialized and available
 672          * for allocations.  We also don't consider non-activated
 673          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 674          * to be initialized, because they can't be used for allocation.
 675          */
 676         mg->mg_initialized = metaslab_group_initialized(mg);
 677         if (!was_initialized && mg->mg_initialized) {
 678                 mc->mc_groups++;
 679         } else if (was_initialized && !mg->mg_initialized) {
 680                 ASSERT3U(mc->mc_groups, >, 0);
 681                 mc->mc_groups--;
 682         }
 683         if (mg->mg_initialized)
 684                 mg->mg_no_free_space = B_FALSE;
 685
 686         /*
 687          * A metaslab group is considered allocatable if it has plenty
 688          * of free space or is not heavily fragmented. We only take
 689          * fragmentation into account if the metaslab group has a valid
 690          * fragmentation metric (i.e. a value between 0 and 100).
 691          */
 692         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 693             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 694             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 695             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 696
 697         /*
 698          * The mc_alloc_groups maintains a count of the number of
 699          * groups in this metaslab class that are still above the
 700          * zfs_mg_noalloc_threshold. This is used by the allocating
 701          * threads to determine if they should avoid allocations to
 702          * a given group. The allocator will avoid allocations to a group
 703          * if that group has reached or is below the zfs_mg_noalloc_threshold
 704          * and there are still other groups that are above the threshold.
 705          * When a group transitions from allocatable to non-allocatable or
 706          * vice versa we update the metaslab class to reflect that change.
 707          * When the mc_alloc_groups value drops to 0 that means that all
 708          * groups have reached the zfs_mg_noalloc_threshold making all groups
 709          * eligible for allocations. This effectively means that all devices
 710          * are balanced again.
 711          */
 712         if (was_allocatable && !mg->mg_allocatable)
 713                 mc->mc_alloc_groups--;
 714         else if (!was_allocatable && mg->mg_allocatable)
 715                 mc->mc_alloc_groups++;
 716         mutex_exit(&mc->mc_lock);
 717
 718         mutex_exit(&mg->mg_lock);
 719 }
 720
 721 metaslab_group_t *
 722 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 723 {
 724         metaslab_group_t *mg;
 725
 726         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 727         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 728         mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 729             KM_SLEEP);
 730         mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 731             KM_SLEEP);
 732         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 733             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 734         mg->mg_vd = vd;
 735         mg->mg_class = mc;
 736         mg->mg_activation_count = 0;
 737         mg->mg_initialized = B_FALSE;
 738         mg->mg_no_free_space = B_TRUE;
 739         mg->mg_allocators = allocators;
 740
 741         mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
 742             KM_SLEEP);
 743         mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
 744             sizeof (uint64_t), KM_SLEEP);
 745         for (int i = 0; i < allocators; i++) {
 746                 refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
 747                 mg->mg_cur_max_alloc_queue_depth[i] = 0;
 748         }
 749
 750         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 751             minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 752
 753         return (mg);
 754 }
 755
 756 void
 757 metaslab_group_destroy(metaslab_group_t *mg)
 758 {
 759         ASSERT(mg->mg_prev == NULL);
 760         ASSERT(mg->mg_next == NULL);
 761         /*
 762          * We may have gone below zero with the activation count
 763          * either because we never activated in the first place or
 764          * because we're done, and possibly removing the vdev.
 765          */
 766         ASSERT(mg->mg_activation_count <= 0);
 767
 768         taskq_destroy(mg->mg_taskq);
 769         avl_destroy(&mg->mg_metaslab_tree);
 770         kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
 771         kmem_free(mg->mg_secondaries, mg->mg_allocators *
 772             sizeof (metaslab_t *));
 773         mutex_destroy(&mg->mg_lock);
 774
 775         for (int i = 0; i < mg->mg_allocators; i++) {
 776                 refcount_destroy(&mg->mg_alloc_queue_depth[i]);
 777                 mg->mg_cur_max_alloc_queue_depth[i] = 0;
 778         }
 779         kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
 780             sizeof (refcount_t));
 781         kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
 782             sizeof (uint64_t));
 783
 784         kmem_free(mg, sizeof (metaslab_group_t));
 785 }
 786
 787 void
 788 metaslab_group_activate(metaslab_group_t *mg)
 789 {
 790         metaslab_class_t *mc = mg->mg_class;
 791         metaslab_group_t *mgprev, *mgnext;
 792
 793         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
 794
 795         ASSERT(mc->mc_rotor != mg);
 796         ASSERT(mg->mg_prev == NULL);
 797         ASSERT(mg->mg_next == NULL);
 798         ASSERT(mg->mg_activation_count <= 0);
 799
 800         if (++mg->mg_activation_count <= 0)
 801                 return;
 802
 803         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 804         metaslab_group_alloc_update(mg);
 805
 806         if ((mgprev = mc->mc_rotor) == NULL) {
 807                 mg->mg_prev = mg;
 808                 mg->mg_next = mg;
 809         } else {
 810                 mgnext = mgprev->mg_next;
 811                 mg->mg_prev = mgprev;
 812                 mg->mg_next = mgnext;
 813                 mgprev->mg_next = mg;
 814                 mgnext->mg_prev = mg;
 815         }
 816         mc->mc_rotor = mg;
 817         metaslab_class_minblocksize_update(mc);
 818 }
 819
 820 /*
 821  * Passivate a metaslab group and remove it from the allocation rotor.
 822  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 823  * a metaslab group. This function will momentarily drop spa_config_locks
 824  * that are lower than the SCL_ALLOC lock (see comment below).
 825  */
 826 void
 827 metaslab_group_passivate(metaslab_group_t *mg)
 828 {
 829         metaslab_class_t *mc = mg->mg_class;
 830         spa_t *spa = mc->mc_spa;
 831         metaslab_group_t *mgprev, *mgnext;
 832         int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 833
 834         ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 835             (SCL_ALLOC | SCL_ZIO));
 836
 837         if (--mg->mg_activation_count != 0) {
 838                 ASSERT(mc->mc_rotor != mg);
 839                 ASSERT(mg->mg_prev == NULL);
 840                 ASSERT(mg->mg_next == NULL);
 841                 ASSERT(mg->mg_activation_count < 0);
 842                 return;
 843         }
 844
 845         /*
 846          * The spa_config_lock is an array of rwlocks, ordered as
 847          * follows (from highest to lowest):
 848          *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 849          *      SCL_ZIO > SCL_FREE > SCL_VDEV
 850          * (For more information about the spa_config_lock see spa_misc.c)
 851          * The higher the lock, the broader its coverage. When we passivate
 852          * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 853          * config locks. However, the metaslab group's taskq might be trying
 854          * to preload metaslabs so we must drop the SCL_ZIO lock and any
 855          * lower locks to allow the I/O to complete. At a minimum,
 856          * we continue to hold the SCL_ALLOC lock, which prevents any future
 857          * allocations from taking place and any changes to the vdev tree.
 858          */
 859         spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 860         taskq_wait(mg->mg_taskq);
 861         spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 862         metaslab_group_alloc_update(mg);
 863         for (int i = 0; i < mg->mg_allocators; i++) {
 864                 metaslab_t *msp = mg->mg_primaries[i];
 865                 if (msp != NULL) {
 866                         mutex_enter(&msp->ms_lock);
 867                         metaslab_passivate(msp,
 868                             metaslab_weight_from_range_tree(msp));
 869                         mutex_exit(&msp->ms_lock);
 870                 }
 871                 msp = mg->mg_secondaries[i];
 872                 if (msp != NULL) {
 873                         mutex_enter(&msp->ms_lock);
 874                         metaslab_passivate(msp,
 875                             metaslab_weight_from_range_tree(msp));
 876                         mutex_exit(&msp->ms_lock);
 877                 }
 878         }
 879
 880         mgprev = mg->mg_prev;
 881         mgnext = mg->mg_next;
 882
 883         if (mg == mgnext) {
 884                 mc->mc_rotor = NULL;
 885         } else {
 886                 mc->mc_rotor = mgnext;
 887                 mgprev->mg_next = mgnext;
 888                 mgnext->mg_prev = mgprev;
 889         }
 890
 891         mg->mg_prev = NULL;
 892         mg->mg_next = NULL;
 893         metaslab_class_minblocksize_update(mc);
 894 }
 895
 896 boolean_t
 897 metaslab_group_initialized(metaslab_group_t *mg)
 898 {
 899         vdev_t *vd = mg->mg_vd;
 900         vdev_stat_t *vs = &vd->vdev_stat;
 901
 902         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 903 }
 904
 905 uint64_t
 906 metaslab_group_get_space(metaslab_group_t *mg)
 907 {
 908         return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
 909 }
 910
 911 void
 912 metaslab_group_histogram_verify(metaslab_group_t *mg)
 913 {
 914         uint64_t *mg_hist;
 915         vdev_t *vd = mg->mg_vd;
 916         uint64_t ashift = vd->vdev_ashift;
 917         int i;
 918
 919         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 920                 return;
 921
 922         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 923             KM_SLEEP);
 924
 925         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 926             SPACE_MAP_HISTOGRAM_SIZE + ashift);
 927
 928         for (int m = 0; m < vd->vdev_ms_count; m++) {
 929                 metaslab_t *msp = vd->vdev_ms[m];
 930
 931                 if (msp->ms_sm == NULL)
 932                         continue;
 933
 934                 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 935                         mg_hist[i + ashift] +=
 936                             msp->ms_sm->sm_phys->smp_histogram[i];
 937         }
 938
 939         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 940                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 941
 942         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 943 }
 944
 945 static void
 946 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 947 {
 948         metaslab_class_t *mc = mg->mg_class;
 949         uint64_t ashift = mg->mg_vd->vdev_ashift;
 950
 951         ASSERT(MUTEX_HELD(&msp->ms_lock));
 952         if (msp->ms_sm == NULL)
 953                 return;
 954
 955         mutex_enter(&mg->mg_lock);
 956         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 957                 mg->mg_histogram[i + ashift] +=
 958                     msp->ms_sm->sm_phys->smp_histogram[i];
 959                 mc->mc_histogram[i + ashift] +=
 960                     msp->ms_sm->sm_phys->smp_histogram[i];
 961         }
 962         mutex_exit(&mg->mg_lock);
 963 }
 964
 965 void
 966 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 967 {
 968         metaslab_class_t *mc = mg->mg_class;
 969         uint64_t ashift = mg->mg_vd->vdev_ashift;
 970
 971         ASSERT(MUTEX_HELD(&msp->ms_lock));
 972         if (msp->ms_sm == NULL)
 973                 return;
 974
 975         mutex_enter(&mg->mg_lock);
 976         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 977                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
 978                     msp->ms_sm->sm_phys->smp_histogram[i]);
 979                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
 980                     msp->ms_sm->sm_phys->smp_histogram[i]);
 981
 982                 mg->mg_histogram[i + ashift] -=
 983                     msp->ms_sm->sm_phys->smp_histogram[i];
 984                 mc->mc_histogram[i + ashift] -=
 985                     msp->ms_sm->sm_phys->smp_histogram[i];
 986         }
 987         mutex_exit(&mg->mg_lock);
 988 }
 989
 990 static void
 991 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 992 {
 993         ASSERT(msp->ms_group == NULL);
 994         mutex_enter(&mg->mg_lock);
 995         msp->ms_group = mg;
 996         msp->ms_weight = 0;
 997         avl_add(&mg->mg_metaslab_tree, msp);
 998         mutex_exit(&mg->mg_lock);
 999
1000         mutex_enter(&msp->ms_lock);
1001         metaslab_group_histogram_add(mg, msp);
1002         mutex_exit(&msp->ms_lock);
1003 }
1004
1005 static void
1006 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1007 {
1008         mutex_enter(&msp->ms_lock);
1009         metaslab_group_histogram_remove(mg, msp);
1010         mutex_exit(&msp->ms_lock);
1011
1012         mutex_enter(&mg->mg_lock);
1013         ASSERT(msp->ms_group == mg);
1014         avl_remove(&mg->mg_metaslab_tree, msp);
1015         msp->ms_group = NULL;
1016         mutex_exit(&mg->mg_lock);
1017 }
1018
1019 static void
1020 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1021 {
1022         ASSERT(MUTEX_HELD(&mg->mg_lock));
1023         ASSERT(msp->ms_group == mg);
1024         avl_remove(&mg->mg_metaslab_tree, msp);
1025         msp->ms_weight = weight;
1026         avl_add(&mg->mg_metaslab_tree, msp);
1027
1028 }
1029
1030 static void
1031 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1032 {
1033         /*
1034          * Although in principle the weight can be any value, in
1035          * practice we do not use values in the range [1, 511].
1036          */
1037         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1038         ASSERT(MUTEX_HELD(&msp->ms_lock));
1039
1040         mutex_enter(&mg->mg_lock);
1041         metaslab_group_sort_impl(mg, msp, weight);
1042         mutex_exit(&mg->mg_lock);
1043 }
1044
1045 /*
1046  * Calculate the fragmentation for a given metaslab group. We can use
1047  * a simple average here since all metaslabs within the group must have
1048  * the same size. The return value will be a value between 0 and 100
1049  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
1050  * group have a fragmentation metric.
1051  */
1052 uint64_t
1053 metaslab_group_fragmentation(metaslab_group_t *mg)
1054 {
1055         vdev_t *vd = mg->mg_vd;
1056         uint64_t fragmentation = 0;
1057         uint64_t valid_ms = 0;
1058
1059         for (int m = 0; m < vd->vdev_ms_count; m++) {
1060                 metaslab_t *msp = vd->vdev_ms[m];
1061
1062                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1063                         continue;
1064
1065                 valid_ms++;
1066                 fragmentation += msp->ms_fragmentation;
1067         }
1068
1069         if (valid_ms <= vd->vdev_ms_count / 2)
1070                 return (ZFS_FRAG_INVALID);
1071
1072         fragmentation /= valid_ms;
1073         ASSERT3U(fragmentation, <=, 100);
1074         return (fragmentation);
1075 }
1076
1077 /*
1078  * Determine if a given metaslab group should skip allocations. A metaslab
1079  * group should avoid allocations if its free capacity is less than the
1080  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1081  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1082  * that can still handle allocations. If the allocation throttle is enabled
1083  * then we skip allocations to devices that have reached their maximum
1084  * allocation queue depth unless the selected metaslab group is the only
1085  * eligible group remaining.
1086  */
1087 static boolean_t
1088 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1089     uint64_t psize, int allocator)
1090 {
1091         spa_t *spa = mg->mg_vd->vdev_spa;
1092         metaslab_class_t *mc = mg->mg_class;
1093
1094         /*
1095          * We can only consider skipping this metaslab group if it's
1096          * in the normal metaslab class and there are other metaslab
1097          * groups to select from. Otherwise, we always consider it eligible
1098          * for allocations.
1099          */
1100         if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
1101                 return (B_TRUE);
1102
1103         /*
1104          * If the metaslab group's mg_allocatable flag is set (see comments
1105          * in metaslab_group_alloc_update() for more information) and
1106          * the allocation throttle is disabled then allow allocations to this
1107          * device. However, if the allocation throttle is enabled then
1108          * check if we have reached our allocation limit (mg_alloc_queue_depth)
1109          * to determine if we should allow allocations to this metaslab group.
1110          * If all metaslab groups are no longer considered allocatable
1111          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1112          * gang block size then we allow allocations on this metaslab group
1113          * regardless of the mg_allocatable or throttle settings.
1114          */
1115         if (mg->mg_allocatable) {
1116                 metaslab_group_t *mgp;
1117                 int64_t qdepth;
1118                 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1119
1120                 if (!mc->mc_alloc_throttle_enabled)
1121                         return (B_TRUE);
1122
1123                 /*
1124                  * If this metaslab group does not have any free space, then
1125                  * there is no point in looking further.
1126                  */
1127                 if (mg->mg_no_free_space)
1128                         return (B_FALSE);
1129
1130                 qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
1131
1132                 /*
1133                  * If this metaslab group is below its qmax or it's
1134                  * the only allocatable metasable group, then attempt
1135                  * to allocate from it.
1136                  */
1137                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1138                         return (B_TRUE);
1139                 ASSERT3U(mc->mc_alloc_groups, >, 1);
1140
1141                 /*
1142                  * Since this metaslab group is at or over its qmax, we
1143                  * need to determine if there are metaslab groups after this
1144                  * one that might be able to handle this allocation. This is
1145                  * racy since we can't hold the locks for all metaslab
1146                  * groups at the same time when we make this check.
1147                  */
1148                 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1149                         qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1150
1151                         qdepth = refcount_count(
1152                             &mgp->mg_alloc_queue_depth[allocator]);
1153
1154                         /*
1155                          * If there is another metaslab group that
1156                          * might be able to handle the allocation, then
1157                          * we return false so that we skip this group.
1158                          */
1159                         if (qdepth < qmax && !mgp->mg_no_free_space)
1160                                 return (B_FALSE);
1161                 }
1162
1163                 /*
1164                  * We didn't find another group to handle the allocation
1165                  * so we can't skip this metaslab group even though
1166                  * we are at or over our qmax.
1167                  */
1168                 return (B_TRUE);
1169
1170         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1171                 return (B_TRUE);
1172         }
1173         return (B_FALSE);
1174 }
1175
1176 /*
1177  * ==========================================================================
1178  * Range tree callbacks
1179  * ==========================================================================
1180  */
1181
1182 /*
1183  * Comparison function for the private size-ordered tree. Tree is sorted
1184  * by size, larger sizes at the end of the tree.
1185  */
1186 static int
1187 metaslab_rangesize_compare(const void *x1, const void *x2)
1188 {
1189         const range_seg_t *r1 = x1;
1190         const range_seg_t *r2 = x2;
1191         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1192         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1193
1194         if (rs_size1 < rs_size2)
1195                 return (-1);
1196         if (rs_size1 > rs_size2)
1197                 return (1);
1198
1199         if (r1->rs_start < r2->rs_start)
1200                 return (-1);
1201
1202         if (r1->rs_start > r2->rs_start)
1203                 return (1);
1204
1205         return (0);
1206 }
1207
1208 /*
1209  * ==========================================================================
1210  * Common allocator routines
1211  * ==========================================================================
1212  */
1213
1214 /*
1215  * Return the maximum contiguous segment within the metaslab.
1216  */
1217 uint64_t
1218 metaslab_block_maxsize(metaslab_t *msp)
1219 {
1220         avl_tree_t *t = &msp->ms_allocatable_by_size;
1221         range_seg_t *rs;
1222
1223         if (t == NULL || (rs = avl_last(t)) == NULL)
1224                 return (0ULL);
1225
1226         return (rs->rs_end - rs->rs_start);
1227 }
1228
1229 static range_seg_t *
1230 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1231 {
1232         range_seg_t *rs, rsearch;
1233         avl_index_t where;
1234
1235         rsearch.rs_start = start;
1236         rsearch.rs_end = start + size;
1237
1238         rs = avl_find(t, &rsearch, &where);
1239         if (rs == NULL) {
1240                 rs = avl_nearest(t, where, AVL_AFTER);
1241         }
1242
1243         return (rs);
1244 }
1245
1246 /*
1247  * This is a helper function that can be used by the allocator to find
1248  * a suitable block to allocate. This will search the specified AVL
1249  * tree looking for a block that matches the specified criteria.
1250  */
1251 static uint64_t
1252 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1253     uint64_t align)
1254 {
1255         range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1256
1257         while (rs != NULL) {
1258                 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1259
1260                 if (offset + size <= rs->rs_end) {
1261                         *cursor = offset + size;
1262                         return (offset);
1263                 }
1264                 rs = AVL_NEXT(t, rs);
1265         }
1266
1267         /*
1268          * If we know we've searched the whole map (*cursor == 0), give up.
1269          * Otherwise, reset the cursor to the beginning and try again.
1270          */
1271         if (*cursor == 0)
1272                 return (-1ULL);
1273
1274         *cursor = 0;
1275         return (metaslab_block_picker(t, cursor, size, align));
1276 }
1277
1278 /*
1279  * ==========================================================================
1280  * The first-fit block allocator
1281  * ==========================================================================
1282  */
1283 static uint64_t
1284 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1285 {
1286         /*
1287          * Find the largest power of 2 block size that evenly divides the
1288          * requested size. This is used to try to allocate blocks with similar
1289          * alignment from the same area of the metaslab (i.e. same cursor
1290          * bucket) but it does not guarantee that other allocations sizes
1291          * may exist in the same region.
1292          */
1293         uint64_t align = size & -size;
1294         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1295         avl_tree_t *t = &msp->ms_allocatable->rt_root;
1296
1297         return (metaslab_block_picker(t, cursor, size, align));
1298 }
1299
1300 static metaslab_ops_t metaslab_ff_ops = {
1301         metaslab_ff_alloc
1302 };
1303
1304 /*
1305  * ==========================================================================
1306  * Dynamic block allocator -
1307  * Uses the first fit allocation scheme until space get low and then
1308  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1309  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1310  * ==========================================================================
1311  */
1312 static uint64_t
1313 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1314 {
1315         /*
1316          * Find the largest power of 2 block size that evenly divides the
1317          * requested size. This is used to try to allocate blocks with similar
1318          * alignment from the same area of the metaslab (i.e. same cursor
1319          * bucket) but it does not guarantee that other allocations sizes
1320          * may exist in the same region.
1321          */
1322         uint64_t align = size & -size;
1323         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1324         range_tree_t *rt = msp->ms_allocatable;
1325         avl_tree_t *t = &rt->rt_root;
1326         uint64_t max_size = metaslab_block_maxsize(msp);
1327         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1328
1329         ASSERT(MUTEX_HELD(&msp->ms_lock));
1330         ASSERT3U(avl_numnodes(t), ==,
1331             avl_numnodes(&msp->ms_allocatable_by_size));
1332
1333         if (max_size < size)
1334                 return (-1ULL);
1335
1336         /*
1337          * If we're running low on space switch to using the size
1338          * sorted AVL tree (best-fit).
1339          */
1340         if (max_size < metaslab_df_alloc_threshold ||
1341             free_pct < metaslab_df_free_pct) {
1342                 t = &msp->ms_allocatable_by_size;
1343                 *cursor = 0;
1344         }
1345
1346         return (metaslab_block_picker(t, cursor, size, 1ULL));
1347 }
1348
1349 static metaslab_ops_t metaslab_df_ops = {
1350         metaslab_df_alloc
1351 };
1352
1353 /*
1354  * ==========================================================================
1355  * Cursor fit block allocator -
1356  * Select the largest region in the metaslab, set the cursor to the beginning
1357  * of the range and the cursor_end to the end of the range. As allocations
1358  * are made advance the cursor. Continue allocating from the cursor until
1359  * the range is exhausted and then find a new range.
1360  * ==========================================================================
1361  */
1362 static uint64_t
1363 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1364 {
1365         range_tree_t *rt = msp->ms_allocatable;
1366         avl_tree_t *t = &msp->ms_allocatable_by_size;
1367         uint64_t *cursor = &msp->ms_lbas[0];
1368         uint64_t *cursor_end = &msp->ms_lbas[1];
1369         uint64_t offset = 0;
1370
1371         ASSERT(MUTEX_HELD(&msp->ms_lock));
1372         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1373
1374         ASSERT3U(*cursor_end, >=, *cursor);
1375
1376         if ((*cursor + size) > *cursor_end) {
1377                 range_seg_t *rs;
1378
1379                 rs = avl_last(&msp->ms_allocatable_by_size);
1380                 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1381                         return (-1ULL);
1382
1383                 *cursor = rs->rs_start;
1384                 *cursor_end = rs->rs_end;
1385         }
1386
1387         offset = *cursor;
1388         *cursor += size;
1389
1390         return (offset);
1391 }
1392
1393 static metaslab_ops_t metaslab_cf_ops = {
1394         metaslab_cf_alloc
1395 };
1396
1397 /*
1398  * ==========================================================================
1399  * New dynamic fit allocator -
1400  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1401  * contiguous blocks. If no region is found then just use the largest segment
1402  * that remains.
1403  * ==========================================================================
1404  */
1405
1406 /*
1407  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1408  * to request from the allocator.
1409  */
1410 uint64_t metaslab_ndf_clump_shift = 4;
1411
1412 static uint64_t
1413 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1414 {
1415         avl_tree_t *t = &msp->ms_allocatable->rt_root;
1416         avl_index_t where;
1417         range_seg_t *rs, rsearch;
1418         uint64_t hbit = highbit64(size);
1419         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1420         uint64_t max_size = metaslab_block_maxsize(msp);
1421
1422         ASSERT(MUTEX_HELD(&msp->ms_lock));
1423         ASSERT3U(avl_numnodes(t), ==,
1424             avl_numnodes(&msp->ms_allocatable_by_size));
1425
1426         if (max_size < size)
1427                 return (-1ULL);
1428
1429         rsearch.rs_start = *cursor;
1430         rsearch.rs_end = *cursor + size;
1431
1432         rs = avl_find(t, &rsearch, &where);
1433         if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1434                 t = &msp->ms_allocatable_by_size;
1435
1436                 rsearch.rs_start = 0;
1437                 rsearch.rs_end = MIN(max_size,
1438                     1ULL << (hbit + metaslab_ndf_clump_shift));
1439                 rs = avl_find(t, &rsearch, &where);
1440                 if (rs == NULL)
1441                         rs = avl_nearest(t, where, AVL_AFTER);
1442                 ASSERT(rs != NULL);
1443         }
1444
1445         if ((rs->rs_end - rs->rs_start) >= size) {
1446                 *cursor = rs->rs_start + size;
1447                 return (rs->rs_start);
1448         }
1449         return (-1ULL);
1450 }
1451
1452 static metaslab_ops_t metaslab_ndf_ops = {
1453         metaslab_ndf_alloc
1454 };
1455
1456 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1457
1458 /*
1459  * ==========================================================================
1460  * Metaslabs
1461  * ==========================================================================
1462  */
1463
1464 /*
1465  * Wait for any in-progress metaslab loads to complete.
1466  */
1467 void
1468 metaslab_load_wait(metaslab_t *msp)
1469 {
1470         ASSERT(MUTEX_HELD(&msp->ms_lock));
1471
1472         while (msp->ms_loading) {
1473                 ASSERT(!msp->ms_loaded);
1474                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1475         }
1476 }
1477
1478 int
1479 metaslab_load(metaslab_t *msp)
1480 {
1481         int error = 0;
1482         boolean_t success = B_FALSE;
1483
1484         ASSERT(MUTEX_HELD(&msp->ms_lock));
1485         ASSERT(!msp->ms_loaded);
1486         ASSERT(!msp->ms_loading);
1487
1488         msp->ms_loading = B_TRUE;
1489         /*
1490          * Nobody else can manipulate a loading metaslab, so it's now safe
1491          * to drop the lock.  This way we don't have to hold the lock while
1492          * reading the spacemap from disk.
1493          */
1494         mutex_exit(&msp->ms_lock);
1495
1496         /*
1497          * If the space map has not been allocated yet, then treat
1498          * all the space in the metaslab as free and add it to ms_allocatable.
1499          */
1500         if (msp->ms_sm != NULL) {
1501                 error = space_map_load(msp->ms_sm, msp->ms_allocatable,
1502                     SM_FREE);
1503         } else {
1504                 range_tree_add(msp->ms_allocatable,
1505                     msp->ms_start, msp->ms_size);
1506         }
1507
1508         success = (error == 0);
1509
1510         mutex_enter(&msp->ms_lock);
1511         msp->ms_loading = B_FALSE;
1512
1513         if (success) {
1514                 ASSERT3P(msp->ms_group, !=, NULL);
1515                 msp->ms_loaded = B_TRUE;
1516
1517                 /*
1518                  * If the metaslab already has a spacemap, then we need to
1519                  * remove all segments from the defer tree; otherwise, the
1520                  * metaslab is completely empty and we can skip this.
1521                  */
1522                 if (msp->ms_sm != NULL) {
1523                         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1524                                 range_tree_walk(msp->ms_defer[t],
1525                                     range_tree_remove, msp->ms_allocatable);
1526                         }
1527                 }
1528                 msp->ms_max_size = metaslab_block_maxsize(msp);
1529         }
1530         cv_broadcast(&msp->ms_load_cv);
1531         return (error);
1532 }
1533
1534 void
1535 metaslab_unload(metaslab_t *msp)
1536 {
1537         ASSERT(MUTEX_HELD(&msp->ms_lock));
1538         range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1539         msp->ms_loaded = B_FALSE;
1540         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1541         msp->ms_max_size = 0;
1542 }
1543
1544 int
1545 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1546     metaslab_t **msp)
1547 {
1548         vdev_t *vd = mg->mg_vd;
1549         objset_t *mos = vd->vdev_spa->spa_meta_objset;
1550         metaslab_t *ms;
1551         int error;
1552
1553         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1554         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1555         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1556         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1557         ms->ms_id = id;
1558         ms->ms_start = id << vd->vdev_ms_shift;
1559         ms->ms_size = 1ULL << vd->vdev_ms_shift;
1560         ms->ms_allocator = -1;
1561         ms->ms_new = B_TRUE;
1562
1563         /*
1564          * We only open space map objects that already exist. All others
1565          * will be opened when we finally allocate an object for it.
1566          */
1567         if (object != 0) {
1568                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1569                     ms->ms_size, vd->vdev_ashift);
1570
1571                 if (error != 0) {
1572                         kmem_free(ms, sizeof (metaslab_t));
1573                         return (error);
1574                 }
1575
1576                 ASSERT(ms->ms_sm != NULL);
1577         }
1578
1579         /*
1580          * We create the main range tree here, but we don't create the
1581          * other range trees until metaslab_sync_done().  This serves
1582          * two purposes: it allows metaslab_sync_done() to detect the
1583          * addition of new space; and for debugging, it ensures that we'd
1584          * data fault on any attempt to use this metaslab before it's ready.
1585          */
1586         ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
1587             metaslab_rangesize_compare, 0);
1588         metaslab_group_add(mg, ms);
1589
1590         metaslab_set_fragmentation(ms);
1591
1592         /*
1593          * If we're opening an existing pool (txg == 0) or creating
1594          * a new one (txg == TXG_INITIAL), all space is available now.
1595          * If we're adding space to an existing pool, the new space
1596          * does not become available until after this txg has synced.
1597          * The metaslab's weight will also be initialized when we sync
1598          * out this txg. This ensures that we don't attempt to allocate
1599          * from it before we have initialized it completely.
1600          */
1601         if (txg <= TXG_INITIAL)
1602                 metaslab_sync_done(ms, 0);
1603
1604         /*
1605          * If metaslab_debug_load is set and we're initializing a metaslab
1606          * that has an allocated space map object then load the its space
1607          * map so that can verify frees.
1608          */
1609         if (metaslab_debug_load && ms->ms_sm != NULL) {
1610                 mutex_enter(&ms->ms_lock);
1611                 VERIFY0(metaslab_load(ms));
1612                 mutex_exit(&ms->ms_lock);
1613         }
1614
1615         if (txg != 0) {
1616                 vdev_dirty(vd, 0, NULL, txg);
1617                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1618         }
1619
1620         *msp = ms;
1621
1622         return (0);
1623 }
1624
1625 void
1626 metaslab_fini(metaslab_t *msp)
1627 {
1628         metaslab_group_t *mg = msp->ms_group;
1629
1630         metaslab_group_remove(mg, msp);
1631
1632         mutex_enter(&msp->ms_lock);
1633         VERIFY(msp->ms_group == NULL);
1634         vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1635             0, -msp->ms_size);
1636         space_map_close(msp->ms_sm);
1637
1638         metaslab_unload(msp);
1639         range_tree_destroy(msp->ms_allocatable);
1640         range_tree_destroy(msp->ms_freeing);
1641         range_tree_destroy(msp->ms_freed);
1642
1643         for (int t = 0; t < TXG_SIZE; t++) {
1644                 range_tree_destroy(msp->ms_allocating[t]);
1645         }
1646
1647         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1648                 range_tree_destroy(msp->ms_defer[t]);
1649         }
1650         ASSERT0(msp->ms_deferspace);
1651
1652         range_tree_destroy(msp->ms_checkpointing);
1653
1654         mutex_exit(&msp->ms_lock);
1655         cv_destroy(&msp->ms_load_cv);
1656         mutex_destroy(&msp->ms_lock);
1657         mutex_destroy(&msp->ms_sync_lock);
1658         ASSERT3U(msp->ms_allocator, ==, -1);
1659
1660         kmem_free(msp, sizeof (metaslab_t));
1661 }
1662
1663 #define FRAGMENTATION_TABLE_SIZE        17
1664
1665 /*
1666  * This table defines a segment size based fragmentation metric that will
1667  * allow each metaslab to derive its own fragmentation value. This is done
1668  * by calculating the space in each bucket of the spacemap histogram and
1669  * multiplying that by the fragmetation metric in this table. Doing
1670  * this for all buckets and dividing it by the total amount of free
1671  * space in this metaslab (i.e. the total free space in all buckets) gives
1672  * us the fragmentation metric. This means that a high fragmentation metric
1673  * equates to most of the free space being comprised of small segments.
1674  * Conversely, if the metric is low, then most of the free space is in
1675  * large segments. A 10% change in fragmentation equates to approximately
1676  * double the number of segments.
1677  *
1678  * This table defines 0% fragmented space using 16MB segments. Testing has
1679  * shown that segments that are greater than or equal to 16MB do not suffer
1680  * from drastic performance problems. Using this value, we derive the rest
1681  * of the table. Since the fragmentation value is never stored on disk, it
1682  * is possible to change these calculations in the future.
1683  */
1684 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1685         100,    /* 512B */
1686         100,    /* 1K   */
1687         98,     /* 2K   */
1688         95,     /* 4K   */
1689         90,     /* 8K   */
1690         80,     /* 16K  */
1691         70,     /* 32K  */
1692         60,     /* 64K  */
1693         50,     /* 128K */
1694         40,     /* 256K */
1695         30,     /* 512K */
1696         20,     /* 1M   */
1697         15,     /* 2M   */
1698         10,     /* 4M   */
1699         5,      /* 8M   */
1700         0       /* 16M  */
1701 };
1702
1703 /*
1704  * Calclate the metaslab's fragmentation metric. A return value
1705  * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1706  * not support this metric. Otherwise, the return value should be in the
1707  * range [0, 100].
1708  */
1709 static void
1710 metaslab_set_fragmentation(metaslab_t *msp)
1711 {
1712         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1713         uint64_t fragmentation = 0;
1714         uint64_t total = 0;
1715         boolean_t feature_enabled = spa_feature_is_enabled(spa,
1716             SPA_FEATURE_SPACEMAP_HISTOGRAM);
1717
1718         if (!feature_enabled) {
1719                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1720                 return;
1721         }
1722
1723         /*
1724          * A null space map means that the entire metaslab is free
1725          * and thus is not fragmented.
1726          */
1727         if (msp->ms_sm == NULL) {
1728                 msp->ms_fragmentation = 0;
1729                 return;
1730         }
1731
1732         /*
1733          * If this metaslab's space map has not been upgraded, flag it
1734          * so that we upgrade next time we encounter it.
1735          */
1736         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1737                 uint64_t txg = spa_syncing_txg(spa);
1738                 vdev_t *vd = msp->ms_group->mg_vd;
1739
1740                 /*
1741                  * If we've reached the final dirty txg, then we must
1742                  * be shutting down the pool. We don't want to dirty
1743                  * any data past this point so skip setting the condense
1744                  * flag. We can retry this action the next time the pool
1745                  * is imported.
1746                  */
1747                 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1748                         msp->ms_condense_wanted = B_TRUE;
1749                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1750                         spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1751                             "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1752                             vd->vdev_id);
1753                 }
1754                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1755                 return;
1756         }
1757
1758         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1759                 uint64_t space = 0;
1760                 uint8_t shift = msp->ms_sm->sm_shift;
1761
1762                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1763                     FRAGMENTATION_TABLE_SIZE - 1);
1764
1765                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1766                         continue;
1767
1768                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1769                 total += space;
1770
1771                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1772                 fragmentation += space * zfs_frag_table[idx];
1773         }
1774
1775         if (total > 0)
1776                 fragmentation /= total;
1777         ASSERT3U(fragmentation, <=, 100);
1778
1779         msp->ms_fragmentation = fragmentation;
1780 }
1781
1782 /*
1783  * Compute a weight -- a selection preference value -- for the given metaslab.
1784  * This is based on the amount of free space, the level of fragmentation,
1785  * the LBA range, and whether the metaslab is loaded.
1786  */
1787 static uint64_t
1788 metaslab_space_weight(metaslab_t *msp)
1789 {
1790         metaslab_group_t *mg = msp->ms_group;
1791         vdev_t *vd = mg->mg_vd;
1792         uint64_t weight, space;
1793
1794         ASSERT(MUTEX_HELD(&msp->ms_lock));
1795         ASSERT(!vd->vdev_removing);
1796
1797         /*
1798          * The baseline weight is the metaslab's free space.
1799          */
1800         space = msp->ms_size - space_map_allocated(msp->ms_sm);
1801
1802         if (metaslab_fragmentation_factor_enabled &&
1803             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1804                 /*
1805                  * Use the fragmentation information to inversely scale
1806                  * down the baseline weight. We need to ensure that we
1807                  * don't exclude this metaslab completely when it's 100%
1808                  * fragmented. To avoid this we reduce the fragmented value
1809                  * by 1.
1810                  */
1811                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1812
1813                 /*
1814                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1815                  * this metaslab again. The fragmentation metric may have
1816                  * decreased the space to something smaller than
1817                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1818                  * so that we can consume any remaining space.
1819                  */
1820                 if (space > 0 && space < SPA_MINBLOCKSIZE)
1821                         space = SPA_MINBLOCKSIZE;
1822         }
1823         weight = space;
1824
1825         /*
1826          * Modern disks have uniform bit density and constant angular velocity.
1827          * Therefore, the outer recording zones are faster (higher bandwidth)
1828          * than the inner zones by the ratio of outer to inner track diameter,
1829          * which is typically around 2:1.  We account for this by assigning
1830          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1831          * In effect, this means that we'll select the metaslab with the most
1832          * free bandwidth rather than simply the one with the most free space.
1833          */
1834         if (metaslab_lba_weighting_enabled) {
1835                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1836                 ASSERT(weight >= space && weight <= 2 * space);
1837         }
1838
1839         /*
1840          * If this metaslab is one we're actively using, adjust its
1841          * weight to make it preferable to any inactive metaslab so
1842          * we'll polish it off. If the fragmentation on this metaslab
1843          * has exceed our threshold, then don't mark it active.
1844          */
1845         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1846             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1847                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1848         }
1849
1850         WEIGHT_SET_SPACEBASED(weight);
1851         return (weight);
1852 }
1853
1854 /*
1855  * Return the weight of the specified metaslab, according to the segment-based
1856  * weighting algorithm. The metaslab must be loaded. This function can
1857  * be called within a sync pass since it relies only on the metaslab's
1858  * range tree which is always accurate when the metaslab is loaded.
1859  */
1860 static uint64_t
1861 metaslab_weight_from_range_tree(metaslab_t *msp)
1862 {
1863         uint64_t weight = 0;
1864         uint32_t segments = 0;
1865
1866         ASSERT(msp->ms_loaded);
1867
1868         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1869             i--) {
1870                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1871                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1872
1873                 segments <<= 1;
1874                 segments += msp->ms_allocatable->rt_histogram[i];
1875
1876                 /*
1877                  * The range tree provides more precision than the space map
1878                  * and must be downgraded so that all values fit within the
1879                  * space map's histogram. This allows us to compare loaded
1880                  * vs. unloaded metaslabs to determine which metaslab is
1881                  * considered "best".
1882                  */
1883                 if (i > max_idx)
1884                         continue;
1885
1886                 if (segments != 0) {
1887                         WEIGHT_SET_COUNT(weight, segments);
1888                         WEIGHT_SET_INDEX(weight, i);
1889                         WEIGHT_SET_ACTIVE(weight, 0);
1890                         break;
1891                 }
1892         }
1893         return (weight);
1894 }
1895
1896 /*
1897  * Calculate the weight based on the on-disk histogram. This should only
1898  * be called after a sync pass has completely finished since the on-disk
1899  * information is updated in metaslab_sync().
1900  */
1901 static uint64_t
1902 metaslab_weight_from_spacemap(metaslab_t *msp)
1903 {
1904         uint64_t weight = 0;
1905
1906         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1907                 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1908                         WEIGHT_SET_COUNT(weight,
1909                             msp->ms_sm->sm_phys->smp_histogram[i]);
1910                         WEIGHT_SET_INDEX(weight, i +
1911                             msp->ms_sm->sm_shift);
1912                         WEIGHT_SET_ACTIVE(weight, 0);
1913                         break;
1914                 }
1915         }
1916         return (weight);
1917 }
1918
1919 /*
1920  * Compute a segment-based weight for the specified metaslab. The weight
1921  * is determined by highest bucket in the histogram. The information
1922  * for the highest bucket is encoded into the weight value.
1923  */
1924 static uint64_t
1925 metaslab_segment_weight(metaslab_t *msp)
1926 {
1927         metaslab_group_t *mg = msp->ms_group;
1928         uint64_t weight = 0;
1929         uint8_t shift = mg->mg_vd->vdev_ashift;
1930
1931         ASSERT(MUTEX_HELD(&msp->ms_lock));
1932
1933         /*
1934          * The metaslab is completely free.
1935          */
1936         if (space_map_allocated(msp->ms_sm) == 0) {
1937                 int idx = highbit64(msp->ms_size) - 1;
1938                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1939
1940                 if (idx < max_idx) {
1941                         WEIGHT_SET_COUNT(weight, 1ULL);
1942                         WEIGHT_SET_INDEX(weight, idx);
1943                 } else {
1944                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1945                         WEIGHT_SET_INDEX(weight, max_idx);
1946                 }
1947                 WEIGHT_SET_ACTIVE(weight, 0);
1948                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1949
1950                 return (weight);
1951         }
1952
1953         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1954
1955         /*
1956          * If the metaslab is fully allocated then just make the weight 0.
1957          */
1958         if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1959                 return (0);
1960         /*
1961          * If the metaslab is already loaded, then use the range tree to
1962          * determine the weight. Otherwise, we rely on the space map information
1963          * to generate the weight.
1964          */
1965         if (msp->ms_loaded) {
1966                 weight = metaslab_weight_from_range_tree(msp);
1967         } else {
1968                 weight = metaslab_weight_from_spacemap(msp);
1969         }
1970
1971         /*
1972          * If the metaslab was active the last time we calculated its weight
1973          * then keep it active. We want to consume the entire region that
1974          * is associated with this weight.
1975          */
1976         if (msp->ms_activation_weight != 0 && weight != 0)
1977                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1978         return (weight);
1979 }
1980
1981 /*
1982  * Determine if we should attempt to allocate from this metaslab. If the
1983  * metaslab has a maximum size then we can quickly determine if the desired
1984  * allocation size can be satisfied. Otherwise, if we're using segment-based
1985  * weighting then we can determine the maximum allocation that this metaslab
1986  * can accommodate based on the index encoded in the weight. If we're using
1987  * space-based weights then rely on the entire weight (excluding the weight
1988  * type bit).
1989  */
1990 boolean_t
1991 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1992 {
1993         boolean_t should_allocate;
1994
1995         if (msp->ms_max_size != 0)
1996                 return (msp->ms_max_size >= asize);
1997
1998         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1999                 /*
2000                  * The metaslab segment weight indicates segments in the
2001                  * range [2^i, 2^(i+1)), where i is the index in the weight.
2002                  * Since the asize might be in the middle of the range, we
2003                  * should attempt the allocation if asize < 2^(i+1).
2004                  */
2005                 should_allocate = (asize <
2006                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
2007         } else {
2008                 should_allocate = (asize <=
2009                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
2010         }
2011         return (should_allocate);
2012 }
2013
2014 static uint64_t
2015 metaslab_weight(metaslab_t *msp)
2016 {
2017         vdev_t *vd = msp->ms_group->mg_vd;
2018         spa_t *spa = vd->vdev_spa;
2019         uint64_t weight;
2020
2021         ASSERT(MUTEX_HELD(&msp->ms_lock));
2022
2023         /*
2024          * If this vdev is in the process of being removed, there is nothing
2025          * for us to do here.
2026          */
2027         if (vd->vdev_removing)
2028                 return (0);
2029
2030         metaslab_set_fragmentation(msp);
2031
2032         /*
2033          * Update the maximum size if the metaslab is loaded. This will
2034          * ensure that we get an accurate maximum size if newly freed space
2035          * has been added back into the free tree.
2036          */
2037         if (msp->ms_loaded)
2038                 msp->ms_max_size = metaslab_block_maxsize(msp);
2039
2040         /*
2041          * Segment-based weighting requires space map histogram support.
2042          */
2043         if (zfs_metaslab_segment_weight_enabled &&
2044             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2045             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2046             sizeof (space_map_phys_t))) {
2047                 weight = metaslab_segment_weight(msp);
2048         } else {
2049                 weight = metaslab_space_weight(msp);
2050         }
2051         return (weight);
2052 }
2053
2054 static int
2055 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2056     int allocator, uint64_t activation_weight)
2057 {
2058         /*
2059          * If we're activating for the claim code, we don't want to actually
2060          * set the metaslab up for a specific allocator.
2061          */
2062         if (activation_weight == METASLAB_WEIGHT_CLAIM)
2063                 return (0);
2064         metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2065             mg->mg_primaries : mg->mg_secondaries);
2066
2067         ASSERT(MUTEX_HELD(&msp->ms_lock));
2068         mutex_enter(&mg->mg_lock);
2069         if (arr[allocator] != NULL) {
2070                 mutex_exit(&mg->mg_lock);
2071                 return (EEXIST);
2072         }
2073
2074         arr[allocator] = msp;
2075         ASSERT3S(msp->ms_allocator, ==, -1);
2076         msp->ms_allocator = allocator;
2077         msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2078         mutex_exit(&mg->mg_lock);
2079
2080         return (0);
2081 }
2082
2083 static int
2084 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2085 {
2086         ASSERT(MUTEX_HELD(&msp->ms_lock));
2087
2088         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2089                 int error = 0;
2090                 metaslab_load_wait(msp);
2091                 if (!msp->ms_loaded) {
2092                         if ((error = metaslab_load(msp)) != 0) {
2093                                 metaslab_group_sort(msp->ms_group, msp, 0);
2094                                 return (error);
2095                         }
2096                 }
2097                 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2098                         /*
2099                          * The metaslab was activated for another allocator
2100                          * while we were waiting, we should reselect.
2101                          */
2102                         return (EBUSY);
2103                 }
2104                 if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2105                     allocator, activation_weight)) != 0) {
2106                         return (error);
2107                 }
2108
2109                 msp->ms_activation_weight = msp->ms_weight;
2110                 metaslab_group_sort(msp->ms_group, msp,
2111                     msp->ms_weight | activation_weight);
2112         }
2113         ASSERT(msp->ms_loaded);
2114         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2115
2116         return (0);
2117 }
2118
2119 static void
2120 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2121     uint64_t weight)
2122 {
2123         ASSERT(MUTEX_HELD(&msp->ms_lock));
2124         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2125                 metaslab_group_sort(mg, msp, weight);
2126                 return;
2127         }
2128
2129         mutex_enter(&mg->mg_lock);
2130         ASSERT3P(msp->ms_group, ==, mg);
2131         if (msp->ms_primary) {
2132                 ASSERT3U(0, <=, msp->ms_allocator);
2133                 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
2134                 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
2135                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
2136                 mg->mg_primaries[msp->ms_allocator] = NULL;
2137         } else {
2138                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
2139                 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
2140                 mg->mg_secondaries[msp->ms_allocator] = NULL;
2141         }
2142         msp->ms_allocator = -1;
2143         metaslab_group_sort_impl(mg, msp, weight);
2144         mutex_exit(&mg->mg_lock);
2145 }
2146
2147 static void
2148 metaslab_passivate(metaslab_t *msp, uint64_t weight)
2149 {
2150         uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2151
2152         /*
2153          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2154          * this metaslab again.  In that case, it had better be empty,
2155          * or we would be leaving space on the table.
2156          */
2157         ASSERT(size >= SPA_MINBLOCKSIZE ||
2158             range_tree_is_empty(msp->ms_allocatable));
2159         ASSERT0(weight & METASLAB_ACTIVE_MASK);
2160
2161         msp->ms_activation_weight = 0;
2162         metaslab_passivate_allocator(msp->ms_group, msp, weight);
2163         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2164 }
2165
2166 /*
2167  * Segment-based metaslabs are activated once and remain active until
2168  * we either fail an allocation attempt (similar to space-based metaslabs)
2169  * or have exhausted the free space in zfs_metaslab_switch_threshold
2170  * buckets since the metaslab was activated. This function checks to see
2171  * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
2172  * metaslab and passivates it proactively. This will allow us to select a
2173  * metaslabs with larger contiguous region if any remaining within this
2174  * metaslab group. If we're in sync pass > 1, then we continue using this
2175  * metaslab so that we don't dirty more block and cause more sync passes.
2176  */
2177 void
2178 metaslab_segment_may_passivate(metaslab_t *msp)
2179 {
2180         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2181
2182         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
2183                 return;
2184
2185         /*
2186          * Since we are in the middle of a sync pass, the most accurate
2187          * information that is accessible to us is the in-core range tree
2188          * histogram; calculate the new weight based on that information.
2189          */
2190         uint64_t weight = metaslab_weight_from_range_tree(msp);
2191         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
2192         int current_idx = WEIGHT_GET_INDEX(weight);
2193
2194         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
2195                 metaslab_passivate(msp, weight);
2196 }
2197
2198 static void
2199 metaslab_preload(void *arg)
2200 {
2201         metaslab_t *msp = arg;
2202         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2203
2204         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
2205
2206         mutex_enter(&msp->ms_lock);
2207         metaslab_load_wait(msp);
2208         if (!msp->ms_loaded)
2209                 (void) metaslab_load(msp);
2210         msp->ms_selected_txg = spa_syncing_txg(spa);
2211         mutex_exit(&msp->ms_lock);
2212 }
2213
2214 static void
2215 metaslab_group_preload(metaslab_group_t *mg)
2216 {
2217         spa_t *spa = mg->mg_vd->vdev_spa;
2218         metaslab_t *msp;
2219         avl_tree_t *t = &mg->mg_metaslab_tree;
2220         int m = 0;
2221
2222         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
2223                 taskq_wait(mg->mg_taskq);
2224                 return;
2225         }
2226
2227         mutex_enter(&mg->mg_lock);
2228
2229         /*
2230          * Load the next potential metaslabs
2231          */
2232         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2233                 ASSERT3P(msp->ms_group, ==, mg);
2234
2235                 /*
2236                  * We preload only the maximum number of metaslabs specified
2237                  * by metaslab_preload_limit. If a metaslab is being forced
2238                  * to condense then we preload it too. This will ensure
2239                  * that force condensing happens in the next txg.
2240                  */
2241                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2242                         continue;
2243                 }
2244
2245                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2246                     msp, TQ_SLEEP) != 0);
2247         }
2248         mutex_exit(&mg->mg_lock);
2249 }
2250
2251 /*
2252  * Determine if the space map's on-disk footprint is past our tolerance
2253  * for inefficiency. We would like to use the following criteria to make
2254  * our decision:
2255  *
2256  * 1. The size of the space map object should not dramatically increase as a
2257  * result of writing out the free space range tree.
2258  *
2259  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2260  * times the size than the free space range tree representation
2261  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2262  *
2263  * 3. The on-disk size of the space map should actually decrease.
2264  *
2265  * Unfortunately, we cannot compute the on-disk size of the space map in this
2266  * context because we cannot accurately compute the effects of compression, etc.
2267  * Instead, we apply the heuristic described in the block comment for
2268  * zfs_metaslab_condense_block_threshold - we only condense if the space used
2269  * is greater than a threshold number of blocks.
2270  */
2271 static boolean_t
2272 metaslab_should_condense(metaslab_t *msp)
2273 {
2274         space_map_t *sm = msp->ms_sm;
2275         vdev_t *vd = msp->ms_group->mg_vd;
2276         uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
2277         uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
2278
2279         ASSERT(MUTEX_HELD(&msp->ms_lock));
2280         ASSERT(msp->ms_loaded);
2281
2282         /*
2283          * Allocations and frees in early passes are generally more space
2284          * efficient (in terms of blocks described in space map entries)
2285          * than the ones in later passes (e.g. we don't compress after
2286          * sync pass 5) and condensing a metaslab multiple times in a txg
2287          * could degrade performance.
2288          *
2289          * Thus we prefer condensing each metaslab at most once every txg at
2290          * the earliest sync pass possible. If a metaslab is eligible for
2291          * condensing again after being considered for condensing within the
2292          * same txg, it will hopefully be dirty in the next txg where it will
2293          * be condensed at an earlier pass.
2294          */
2295         if (msp->ms_condense_checked_txg == current_txg)
2296                 return (B_FALSE);
2297         msp->ms_condense_checked_txg = current_txg;
2298
2299         /*
2300          * We always condense metaslabs that are empty and metaslabs for
2301          * which a condense request has been made.
2302          */
2303         if (avl_is_empty(&msp->ms_allocatable_by_size) ||
2304             msp->ms_condense_wanted)
2305                 return (B_TRUE);
2306
2307         uint64_t object_size = space_map_length(msp->ms_sm);
2308         uint64_t optimal_size = space_map_estimate_optimal_size(sm,
2309             msp->ms_allocatable, SM_NO_VDEVID);
2310
2311         dmu_object_info_t doi;
2312         dmu_object_info_from_db(sm->sm_dbuf, &doi);
2313         uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2314
2315         return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
2316             object_size > zfs_metaslab_condense_block_threshold * record_size);
2317 }
2318
2319 /*
2320  * Condense the on-disk space map representation to its minimized form.
2321  * The minimized form consists of a small number of allocations followed by
2322  * the entries of the free range tree.
2323  */
2324 static void
2325 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2326 {
2327         range_tree_t *condense_tree;
2328         space_map_t *sm = msp->ms_sm;
2329
2330         ASSERT(MUTEX_HELD(&msp->ms_lock));
2331         ASSERT(msp->ms_loaded);
2332
2333         zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2334             "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2335             msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2336             msp->ms_group->mg_vd->vdev_spa->spa_name,
2337             space_map_length(msp->ms_sm),
2338             avl_numnodes(&msp->ms_allocatable->rt_root),
2339             msp->ms_condense_wanted ? "TRUE" : "FALSE");
2340
2341         msp->ms_condense_wanted = B_FALSE;
2342
2343         /*
2344          * Create an range tree that is 100% allocated. We remove segments
2345          * that have been freed in this txg, any deferred frees that exist,
2346          * and any allocation in the future. Removing segments should be
2347          * a relatively inexpensive operation since we expect these trees to
2348          * have a small number of nodes.
2349          */
2350         condense_tree = range_tree_create(NULL, NULL);
2351         range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2352
2353         range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
2354         range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
2355
2356         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2357                 range_tree_walk(msp->ms_defer[t],
2358                     range_tree_remove, condense_tree);
2359         }
2360
2361         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2362                 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
2363                     range_tree_remove, condense_tree);
2364         }
2365
2366         /*
2367          * We're about to drop the metaslab's lock thus allowing
2368          * other consumers to change it's content. Set the
2369          * metaslab's ms_condensing flag to ensure that
2370          * allocations on this metaslab do not occur while we're
2371          * in the middle of committing it to disk. This is only critical
2372          * for ms_allocatable as all other range trees use per txg
2373          * views of their content.
2374          */
2375         msp->ms_condensing = B_TRUE;
2376
2377         mutex_exit(&msp->ms_lock);
2378         space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
2379
2380         /*
2381          * While we would ideally like to create a space map representation
2382          * that consists only of allocation records, doing so can be
2383          * prohibitively expensive because the in-core free tree can be
2384          * large, and therefore computationally expensive to subtract
2385          * from the condense_tree. Instead we sync out two trees, a cheap
2386          * allocation only tree followed by the in-core free tree. While not
2387          * optimal, this is typically close to optimal, and much cheaper to
2388          * compute.
2389          */
2390         space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
2391         range_tree_vacate(condense_tree, NULL, NULL);
2392         range_tree_destroy(condense_tree);
2393
2394         space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
2395         mutex_enter(&msp->ms_lock);
2396         msp->ms_condensing = B_FALSE;
2397 }
2398
2399 /*
2400  * Write a metaslab to disk in the context of the specified transaction group.
2401  */
2402 void
2403 metaslab_sync(metaslab_t *msp, uint64_t txg)
2404 {
2405         metaslab_group_t *mg = msp->ms_group;
2406         vdev_t *vd = mg->mg_vd;
2407         spa_t *spa = vd->vdev_spa;
2408         objset_t *mos = spa_meta_objset(spa);
2409         range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
2410         dmu_tx_t *tx;
2411         uint64_t object = space_map_object(msp->ms_sm);
2412
2413         ASSERT(!vd->vdev_ishole);
2414
2415         /*
2416          * This metaslab has just been added so there's no work to do now.
2417          */
2418         if (msp->ms_freeing == NULL) {
2419                 ASSERT3P(alloctree, ==, NULL);
2420                 return;
2421         }
2422
2423         ASSERT3P(alloctree, !=, NULL);
2424         ASSERT3P(msp->ms_freeing, !=, NULL);
2425         ASSERT3P(msp->ms_freed, !=, NULL);
2426         ASSERT3P(msp->ms_checkpointing, !=, NULL);
2427
2428         /*
2429          * Normally, we don't want to process a metaslab if there are no
2430          * allocations or frees to perform. However, if the metaslab is being
2431          * forced to condense and it's loaded, we need to let it through.
2432          */
2433         if (range_tree_is_empty(alloctree) &&
2434             range_tree_is_empty(msp->ms_freeing) &&
2435             range_tree_is_empty(msp->ms_checkpointing) &&
2436             !(msp->ms_loaded && msp->ms_condense_wanted))
2437                 return;
2438
2439
2440         VERIFY(txg <= spa_final_dirty_txg(spa));
2441
2442         /*
2443          * The only state that can actually be changing concurrently with
2444          * metaslab_sync() is the metaslab's ms_allocatable.  No other
2445          * thread can be modifying this txg's alloc, freeing,
2446          * freed, or space_map_phys_t.  We drop ms_lock whenever we
2447          * could call into the DMU, because the DMU can call down to us
2448          * (e.g. via zio_free()) at any time.
2449          *
2450          * The spa_vdev_remove_thread() can be reading metaslab state
2451          * concurrently, and it is locked out by the ms_sync_lock.  Note
2452          * that the ms_lock is insufficient for this, because it is dropped
2453          * by space_map_write().
2454          */
2455         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2456
2457         if (msp->ms_sm == NULL) {
2458                 uint64_t new_object;
2459
2460                 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2461                 VERIFY3U(new_object, !=, 0);
2462
2463                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2464                     msp->ms_start, msp->ms_size, vd->vdev_ashift));
2465                 ASSERT(msp->ms_sm != NULL);
2466         }
2467
2468         if (!range_tree_is_empty(msp->ms_checkpointing) &&
2469             vd->vdev_checkpoint_sm == NULL) {
2470                 ASSERT(spa_has_checkpoint(spa));
2471
2472                 uint64_t new_object = space_map_alloc(mos,
2473                     vdev_standard_sm_blksz, tx);
2474                 VERIFY3U(new_object, !=, 0);
2475
2476                 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2477                     mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2478                 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2479
2480                 /*
2481                  * We save the space map object as an entry in vdev_top_zap
2482                  * so it can be retrieved when the pool is reopened after an
2483                  * export or through zdb.
2484                  */
2485                 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2486                     vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
2487                     sizeof (new_object), 1, &new_object, tx));
2488         }
2489
2490         mutex_enter(&msp->ms_sync_lock);
2491         mutex_enter(&msp->ms_lock);
2492
2493         /*
2494          * Note: metaslab_condense() clears the space map's histogram.
2495          * Therefore we must verify and remove this histogram before
2496          * condensing.
2497          */
2498         metaslab_group_histogram_verify(mg);
2499         metaslab_class_histogram_verify(mg->mg_class);
2500         metaslab_group_histogram_remove(mg, msp);
2501
2502         if (msp->ms_loaded && metaslab_should_condense(msp)) {
2503                 metaslab_condense(msp, txg, tx);
2504         } else {
2505                 mutex_exit(&msp->ms_lock);
2506                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2507                     SM_NO_VDEVID, tx);
2508                 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2509                     SM_NO_VDEVID, tx);
2510                 mutex_enter(&msp->ms_lock);
2511         }
2512
2513         if (!range_tree_is_empty(msp->ms_checkpointing)) {
2514                 ASSERT(spa_has_checkpoint(spa));
2515                 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2516
2517                 /*
2518                  * Since we are doing writes to disk and the ms_checkpointing
2519                  * tree won't be changing during that time, we drop the
2520                  * ms_lock while writing to the checkpoint space map.
2521                  */
2522                 mutex_exit(&msp->ms_lock);
2523                 space_map_write(vd->vdev_checkpoint_sm,
2524                     msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2525                 mutex_enter(&msp->ms_lock);
2526                 space_map_update(vd->vdev_checkpoint_sm);
2527
2528                 spa->spa_checkpoint_info.sci_dspace +=
2529                     range_tree_space(msp->ms_checkpointing);
2530                 vd->vdev_stat.vs_checkpoint_space +=
2531                     range_tree_space(msp->ms_checkpointing);
2532                 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2533                     -vd->vdev_checkpoint_sm->sm_alloc);
2534
2535                 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2536         }
2537
2538         if (msp->ms_loaded) {
2539                 /*
2540                  * When the space map is loaded, we have an accurate
2541                  * histogram in the range tree. This gives us an opportunity
2542                  * to bring the space map's histogram up-to-date so we clear
2543                  * it first before updating it.
2544                  */
2545                 space_map_histogram_clear(msp->ms_sm);
2546                 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2547
2548                 /*
2549                  * Since we've cleared the histogram we need to add back
2550                  * any free space that has already been processed, plus
2551                  * any deferred space. This allows the on-disk histogram
2552                  * to accurately reflect all free space even if some space
2553                  * is not yet available for allocation (i.e. deferred).
2554                  */
2555                 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
2556
2557                 /*
2558                  * Add back any deferred free space that has not been
2559                  * added back into the in-core free tree yet. This will
2560                  * ensure that we don't end up with a space map histogram
2561                  * that is completely empty unless the metaslab is fully
2562                  * allocated.
2563                  */
2564                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2565                         space_map_histogram_add(msp->ms_sm,
2566                             msp->ms_defer[t], tx);
2567                 }
2568         }
2569
2570         /*
2571          * Always add the free space from this sync pass to the space
2572          * map histogram. We want to make sure that the on-disk histogram
2573          * accounts for all free space. If the space map is not loaded,
2574          * then we will lose some accuracy but will correct it the next
2575          * time we load the space map.
2576          */
2577         space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
2578
2579         metaslab_group_histogram_add(mg, msp);
2580         metaslab_group_histogram_verify(mg);
2581         metaslab_class_histogram_verify(mg->mg_class);
2582
2583         /*
2584          * For sync pass 1, we avoid traversing this txg's free range tree
2585          * and instead will just swap the pointers for freeing and
2586          * freed. We can safely do this since the freed_tree is
2587          * guaranteed to be empty on the initial pass.
2588          */
2589         if (spa_sync_pass(spa) == 1) {
2590                 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
2591         } else {
2592                 range_tree_vacate(msp->ms_freeing,
2593                     range_tree_add, msp->ms_freed);
2594         }
2595         range_tree_vacate(alloctree, NULL, NULL);
2596
2597         ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2598         ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2599             & TXG_MASK]));
2600         ASSERT0(range_tree_space(msp->ms_freeing));
2601         ASSERT0(range_tree_space(msp->ms_checkpointing));
2602
2603         mutex_exit(&msp->ms_lock);
2604
2605         if (object != space_map_object(msp->ms_sm)) {
2606                 object = space_map_object(msp->ms_sm);
2607                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2608                     msp->ms_id, sizeof (uint64_t), &object, tx);
2609         }
2610         mutex_exit(&msp->ms_sync_lock);
2611         dmu_tx_commit(tx);
2612 }
2613
2614 /*
2615  * Called after a transaction group has completely synced to mark
2616  * all of the metaslab's free space as usable.
2617  */
2618 void
2619 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2620 {
2621         metaslab_group_t *mg = msp->ms_group;
2622         vdev_t *vd = mg->mg_vd;
2623         spa_t *spa = vd->vdev_spa;
2624         range_tree_t **defer_tree;
2625         int64_t alloc_delta, defer_delta;
2626         boolean_t defer_allowed = B_TRUE;
2627
2628         ASSERT(!vd->vdev_ishole);
2629
2630         mutex_enter(&msp->ms_lock);
2631
2632         /*
2633          * If this metaslab is just becoming available, initialize its
2634          * range trees and add its capacity to the vdev.
2635          */
2636         if (msp->ms_freed == NULL) {
2637                 for (int t = 0; t < TXG_SIZE; t++) {
2638                         ASSERT(msp->ms_allocating[t] == NULL);
2639
2640                         msp->ms_allocating[t] = range_tree_create(NULL, NULL);
2641                 }
2642
2643                 ASSERT3P(msp->ms_freeing, ==, NULL);
2644                 msp->ms_freeing = range_tree_create(NULL, NULL);
2645
2646                 ASSERT3P(msp->ms_freed, ==, NULL);
2647                 msp->ms_freed = range_tree_create(NULL, NULL);
2648
2649                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2650                         ASSERT(msp->ms_defer[t] == NULL);
2651
2652                         msp->ms_defer[t] = range_tree_create(NULL, NULL);
2653                 }
2654
2655                 ASSERT3P(msp->ms_checkpointing, ==, NULL);
2656                 msp->ms_checkpointing = range_tree_create(NULL, NULL);
2657
2658                 vdev_space_update(vd, 0, 0, msp->ms_size);
2659         }
2660         ASSERT0(range_tree_space(msp->ms_freeing));
2661         ASSERT0(range_tree_space(msp->ms_checkpointing));
2662
2663         defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
2664
2665         uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2666             metaslab_class_get_alloc(spa_normal_class(spa));
2667         if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2668                 defer_allowed = B_FALSE;
2669         }
2670
2671         defer_delta = 0;
2672         alloc_delta = space_map_alloc_delta(msp->ms_sm);
2673         if (defer_allowed) {
2674                 defer_delta = range_tree_space(msp->ms_freed) -
2675                     range_tree_space(*defer_tree);
2676         } else {
2677                 defer_delta -= range_tree_space(*defer_tree);
2678         }
2679
2680         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2681
2682         /*
2683          * If there's a metaslab_load() in progress, wait for it to complete
2684          * so that we have a consistent view of the in-core space map.
2685          */
2686         metaslab_load_wait(msp);
2687
2688         /*
2689          * Move the frees from the defer_tree back to the free
2690          * range tree (if it's loaded). Swap the freed_tree and
2691          * the defer_tree -- this is safe to do because we've
2692          * just emptied out the defer_tree.
2693          */
2694         range_tree_vacate(*defer_tree,
2695             msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
2696         if (defer_allowed) {
2697                 range_tree_swap(&msp->ms_freed, defer_tree);
2698         } else {
2699                 range_tree_vacate(msp->ms_freed,
2700                     msp->ms_loaded ? range_tree_add : NULL,
2701                     msp->ms_allocatable);
2702         }
2703         space_map_update(msp->ms_sm);
2704
2705         msp->ms_deferspace += defer_delta;
2706         ASSERT3S(msp->ms_deferspace, >=, 0);
2707         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2708         if (msp->ms_deferspace != 0) {
2709                 /*
2710                  * Keep syncing this metaslab until all deferred frees
2711                  * are back in circulation.
2712                  */
2713                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2714         }
2715
2716         if (msp->ms_new) {
2717                 msp->ms_new = B_FALSE;
2718                 mutex_enter(&mg->mg_lock);
2719                 mg->mg_ms_ready++;
2720                 mutex_exit(&mg->mg_lock);
2721         }
2722         /*
2723          * Calculate the new weights before unloading any metaslabs.
2724          * This will give us the most accurate weighting.
2725          */
2726         metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2727             (msp->ms_weight & METASLAB_ACTIVE_MASK));
2728
2729         /*
2730          * If the metaslab is loaded and we've not tried to load or allocate
2731          * from it in 'metaslab_unload_delay' txgs, then unload it.
2732          */
2733         if (msp->ms_loaded &&
2734             msp->ms_selected_txg + metaslab_unload_delay < txg) {
2735                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2736                         VERIFY0(range_tree_space(
2737                             msp->ms_allocating[(txg + t) & TXG_MASK]));
2738                 }
2739                 if (msp->ms_allocator != -1) {
2740                         metaslab_passivate(msp, msp->ms_weight &
2741                             ~METASLAB_ACTIVE_MASK);
2742                 }
2743
2744                 if (!metaslab_debug_unload)
2745                         metaslab_unload(msp);
2746         }
2747
2748         ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2749         ASSERT0(range_tree_space(msp->ms_freeing));
2750         ASSERT0(range_tree_space(msp->ms_freed));
2751         ASSERT0(range_tree_space(msp->ms_checkpointing));
2752
2753         mutex_exit(&msp->ms_lock);
2754 }
2755
2756 void
2757 metaslab_sync_reassess(metaslab_group_t *mg)
2758 {
2759         spa_t *spa = mg->mg_class->mc_spa;
2760
2761         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2762         metaslab_group_alloc_update(mg);
2763         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2764
2765         /*
2766          * Preload the next potential metaslabs but only on active
2767          * metaslab groups. We can get into a state where the metaslab
2768          * is no longer active since we dirty metaslabs as we remove a
2769          * a device, thus potentially making the metaslab group eligible
2770          * for preloading.
2771          */
2772         if (mg->mg_activation_count > 0) {
2773                 metaslab_group_preload(mg);
2774         }
2775         spa_config_exit(spa, SCL_ALLOC, FTAG);
2776 }
2777
2778 static uint64_t
2779 metaslab_distance(metaslab_t *msp, dva_t *dva)
2780 {
2781         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2782         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2783         uint64_t start = msp->ms_id;
2784
2785         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2786                 return (1ULL << 63);
2787
2788         if (offset < start)
2789                 return ((start - offset) << ms_shift);
2790         if (offset > start)
2791                 return ((offset - start) << ms_shift);
2792         return (0);
2793 }
2794
2795 /*
2796  * ==========================================================================
2797  * Metaslab allocation tracing facility
2798  * ==========================================================================
2799  */
2800 kstat_t *metaslab_trace_ksp;
2801 kstat_named_t metaslab_trace_over_limit;
2802
2803 void
2804 metaslab_alloc_trace_init(void)
2805 {
2806         ASSERT(metaslab_alloc_trace_cache == NULL);
2807         metaslab_alloc_trace_cache = kmem_cache_create(
2808             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2809             0, NULL, NULL, NULL, NULL, NULL, 0);
2810         metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2811             "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2812         if (metaslab_trace_ksp != NULL) {
2813                 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2814                 kstat_named_init(&metaslab_trace_over_limit,
2815                     "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2816                 kstat_install(metaslab_trace_ksp);
2817         }
2818 }
2819
2820 void
2821 metaslab_alloc_trace_fini(void)
2822 {
2823         if (metaslab_trace_ksp != NULL) {
2824                 kstat_delete(metaslab_trace_ksp);
2825                 metaslab_trace_ksp = NULL;
2826         }
2827         kmem_cache_destroy(metaslab_alloc_trace_cache);
2828         metaslab_alloc_trace_cache = NULL;
2829 }
2830
2831 /*
2832  * Add an allocation trace element to the allocation tracing list.
2833  */
2834 static void
2835 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2836     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
2837     int allocator)
2838 {
2839         if (!metaslab_trace_enabled)
2840                 return;
2841
2842         /*
2843          * When the tracing list reaches its maximum we remove
2844          * the second element in the list before adding a new one.
2845          * By removing the second element we preserve the original
2846          * entry as a clue to what allocations steps have already been
2847          * performed.
2848          */
2849         if (zal->zal_size == metaslab_trace_max_entries) {
2850                 metaslab_alloc_trace_t *mat_next;
2851 #ifdef DEBUG
2852                 panic("too many entries in allocation list");
2853 #endif
2854                 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2855                 zal->zal_size--;
2856                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2857                 list_remove(&zal->zal_list, mat_next);
2858                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2859         }
2860
2861         metaslab_alloc_trace_t *mat =
2862             kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2863         list_link_init(&mat->mat_list_node);
2864         mat->mat_mg = mg;
2865         mat->mat_msp = msp;
2866         mat->mat_size = psize;
2867         mat->mat_dva_id = dva_id;
2868         mat->mat_offset = offset;
2869         mat->mat_weight = 0;
2870         mat->mat_allocator = allocator;
2871
2872         if (msp != NULL)
2873                 mat->mat_weight = msp->ms_weight;
2874
2875         /*
2876          * The list is part of the zio so locking is not required. Only
2877          * a single thread will perform allocations for a given zio.
2878          */
2879         list_insert_tail(&zal->zal_list, mat);
2880         zal->zal_size++;
2881
2882         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2883 }
2884
2885 void
2886 metaslab_trace_init(zio_alloc_list_t *zal)
2887 {
2888         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2889             offsetof(metaslab_alloc_trace_t, mat_list_node));
2890         zal->zal_size = 0;
2891 }
2892
2893 void
2894 metaslab_trace_fini(zio_alloc_list_t *zal)
2895 {
2896         metaslab_alloc_trace_t *mat;
2897
2898         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2899                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2900         list_destroy(&zal->zal_list);
2901         zal->zal_size = 0;
2902 }
2903
2904 /*
2905  * ==========================================================================
2906  * Metaslab block operations
2907  * ==========================================================================
2908  */
2909
2910 static void
2911 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
2912     int allocator)
2913 {
2914         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2915             (flags & METASLAB_DONT_THROTTLE))
2916                 return;
2917
2918         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2919         if (!mg->mg_class->mc_alloc_throttle_enabled)
2920                 return;
2921
2922         (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
2923 }
2924
2925 static void
2926 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
2927 {
2928         uint64_t max = mg->mg_max_alloc_queue_depth;
2929         uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2930         while (cur < max) {
2931                 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
2932                     cur, cur + 1) == cur) {
2933                         atomic_inc_64(
2934                             &mg->mg_class->mc_alloc_max_slots[allocator]);
2935                         return;
2936                 }
2937                 cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2938         }
2939 }
2940
2941 void
2942 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
2943     int allocator, boolean_t io_complete)
2944 {
2945         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2946             (flags & METASLAB_DONT_THROTTLE))
2947                 return;
2948
2949         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2950         if (!mg->mg_class->mc_alloc_throttle_enabled)
2951                 return;
2952
2953         (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
2954         if (io_complete)
2955                 metaslab_group_increment_qdepth(mg, allocator);
2956 }
2957
2958 void
2959 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
2960     int allocator)
2961 {
2962 #ifdef ZFS_DEBUG
2963         const dva_t *dva = bp->blk_dva;
2964         int ndvas = BP_GET_NDVAS(bp);
2965
2966         for (int d = 0; d < ndvas; d++) {
2967                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2968                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2969                 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
2970                     tag));
2971         }
2972 #endif
2973 }
2974
2975 static uint64_t
2976 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2977 {
2978         uint64_t start;
2979         range_tree_t *rt = msp->ms_allocatable;
2980         metaslab_class_t *mc = msp->ms_group->mg_class;
2981
2982         VERIFY(!msp->ms_condensing);
2983
2984         start = mc->mc_ops->msop_alloc(msp, size);
2985         if (start != -1ULL) {
2986                 metaslab_group_t *mg = msp->ms_group;
2987                 vdev_t *vd = mg->mg_vd;
2988
2989                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2990                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2991                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2992                 range_tree_remove(rt, start, size);
2993
2994                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
2995                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2996
2997                 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
2998
2999                 /* Track the last successful allocation */
3000                 msp->ms_alloc_txg = txg;
3001                 metaslab_verify_space(msp, txg);
3002         }
3003
3004         /*
3005          * Now that we've attempted the allocation we need to update the
3006          * metaslab's maximum block size since it may have changed.
3007          */
3008         msp->ms_max_size = metaslab_block_maxsize(msp);
3009         return (start);
3010 }
3011
3012 /*
3013  * Find the metaslab with the highest weight that is less than what we've
3014  * already tried.  In the common case, this means that we will examine each
3015  * metaslab at most once. Note that concurrent callers could reorder metaslabs
3016  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
3017  * activated by another thread, and we fail to allocate from the metaslab we
3018  * have selected, we may not try the newly-activated metaslab, and instead
3019  * activate another metaslab.  This is not optimal, but generally does not cause
3020  * any problems (a possible exception being if every metaslab is completely full
3021  * except for the the newly-activated metaslab which we fail to examine).
3022  */
3023 static metaslab_t *
3024 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
3025     dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
3026     zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
3027 {
3028         avl_index_t idx;
3029         avl_tree_t *t = &mg->mg_metaslab_tree;
3030         metaslab_t *msp = avl_find(t, search, &idx);
3031         if (msp == NULL)
3032                 msp = avl_nearest(t, idx, AVL_AFTER);
3033
3034         for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
3035                 int i;
3036                 if (!metaslab_should_allocate(msp, asize)) {
3037                         metaslab_trace_add(zal, mg, msp, asize, d,
3038                             TRACE_TOO_SMALL, allocator);
3039                         continue;
3040                 }
3041
3042                 /*
3043                  * If the selected metaslab is condensing, skip it.
3044                  */
3045                 if (msp->ms_condensing)
3046                         continue;
3047
3048                 *was_active = msp->ms_allocator != -1;
3049                 /*
3050                  * If we're activating as primary, this is our first allocation
3051                  * from this disk, so we don't need to check how close we are.
3052                  * If the metaslab under consideration was already active,
3053                  * we're getting desperate enough to steal another allocator's
3054                  * metaslab, so we still don't care about distances.
3055                  */
3056                 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
3057                         break;
3058
3059                 uint64_t target_distance = min_distance
3060                     + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
3061                     min_distance >> 1);
3062
3063                 for (i = 0; i < d; i++) {
3064                         if (metaslab_distance(msp, &dva[i]) < target_distance)
3065                                 break;
3066                 }
3067                 if (i == d)
3068                         break;
3069         }
3070
3071         if (msp != NULL) {
3072                 search->ms_weight = msp->ms_weight;
3073                 search->ms_start = msp->ms_start + 1;
3074                 search->ms_allocator = msp->ms_allocator;
3075                 search->ms_primary = msp->ms_primary;
3076         }
3077         return (msp);
3078 }
3079
3080 /* ARGSUSED */
3081 static uint64_t
3082 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
3083     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
3084     int allocator)
3085 {
3086         metaslab_t *msp = NULL;
3087         uint64_t offset = -1ULL;
3088         uint64_t activation_weight;
3089         boolean_t tertiary = B_FALSE;
3090
3091         activation_weight = METASLAB_WEIGHT_PRIMARY;
3092         for (int i = 0; i < d; i++) {
3093                 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3094                     DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3095                         activation_weight = METASLAB_WEIGHT_SECONDARY;
3096                 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3097                     DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3098                         tertiary = B_TRUE;
3099                         break;
3100                 }
3101         }
3102
3103         /*
3104          * If we don't have enough metaslabs active to fill the entire array, we
3105          * just use the 0th slot.
3106          */
3107         if (mg->mg_ms_ready < mg->mg_allocators * 2) {
3108                 tertiary = B_FALSE;
3109                 allocator = 0;
3110         }
3111
3112         ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
3113
3114         metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
3115         search->ms_weight = UINT64_MAX;
3116         search->ms_start = 0;
3117         /*
3118          * At the end of the metaslab tree are the already-active metaslabs,
3119          * first the primaries, then the secondaries. When we resume searching
3120          * through the tree, we need to consider ms_allocator and ms_primary so
3121          * we start in the location right after where we left off, and don't
3122          * accidentally loop forever considering the same metaslabs.
3123          */
3124         search->ms_allocator = -1;
3125         search->ms_primary = B_TRUE;
3126         for (;;) {
3127                 boolean_t was_active = B_FALSE;
3128
3129                 mutex_enter(&mg->mg_lock);
3130
3131                 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3132                     mg->mg_primaries[allocator] != NULL) {
3133                         msp = mg->mg_primaries[allocator];
3134                         was_active = B_TRUE;
3135                 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3136                     mg->mg_secondaries[allocator] != NULL && !tertiary) {
3137                         msp = mg->mg_secondaries[allocator];
3138                         was_active = B_TRUE;
3139                 } else {
3140                         msp = find_valid_metaslab(mg, activation_weight, dva, d,
3141                             min_distance, asize, allocator, zal, search,
3142                             &was_active);
3143                 }
3144
3145                 mutex_exit(&mg->mg_lock);
3146                 if (msp == NULL) {
3147                         kmem_free(search, sizeof (*search));
3148                         return (-1ULL);
3149                 }
3150
3151                 mutex_enter(&msp->ms_lock);
3152                 /*
3153                  * Ensure that the metaslab we have selected is still
3154                  * capable of handling our request. It's possible that
3155                  * another thread may have changed the weight while we
3156                  * were blocked on the metaslab lock. We check the
3157                  * active status first to see if we need to reselect
3158                  * a new metaslab.
3159                  */
3160                 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
3161                         mutex_exit(&msp->ms_lock);
3162                         continue;
3163                 }
3164
3165                 /*
3166                  * If the metaslab is freshly activated for an allocator that
3167                  * isn't the one we're allocating from, or if it's a primary and
3168                  * we're seeking a secondary (or vice versa), we go back and
3169                  * select a new metaslab.
3170                  */
3171                 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3172                     (msp->ms_allocator != -1) &&
3173                     (msp->ms_allocator != allocator || ((activation_weight ==
3174                     METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
3175                         mutex_exit(&msp->ms_lock);
3176                         continue;
3177                 }
3178
3179                 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3180                         metaslab_passivate(msp, msp->ms_weight &
3181                             ~METASLAB_WEIGHT_CLAIM);
3182                         mutex_exit(&msp->ms_lock);
3183                         continue;
3184                 }
3185
3186                 if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3187                         mutex_exit(&msp->ms_lock);
3188                         continue;
3189                 }
3190
3191                 msp->ms_selected_txg = txg;
3192
3193                 /*
3194                  * Now that we have the lock, recheck to see if we should
3195                  * continue to use this metaslab for this allocation. The
3196                  * the metaslab is now loaded so metaslab_should_allocate() can
3197                  * accurately determine if the allocation attempt should
3198                  * proceed.
3199                  */
3200                 if (!metaslab_should_allocate(msp, asize)) {
3201                         /* Passivate this metaslab and select a new one. */
3202                         metaslab_trace_add(zal, mg, msp, asize, d,
3203                             TRACE_TOO_SMALL, allocator);
3204                         goto next;
3205                 }
3206
3207                 /*
3208                  * If this metaslab is currently condensing then pick again as
3209                  * we can't manipulate this metaslab until it's committed
3210                  * to disk.
3211                  */
3212                 if (msp->ms_condensing) {
3213                         metaslab_trace_add(zal, mg, msp, asize, d,
3214                             TRACE_CONDENSING, allocator);
3215                         metaslab_passivate(msp, msp->ms_weight &
3216                             ~METASLAB_ACTIVE_MASK);
3217                         mutex_exit(&msp->ms_lock);
3218                         continue;
3219                 }
3220
3221                 offset = metaslab_block_alloc(msp, asize, txg);
3222                 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
3223
3224                 if (offset != -1ULL) {
3225                         /* Proactively passivate the metaslab, if needed */
3226                         metaslab_segment_may_passivate(msp);
3227                         break;
3228                 }
3229 next:
3230                 ASSERT(msp->ms_loaded);
3231
3232                 /*
3233                  * We were unable to allocate from this metaslab so determine
3234                  * a new weight for this metaslab. Now that we have loaded
3235                  * the metaslab we can provide a better hint to the metaslab
3236                  * selector.
3237                  *
3238                  * For space-based metaslabs, we use the maximum block size.
3239                  * This information is only available when the metaslab
3240                  * is loaded and is more accurate than the generic free
3241                  * space weight that was calculated by metaslab_weight().
3242                  * This information allows us to quickly compare the maximum
3243                  * available allocation in the metaslab to the allocation
3244                  * size being requested.
3245                  *
3246                  * For segment-based metaslabs, determine the new weight
3247                  * based on the highest bucket in the range tree. We
3248                  * explicitly use the loaded segment weight (i.e. the range
3249                  * tree histogram) since it contains the space that is
3250                  * currently available for allocation and is accurate
3251                  * even within a sync pass.
3252                  */
3253                 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3254                         uint64_t weight = metaslab_block_maxsize(msp);
3255                         WEIGHT_SET_SPACEBASED(weight);
3256                         metaslab_passivate(msp, weight);
3257                 } else {
3258                         metaslab_passivate(msp,
3259                             metaslab_weight_from_range_tree(msp));
3260                 }
3261
3262                 /*
3263                  * We have just failed an allocation attempt, check
3264                  * that metaslab_should_allocate() agrees. Otherwise,
3265                  * we may end up in an infinite loop retrying the same
3266                  * metaslab.
3267                  */
3268                 ASSERT(!metaslab_should_allocate(msp, asize));
3269                 mutex_exit(&msp->ms_lock);
3270         }
3271         mutex_exit(&msp->ms_lock);
3272         kmem_free(search, sizeof (*search));
3273         return (offset);
3274 }
3275
3276 static uint64_t
3277 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3278     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
3279     int allocator)
3280 {
3281         uint64_t offset;
3282         ASSERT(mg->mg_initialized);
3283
3284         offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
3285             min_distance, dva, d, allocator);
3286
3287         mutex_enter(&mg->mg_lock);
3288         if (offset == -1ULL) {
3289                 mg->mg_failed_allocations++;
3290                 metaslab_trace_add(zal, mg, NULL, asize, d,
3291                     TRACE_GROUP_FAILURE, allocator);
3292                 if (asize == SPA_GANGBLOCKSIZE) {
3293                         /*
3294                          * This metaslab group was unable to allocate
3295                          * the minimum gang block size so it must be out of
3296                          * space. We must notify the allocation throttle
3297                          * to start skipping allocation attempts to this
3298                          * metaslab group until more space becomes available.
3299                          * Note: this failure cannot be caused by the
3300                          * allocation throttle since the allocation throttle
3301                          * is only responsible for skipping devices and
3302                          * not failing block allocations.
3303                          */
3304                         mg->mg_no_free_space = B_TRUE;
3305                 }
3306         }
3307         mg->mg_allocations++;
3308         mutex_exit(&mg->mg_lock);
3309         return (offset);
3310 }
3311
3312 /*
3313  * If we have to write a ditto block (i.e. more than one DVA for a given BP)
3314  * on the same vdev as an existing DVA of this BP, then try to allocate it
3315  * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
3316  * existing DVAs.
3317  */
3318 int ditto_same_vdev_distance_shift = 3;
3319
3320 /*
3321  * Allocate a block for the specified i/o.
3322  */
3323 int
3324 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3325     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3326     zio_alloc_list_t *zal, int allocator)
3327 {
3328         metaslab_group_t *mg, *rotor;
3329         vdev_t *vd;
3330         boolean_t try_hard = B_FALSE;
3331
3332         ASSERT(!DVA_IS_VALID(&dva[d]));
3333
3334         /*
3335          * For testing, make some blocks above a certain size be gang blocks.
3336          */
3337         if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3338                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
3339                     allocator);
3340                 return (SET_ERROR(ENOSPC));
3341         }
3342
3343         /*
3344          * Start at the rotor and loop through all mgs until we find something.
3345          * Note that there's no locking on mc_rotor or mc_aliquot because
3346          * nothing actually breaks if we miss a few updates -- we just won't
3347          * allocate quite as evenly.  It all balances out over time.
3348          *
3349          * If we are doing ditto or log blocks, try to spread them across
3350          * consecutive vdevs.  If we're forced to reuse a vdev before we've
3351          * allocated all of our ditto blocks, then try and spread them out on
3352          * that vdev as much as possible.  If it turns out to not be possible,
3353          * gradually lower our standards until anything becomes acceptable.
3354          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3355          * gives us hope of containing our fault domains to something we're
3356          * able to reason about.  Otherwise, any two top-level vdev failures
3357          * will guarantee the loss of data.  With consecutive allocation,
3358          * only two adjacent top-level vdev failures will result in data loss.
3359          *
3360          * If we are doing gang blocks (hintdva is non-NULL), try to keep
3361          * ourselves on the same vdev as our gang block header.  That
3362          * way, we can hope for locality in vdev_cache, plus it makes our
3363          * fault domains something tractable.
3364          */
3365         if (hintdva) {
3366                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3367
3368                 /*
3369                  * It's possible the vdev we're using as the hint no
3370                  * longer exists or its mg has been closed (e.g. by
3371                  * device removal).  Consult the rotor when
3372                  * all else fails.
3373                  */
3374                 if (vd != NULL && vd->vdev_mg != NULL) {
3375                         mg = vd->vdev_mg;
3376
3377                         if (flags & METASLAB_HINTBP_AVOID &&
3378                             mg->mg_next != NULL)
3379                                 mg = mg->mg_next;
3380                 } else {
3381                         mg = mc->mc_rotor;
3382                 }
3383         } else if (d != 0) {
3384                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3385                 mg = vd->vdev_mg->mg_next;
3386         } else {
3387                 mg = mc->mc_rotor;
3388         }
3389
3390         /*
3391          * If the hint put us into the wrong metaslab class, or into a
3392          * metaslab group that has been passivated, just follow the rotor.
3393          */
3394         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3395                 mg = mc->mc_rotor;
3396
3397         rotor = mg;
3398 top:
3399         do {
3400                 boolean_t allocatable;
3401
3402                 ASSERT(mg->mg_activation_count == 1);
3403                 vd = mg->mg_vd;
3404
3405                 /*
3406                  * Don't allocate from faulted devices.
3407                  */
3408                 if (try_hard) {
3409                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3410                         allocatable = vdev_allocatable(vd);
3411                         spa_config_exit(spa, SCL_ZIO, FTAG);
3412                 } else {
3413                         allocatable = vdev_allocatable(vd);
3414                 }
3415
3416                 /*
3417                  * Determine if the selected metaslab group is eligible
3418                  * for allocations. If we're ganging then don't allow
3419                  * this metaslab group to skip allocations since that would
3420                  * inadvertently return ENOSPC and suspend the pool
3421                  * even though space is still available.
3422                  */
3423                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3424                         allocatable = metaslab_group_allocatable(mg, rotor,
3425                             psize, allocator);
3426                 }
3427
3428                 if (!allocatable) {
3429                         metaslab_trace_add(zal, mg, NULL, psize, d,
3430                             TRACE_NOT_ALLOCATABLE, allocator);
3431                         goto next;
3432                 }
3433
3434                 ASSERT(mg->mg_initialized);
3435
3436                 /*
3437                  * Avoid writing single-copy data to a failing,
3438                  * non-redundant vdev, unless we've already tried all
3439                  * other vdevs.
3440                  */
3441                 if ((vd->vdev_stat.vs_write_errors > 0 ||
3442                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
3443                     d == 0 && !try_hard && vd->vdev_children == 0) {
3444                         metaslab_trace_add(zal, mg, NULL, psize, d,
3445                             TRACE_VDEV_ERROR, allocator);
3446                         goto next;
3447                 }
3448
3449                 ASSERT(mg->mg_class == mc);
3450
3451                 /*
3452                  * If we don't need to try hard, then require that the
3453                  * block be 1/8th of the device away from any other DVAs
3454                  * in this BP.  If we are trying hard, allow any offset
3455                  * to be used (distance=0).
3456                  */
3457                 uint64_t distance = 0;
3458                 if (!try_hard) {
3459                         distance = vd->vdev_asize >>
3460                             ditto_same_vdev_distance_shift;
3461                         if (distance <= (1ULL << vd->vdev_ms_shift))
3462                                 distance = 0;
3463                 }
3464
3465                 uint64_t asize = vdev_psize_to_asize(vd, psize);
3466                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3467
3468                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3469                     distance, dva, d, allocator);
3470
3471                 if (offset != -1ULL) {
3472                         /*
3473                          * If we've just selected this metaslab group,
3474                          * figure out whether the corresponding vdev is
3475                          * over- or under-used relative to the pool,
3476                          * and set an allocation bias to even it out.
3477                          */
3478                         if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3479                                 vdev_stat_t *vs = &vd->vdev_stat;
3480                                 int64_t vu, cu;
3481
3482                                 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3483                                 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3484
3485                                 /*
3486                                  * Calculate how much more or less we should
3487                                  * try to allocate from this device during
3488                                  * this iteration around the rotor.
3489                                  * For example, if a device is 80% full
3490                                  * and the pool is 20% full then we should
3491                                  * reduce allocations by 60% on this device.
3492                                  *
3493                                  * mg_bias = (20 - 80) * 512K / 100 = -307K
3494                                  *
3495                                  * This reduces allocations by 307K for this
3496                                  * iteration.
3497                                  */
3498                                 mg->mg_bias = ((cu - vu) *
3499                                     (int64_t)mg->mg_aliquot) / 100;
3500                         } else if (!metaslab_bias_enabled) {
3501                                 mg->mg_bias = 0;
3502                         }
3503
3504                         if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3505                             mg->mg_aliquot + mg->mg_bias) {
3506                                 mc->mc_rotor = mg->mg_next;
3507                                 mc->mc_aliquot = 0;
3508                         }
3509
3510                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
3511                         DVA_SET_OFFSET(&dva[d], offset);
3512                         DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3513                         DVA_SET_ASIZE(&dva[d], asize);
3514
3515                         return (0);
3516                 }
3517 next:
3518                 mc->mc_rotor = mg->mg_next;
3519                 mc->mc_aliquot = 0;
3520         } while ((mg = mg->mg_next) != rotor);
3521
3522         /*
3523          * If we haven't tried hard, do so now.
3524          */
3525         if (!try_hard) {
3526                 try_hard = B_TRUE;
3527                 goto top;
3528         }
3529
3530         bzero(&dva[d], sizeof (dva_t));
3531
3532         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
3533         return (SET_ERROR(ENOSPC));
3534 }
3535
3536 void
3537 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3538     boolean_t checkpoint)
3539 {
3540         metaslab_t *msp;
3541         spa_t *spa = vd->vdev_spa;
3542
3543         ASSERT(vdev_is_concrete(vd));
3544         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3545         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3546
3547         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3548
3549         VERIFY(!msp->ms_condensing);
3550         VERIFY3U(offset, >=, msp->ms_start);
3551         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3552         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3553         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3554
3555         metaslab_check_free_impl(vd, offset, asize);
3556
3557         mutex_enter(&msp->ms_lock);
3558         if (range_tree_is_empty(msp->ms_freeing) &&
3559             range_tree_is_empty(msp->ms_checkpointing)) {
3560                 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
3561         }
3562
3563         if (checkpoint) {
3564                 ASSERT(spa_has_checkpoint(spa));
3565                 range_tree_add(msp->ms_checkpointing, offset, asize);
3566         } else {
3567                 range_tree_add(msp->ms_freeing, offset, asize);
3568         }
3569         mutex_exit(&msp->ms_lock);
3570 }
3571
3572 /* ARGSUSED */
3573 void
3574 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3575     uint64_t size, void *arg)
3576 {
3577         boolean_t *checkpoint = arg;
3578
3579         ASSERT3P(checkpoint, !=, NULL);
3580
3581         if (vd->vdev_ops->vdev_op_remap != NULL)
3582                 vdev_indirect_mark_obsolete(vd, offset, size);
3583         else
3584                 metaslab_free_impl(vd, offset, size, *checkpoint);
3585 }
3586
3587 static void
3588 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3589     boolean_t checkpoint)
3590 {
3591         spa_t *spa = vd->vdev_spa;
3592
3593         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3594
3595         if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
3596                 return;
3597
3598         if (spa->spa_vdev_removal != NULL &&
3599             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
3600             vdev_is_concrete(vd)) {
3601                 /*
3602                  * Note: we check if the vdev is concrete because when
3603                  * we complete the removal, we first change the vdev to be
3604                  * an indirect vdev (in open context), and then (in syncing
3605                  * context) clear spa_vdev_removal.
3606                  */
3607                 free_from_removing_vdev(vd, offset, size);
3608         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
3609                 vdev_indirect_mark_obsolete(vd, offset, size);
3610                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3611                     metaslab_free_impl_cb, &checkpoint);
3612         } else {
3613                 metaslab_free_concrete(vd, offset, size, checkpoint);
3614         }
3615 }
3616
3617 typedef struct remap_blkptr_cb_arg {
3618         blkptr_t *rbca_bp;
3619         spa_remap_cb_t rbca_cb;
3620         vdev_t *rbca_remap_vd;
3621         uint64_t rbca_remap_offset;
3622         void *rbca_cb_arg;
3623 } remap_blkptr_cb_arg_t;
3624
3625 void
3626 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3627     uint64_t size, void *arg)
3628 {
3629         remap_blkptr_cb_arg_t *rbca = arg;
3630         blkptr_t *bp = rbca->rbca_bp;
3631
3632         /* We can not remap split blocks. */
3633         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3634                 return;
3635         ASSERT0(inner_offset);
3636
3637         if (rbca->rbca_cb != NULL) {
3638                 /*
3639                  * At this point we know that we are not handling split
3640                  * blocks and we invoke the callback on the previous
3641                  * vdev which must be indirect.
3642                  */
3643                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
3644
3645                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
3646                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
3647
3648                 /* set up remap_blkptr_cb_arg for the next call */
3649                 rbca->rbca_remap_vd = vd;
3650                 rbca->rbca_remap_offset = offset;
3651         }
3652
3653         /*
3654          * The phys birth time is that of dva[0].  This ensures that we know
3655          * when each dva was written, so that resilver can determine which
3656          * blocks need to be scrubbed (i.e. those written during the time
3657          * the vdev was offline).  It also ensures that the key used in
3658          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
3659          * we didn't change the phys_birth, a lookup in the ARC for a
3660          * remapped BP could find the data that was previously stored at
3661          * this vdev + offset.
3662          */
3663         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
3664             DVA_GET_VDEV(&bp->blk_dva[0]));
3665         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
3666         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
3667             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
3668
3669         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
3670         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
3671 }
3672
3673 /*
3674  * If the block pointer contains any indirect DVAs, modify them to refer to
3675  * concrete DVAs.  Note that this will sometimes not be possible, leaving
3676  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
3677  * segments in the mapping (i.e. it is a "split block").
3678  *
3679  * If the BP was remapped, calls the callback on the original dva (note the
3680  * callback can be called multiple times if the original indirect DVA refers
3681  * to another indirect DVA, etc).
3682  *
3683  * Returns TRUE if the BP was remapped.
3684  */
3685 boolean_t
3686 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
3687 {
3688         remap_blkptr_cb_arg_t rbca;
3689
3690         if (!zfs_remap_blkptr_enable)
3691                 return (B_FALSE);
3692
3693         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
3694                 return (B_FALSE);
3695
3696         /*
3697          * Dedup BP's can not be remapped, because ddt_phys_select() depends
3698          * on DVA[0] being the same in the BP as in the DDT (dedup table).
3699          */
3700         if (BP_GET_DEDUP(bp))
3701                 return (B_FALSE);
3702
3703         /*
3704          * Gang blocks can not be remapped, because
3705          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
3706          * the BP used to read the gang block header (GBH) being the same
3707          * as the DVA[0] that we allocated for the GBH.
3708          */
3709         if (BP_IS_GANG(bp))
3710                 return (B_FALSE);
3711
3712         /*
3713          * Embedded BP's have no DVA to remap.
3714          */
3715         if (BP_GET_NDVAS(bp) < 1)
3716                 return (B_FALSE);
3717
3718         /*
3719          * Note: we only remap dva[0].  If we remapped other dvas, we
3720          * would no longer know what their phys birth txg is.
3721          */
3722         dva_t *dva = &bp->blk_dva[0];
3723
3724         uint64_t offset = DVA_GET_OFFSET(dva);
3725         uint64_t size = DVA_GET_ASIZE(dva);
3726         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3727
3728         if (vd->vdev_ops->vdev_op_remap == NULL)
3729                 return (B_FALSE);
3730
3731         rbca.rbca_bp = bp;
3732         rbca.rbca_cb = callback;
3733         rbca.rbca_remap_vd = vd;
3734         rbca.rbca_remap_offset = offset;
3735         rbca.rbca_cb_arg = arg;
3736
3737         /*
3738          * remap_blkptr_cb() will be called in order for each level of
3739          * indirection, until a concrete vdev is reached or a split block is
3740          * encountered. old_vd and old_offset are updated within the callback
3741          * as we go from the one indirect vdev to the next one (either concrete
3742          * or indirect again) in that order.
3743          */
3744         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
3745
3746         /* Check if the DVA wasn't remapped because it is a split block */
3747         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
3748                 return (B_FALSE);
3749
3750         return (B_TRUE);
3751 }
3752
3753 /*
3754  * Undo the allocation of a DVA which happened in the given transaction group.
3755  */
3756 void
3757 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3758 {
3759         metaslab_t *msp;
3760         vdev_t *vd;
3761         uint64_t vdev = DVA_GET_VDEV(dva);
3762         uint64_t offset = DVA_GET_OFFSET(dva);
3763         uint64_t size = DVA_GET_ASIZE(dva);
3764
3765         ASSERT(DVA_IS_VALID(dva));
3766         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3767
3768         if (txg > spa_freeze_txg(spa))
3769                 return;
3770
3771         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3772             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3773                 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3774                     (u_longlong_t)vdev, (u_longlong_t)offset);
3775                 ASSERT(0);
3776                 return;
3777         }
3778
3779         ASSERT(!vd->vdev_removing);
3780         ASSERT(vdev_is_concrete(vd));
3781         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3782         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
3783
3784         if (DVA_GET_GANG(dva))
3785                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3786
3787         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3788
3789         mutex_enter(&msp->ms_lock);
3790         range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
3791             offset, size);
3792
3793         VERIFY(!msp->ms_condensing);
3794         VERIFY3U(offset, >=, msp->ms_start);
3795         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3796         VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
3797             msp->ms_size);
3798         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3799         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3800         range_tree_add(msp->ms_allocatable, offset, size);
3801         mutex_exit(&msp->ms_lock);
3802 }
3803
3804 /*
3805  * Free the block represented by the given DVA.
3806  */
3807 void
3808 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
3809 {
3810         uint64_t vdev = DVA_GET_VDEV(dva);
3811         uint64_t offset = DVA_GET_OFFSET(dva);
3812         uint64_t size = DVA_GET_ASIZE(dva);
3813         vdev_t *vd = vdev_lookup_top(spa, vdev);
3814
3815         ASSERT(DVA_IS_VALID(dva));
3816         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3817
3818         if (DVA_GET_GANG(dva)) {
3819                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3820         }
3821
3822         metaslab_free_impl(vd, offset, size, checkpoint);
3823 }
3824
3825 /*
3826  * Reserve some allocation slots. The reservation system must be called
3827  * before we call into the allocator. If there aren't any available slots
3828  * then the I/O will be throttled until an I/O completes and its slots are
3829  * freed up. The function returns true if it was successful in placing
3830  * the reservation.
3831  */
3832 boolean_t
3833 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
3834     zio_t *zio, int flags)
3835 {
3836         uint64_t available_slots = 0;
3837         boolean_t slot_reserved = B_FALSE;
3838         uint64_t max = mc->mc_alloc_max_slots[allocator];
3839
3840         ASSERT(mc->mc_alloc_throttle_enabled);
3841         mutex_enter(&mc->mc_lock);
3842
3843         uint64_t reserved_slots =
3844             refcount_count(&mc->mc_alloc_slots[allocator]);
3845         if (reserved_slots < max)
3846                 available_slots = max - reserved_slots;
3847
3848         if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3849                 /*
3850                  * We reserve the slots individually so that we can unreserve
3851                  * them individually when an I/O completes.
3852                  */
3853                 for (int d = 0; d < slots; d++) {
3854                         reserved_slots =
3855                             refcount_add(&mc->mc_alloc_slots[allocator],
3856                             zio);
3857                 }
3858                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3859                 slot_reserved = B_TRUE;
3860         }
3861
3862         mutex_exit(&mc->mc_lock);
3863         return (slot_reserved);
3864 }
3865
3866 void
3867 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
3868     int allocator, zio_t *zio)
3869 {
3870         ASSERT(mc->mc_alloc_throttle_enabled);
3871         mutex_enter(&mc->mc_lock);
3872         for (int d = 0; d < slots; d++) {
3873                 (void) refcount_remove(&mc->mc_alloc_slots[allocator],
3874                     zio);
3875         }
3876         mutex_exit(&mc->mc_lock);
3877 }
3878
3879 static int
3880 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3881     uint64_t txg)
3882 {
3883         metaslab_t *msp;
3884         spa_t *spa = vd->vdev_spa;
3885         int error = 0;
3886
3887         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
3888                 return (ENXIO);
3889
3890         ASSERT3P(vd->vdev_ms, !=, NULL);
3891         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3892
3893         mutex_enter(&msp->ms_lock);
3894
3895         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3896                 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
3897         /*
3898          * No need to fail in that case; someone else has activated the
3899          * metaslab, but that doesn't preclude us from using it.
3900          */
3901         if (error == EBUSY)
3902                 error = 0;
3903
3904         if (error == 0 &&
3905             !range_tree_contains(msp->ms_allocatable, offset, size))
3906                 error = SET_ERROR(ENOENT);
3907
3908         if (error || txg == 0) {        /* txg == 0 indicates dry run */
3909                 mutex_exit(&msp->ms_lock);
3910                 return (error);
3911         }
3912
3913         VERIFY(!msp->ms_condensing);
3914         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3915         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3916         VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
3917             msp->ms_size);
3918         range_tree_remove(msp->ms_allocatable, offset, size);
3919
3920         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
3921                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
3922                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
3923                 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
3924                     offset, size);
3925         }
3926
3927         mutex_exit(&msp->ms_lock);
3928
3929         return (0);
3930 }
3931
3932 typedef struct metaslab_claim_cb_arg_t {
3933         uint64_t        mcca_txg;
3934         int             mcca_error;
3935 } metaslab_claim_cb_arg_t;
3936
3937 /* ARGSUSED */
3938 static void
3939 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3940     uint64_t size, void *arg)
3941 {
3942         metaslab_claim_cb_arg_t *mcca_arg = arg;
3943
3944         if (mcca_arg->mcca_error == 0) {
3945                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
3946                     size, mcca_arg->mcca_txg);
3947         }
3948 }
3949
3950 int
3951 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
3952 {
3953         if (vd->vdev_ops->vdev_op_remap != NULL) {
3954                 metaslab_claim_cb_arg_t arg;
3955
3956                 /*
3957                  * Only zdb(1M) can claim on indirect vdevs.  This is used
3958                  * to detect leaks of mapped space (that are not accounted
3959                  * for in the obsolete counts, spacemap, or bpobj).
3960                  */
3961                 ASSERT(!spa_writeable(vd->vdev_spa));
3962                 arg.mcca_error = 0;
3963                 arg.mcca_txg = txg;
3964
3965                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3966                     metaslab_claim_impl_cb, &arg);
3967
3968                 if (arg.mcca_error == 0) {
3969                         arg.mcca_error = metaslab_claim_concrete(vd,
3970                             offset, size, txg);
3971                 }
3972                 return (arg.mcca_error);
3973         } else {
3974                 return (metaslab_claim_concrete(vd, offset, size, txg));
3975         }
3976 }
3977
3978 /*
3979  * Intent log support: upon opening the pool after a crash, notify the SPA
3980  * of blocks that the intent log has allocated for immediate write, but
3981  * which are still considered free by the SPA because the last transaction
3982  * group didn't commit yet.
3983  */
3984 static int
3985 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3986 {
3987         uint64_t vdev = DVA_GET_VDEV(dva);
3988         uint64_t offset = DVA_GET_OFFSET(dva);
3989         uint64_t size = DVA_GET_ASIZE(dva);
3990         vdev_t *vd;
3991
3992         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
3993                 return (SET_ERROR(ENXIO));
3994         }
3995
3996         ASSERT(DVA_IS_VALID(dva));
3997
3998         if (DVA_GET_GANG(dva))
3999                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4000
4001         return (metaslab_claim_impl(vd, offset, size, txg));
4002 }
4003
4004 int
4005 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4006     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4007     zio_alloc_list_t *zal, zio_t *zio, int allocator)
4008 {
4009         dva_t *dva = bp->blk_dva;
4010         dva_t *hintdva = hintbp->blk_dva;
4011         int error = 0;
4012
4013         ASSERT(bp->blk_birth == 0);
4014         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4015
4016         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4017
4018         if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
4019                 spa_config_exit(spa, SCL_ALLOC, FTAG);
4020                 return (SET_ERROR(ENOSPC));
4021         }
4022
4023         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4024         ASSERT(BP_GET_NDVAS(bp) == 0);
4025         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4026         ASSERT3P(zal, !=, NULL);
4027
4028         for (int d = 0; d < ndvas; d++) {
4029                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4030                     txg, flags, zal, allocator);
4031                 if (error != 0) {
4032                         for (d--; d >= 0; d--) {
4033                                 metaslab_unalloc_dva(spa, &dva[d], txg);
4034                                 metaslab_group_alloc_decrement(spa,
4035                                     DVA_GET_VDEV(&dva[d]), zio, flags,
4036                                     allocator, B_FALSE);
4037                                 bzero(&dva[d], sizeof (dva_t));
4038                         }
4039                         spa_config_exit(spa, SCL_ALLOC, FTAG);
4040                         return (error);
4041                 } else {
4042                         /*
4043                          * Update the metaslab group's queue depth
4044                          * based on the newly allocated dva.
4045                          */
4046                         metaslab_group_alloc_increment(spa,
4047                             DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
4048                 }
4049
4050         }
4051         ASSERT(error == 0);
4052         ASSERT(BP_GET_NDVAS(bp) == ndvas);
4053
4054         spa_config_exit(spa, SCL_ALLOC, FTAG);
4055
4056         BP_SET_BIRTH(bp, txg, txg);
4057
4058         return (0);
4059 }
4060
4061 void
4062 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
4063 {
4064         const dva_t *dva = bp->blk_dva;
4065         int ndvas = BP_GET_NDVAS(bp);
4066
4067         ASSERT(!BP_IS_HOLE(bp));
4068         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
4069
4070         /*
4071          * If we have a checkpoint for the pool we need to make sure that
4072          * the blocks that we free that are part of the checkpoint won't be
4073          * reused until the checkpoint is discarded or we revert to it.
4074          *
4075          * The checkpoint flag is passed down the metaslab_free code path
4076          * and is set whenever we want to add a block to the checkpoint's
4077          * accounting. That is, we "checkpoint" blocks that existed at the
4078          * time the checkpoint was created and are therefore referenced by
4079          * the checkpointed uberblock.
4080          *
4081          * Note that, we don't checkpoint any blocks if the current
4082          * syncing txg <= spa_checkpoint_txg. We want these frees to sync
4083          * normally as they will be referenced by the checkpointed uberblock.
4084          */
4085         boolean_t checkpoint = B_FALSE;
4086         if (bp->blk_birth <= spa->spa_checkpoint_txg &&
4087             spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
4088                 /*
4089                  * At this point, if the block is part of the checkpoint
4090                  * there is no way it was created in the current txg.
4091                  */
4092                 ASSERT(!now);
4093                 ASSERT3U(spa_syncing_txg(spa), ==, txg);
4094                 checkpoint = B_TRUE;
4095         }
4096
4097         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
4098
4099         for (int d = 0; d < ndvas; d++) {
4100                 if (now) {
4101                         metaslab_unalloc_dva(spa, &dva[d], txg);
4102                 } else {
4103                         ASSERT3U(txg, ==, spa_syncing_txg(spa));
4104                         metaslab_free_dva(spa, &dva[d], checkpoint);
4105                 }
4106         }
4107
4108         spa_config_exit(spa, SCL_FREE, FTAG);
4109 }
4110
4111 int
4112 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
4113 {
4114         const dva_t *dva = bp->blk_dva;
4115         int ndvas = BP_GET_NDVAS(bp);
4116         int error = 0;
4117
4118         ASSERT(!BP_IS_HOLE(bp));
4119
4120         if (txg != 0) {
4121                 /*
4122                  * First do a dry run to make sure all DVAs are claimable,
4123                  * so we don't have to unwind from partial failures below.
4124                  */
4125                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
4126                         return (error);
4127         }
4128
4129         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4130
4131         for (int d = 0; d < ndvas; d++)
4132                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
4133                         break;
4134
4135         spa_config_exit(spa, SCL_ALLOC, FTAG);
4136
4137         ASSERT(error == 0 || txg == 0);
4138
4139         return (error);
4140 }
4141
4142 /* ARGSUSED */
4143 static void
4144 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
4145     uint64_t size, void *arg)
4146 {
4147         if (vd->vdev_ops == &vdev_indirect_ops)
4148                 return;
4149
4150         metaslab_check_free_impl(vd, offset, size);
4151 }
4152
4153 static void
4154 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
4155 {
4156         metaslab_t *msp;
4157         spa_t *spa = vd->vdev_spa;
4158
4159         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4160                 return;
4161
4162         if (vd->vdev_ops->vdev_op_remap != NULL) {
4163                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
4164                     metaslab_check_free_impl_cb, NULL);
4165                 return;
4166         }
4167
4168         ASSERT(vdev_is_concrete(vd));
4169         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4170         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4171
4172         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4173
4174         mutex_enter(&msp->ms_lock);
4175         if (msp->ms_loaded)
4176                 range_tree_verify(msp->ms_allocatable, offset, size);
4177
4178         range_tree_verify(msp->ms_freeing, offset, size);
4179         range_tree_verify(msp->ms_checkpointing, offset, size);
4180         range_tree_verify(msp->ms_freed, offset, size);
4181         for (int j = 0; j < TXG_DEFER_SIZE; j++)
4182                 range_tree_verify(msp->ms_defer[j], offset, size);
4183         mutex_exit(&msp->ms_lock);
4184 }
4185
4186 void
4187 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4188 {
4189         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4190                 return;
4191
4192         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4193         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4194                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4195                 vdev_t *vd = vdev_lookup_top(spa, vdev);
4196                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4197                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4198
4199                 if (DVA_GET_GANG(&bp->blk_dva[i]))
4200                         size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4201
4202                 ASSERT3P(vd, !=, NULL);
4203
4204                 metaslab_check_free_impl(vd, offset, size);
4205         }
4206         spa_config_exit(spa, SCL_VDEV, FTAG);
4207 }