4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
17 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
20 #include <sys/zfs_context.h>
22 #include <sys/spa_impl.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/fs/zfs.h>
26 #include <sys/metaslab.h>
27 #include <sys/refcount.h>
29 #include <sys/vdev_indirect_mapping.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dsl_synctask.h>
37 * An indirect vdev corresponds to a vdev that has been removed. Since
38 * we cannot rewrite block pointers of snapshots, etc., we keep a
39 * mapping from old location on the removed device to the new location
40 * on another device in the pool and use this mapping whenever we need
41 * to access the DVA. Unfortunately, this mapping did not respect
42 * logical block boundaries when it was first created, and so a DVA on
43 * this indirect vdev may be "split" into multiple sections that each
44 * map to a different location. As a consequence, not all DVAs can be
45 * translated to an equivalent new DVA. Instead we must provide a
46 * "vdev_remap" operation that executes a callback on each contiguous
47 * segment of the new location. This function is used in multiple ways:
49 * - reads and repair writes to this device use the callback to create
50 * a child io for each mapped segment.
52 * - frees and claims to this device use the callback to free or claim
53 * each mapped segment. (Note that we don't actually need to claim
54 * log blocks on indirect vdevs, because we don't allocate to
55 * removing vdevs. However, zdb uses zio_claim() for its leak
60 * "Big theory statement" for how we mark blocks obsolete.
62 * When a block on an indirect vdev is freed or remapped, a section of
63 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
64 * keep track of how much of each mapping entry is obsolete. When
65 * an entry becomes completely obsolete, we can remove it, thus reducing
66 * the memory used by the mapping. The complete picture of obsolescence
67 * is given by the following data structures, described below:
68 * - the entry-specific obsolete count
69 * - the vdev-specific obsolete spacemap
70 * - the pool-specific obsolete bpobj
72 * == On disk data structures used ==
74 * We track the obsolete space for the pool using several objects. Each
75 * of these objects is created on demand and freed when no longer
76 * needed, and is assumed to be empty if it does not exist.
77 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
79 * - Each vic_mapping_object (associated with an indirect vdev) can
80 * have a vimp_counts_object. This is an array of uint32_t's
81 * with the same number of entries as the vic_mapping_object. When
82 * the mapping is condensed, entries from the vic_obsolete_sm_object
83 * (see below) are folded into the counts. Therefore, each
84 * obsolete_counts entry tells us the number of bytes in the
85 * corresponding mapping entry that were not referenced when the
86 * mapping was last condensed.
88 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
89 * This is a space map containing an alloc entry for every DVA that
90 * has been obsoleted since the last time this indirect vdev was
91 * condensed. We use this object in order to improve performance
92 * when marking a DVA as obsolete. Instead of modifying an arbitrary
93 * offset of the vimp_counts_object, we only need to append an entry
94 * to the end of this object. When a DVA becomes obsolete, it is
95 * added to the obsolete space map. This happens when the DVA is
96 * freed, remapped and not referenced by a snapshot, or the last
97 * snapshot referencing it is destroyed.
99 * - Each dataset can have a ds_remap_deadlist object. This is a
100 * deadlist object containing all blocks that were remapped in this
101 * dataset but referenced in a previous snapshot. Blocks can *only*
102 * appear on this list if they were remapped (dsl_dataset_block_remapped);
103 * blocks that were killed in a head dataset are put on the normal
104 * ds_deadlist and marked obsolete when they are freed.
106 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
107 * in the pool that need to be marked obsolete. When a snapshot is
108 * destroyed, we move some of the ds_remap_deadlist to the obsolete
109 * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
110 * asynchronously process the obsolete bpobj, moving its entries to
111 * the specific vdevs' obsolete space maps.
113 * == Summary of how we mark blocks as obsolete ==
115 * - When freeing a block: if any DVA is on an indirect vdev, append to
116 * vic_obsolete_sm_object.
117 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
118 * references; otherwise append to vic_obsolete_sm_object).
119 * - When freeing a snapshot: move parts of ds_remap_deadlist to
120 * dp_obsolete_bpobj (same algorithm as ds_deadlist).
121 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
122 * individual vdev's vic_obsolete_sm_object.
126 * "Big theory statement" for how we condense indirect vdevs.
128 * Condensing an indirect vdev's mapping is the process of determining
129 * the precise counts of obsolete space for each mapping entry (by
130 * integrating the obsolete spacemap into the obsolete counts) and
131 * writing out a new mapping that contains only referenced entries.
133 * We condense a vdev when we expect the mapping to shrink (see
134 * vdev_indirect_should_condense()), but only perform one condense at a
135 * time to limit the memory usage. In addition, we use a separate
136 * open-context thread (spa_condense_indirect_thread) to incrementally
137 * create the new mapping object in a way that minimizes the impact on
138 * the rest of the system.
140 * == Generating a new mapping ==
142 * To generate a new mapping, we follow these steps:
144 * 1. Save the old obsolete space map and create a new mapping object
145 * (see spa_condense_indirect_start_sync()). This initializes the
146 * spa_condensing_indirect_phys with the "previous obsolete space map",
147 * which is now read only. Newly obsolete DVAs will be added to a
148 * new (initially empty) obsolete space map, and will not be
149 * considered as part of this condense operation.
151 * 2. Construct in memory the precise counts of obsolete space for each
152 * mapping entry, by incorporating the obsolete space map into the
153 * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
155 * 3. Iterate through each mapping entry, writing to the new mapping any
156 * entries that are not completely obsolete (i.e. which don't have
157 * obsolete count == mapping length). (See
158 * spa_condense_indirect_generate_new_mapping().)
160 * 4. Destroy the old mapping object and switch over to the new one
161 * (spa_condense_indirect_complete_sync).
163 * == Restarting from failure ==
165 * To restart the condense when we import/open the pool, we must start
166 * at the 2nd step above: reconstruct the precise counts in memory,
167 * based on the space map + counts. Then in the 3rd step, we start
168 * iterating where we left off: at vimp_max_offset of the new mapping
/*
 * Tunables controlling whether and when indirect-vdev mappings are
 * condensed.  Master switch: setting this to B_FALSE disables condensing
 * entirely (checked in vdev_indirect_should_condense()).
 */
172 boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
175 * Condense if at least this percent of the bytes in the mapping is
176 * obsolete. With the default of 25%, the amount of space mapped
177 * will be reduced to 1% of its original size after at most 16
178 * condenses. Higher values will condense less often (causing less
179 * i/o); lower values will reduce the mapping size more quickly.
181 int zfs_indirect_condense_obsolete_pct = 25;
184 * Condense if the obsolete space map takes up more than this amount of
185 * space on disk (logically). This limits the amount of disk space
186 * consumed by the obsolete space map; the default of 1GB is small enough
187 * that we typically don't mind "wasting" it.
189 uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
192 * Don't bother condensing if the mapping uses less than this amount of
193 * memory. The default of 128KB is considered a "trivial" amount of
194 * memory and not worth reducing.
196 uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
199 * This is used by the test suite so that it can ensure that certain
200 * actions happen while in the middle of a condense (which might otherwise
201 * complete too quickly). If used to reduce the performance impact of
202 * condensing in production, a maximum value of 1 should be sufficient.
204 int zfs_condense_indirect_commit_entry_delay_ticks = 0;
207 * Mark the given offset and size as being obsolete in the given txg.
/*
 * Records the segment in the vdev's in-core vdev_obsolete_segments range
 * tree (under vdev_obsolete_lock) and dirties the vdev so the segment is
 * written to the obsolete space map at sync time (see
 * vdev_indirect_sync_obsolete()).  The ASSERT below shows this must run
 * in the syncing context of the given txg.
 */
210 vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
213 spa_t *spa = vd->vdev_spa;
214 ASSERT3U(spa_syncing_txg(spa), ==, txg);
215 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
216 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
/* The offset being obsoleted must actually be covered by the mapping. */
218 VERIFY(vdev_indirect_mapping_entry_for_offset(
219 vd->vdev_indirect_mapping, offset) != NULL);
221 if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
222 mutex_enter(&vd->vdev_obsolete_lock);
223 range_tree_add(vd->vdev_obsolete_segments, offset, size);
224 mutex_exit(&vd->vdev_obsolete_lock);
/* Dirty the vdev so the accumulated segments get synced this txg. */
225 vdev_dirty(vd, 0, NULL, txg);
230 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
231 * wrapper is provided because the DMU does not know about vdev_t's and
232 * cannot directly call vdev_indirect_mark_obsolete.
/* Translates vdev_id -> vdev_t and tx -> txg, then delegates. */
235 spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
236 uint64_t size, dmu_tx_t *tx)
238 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
239 ASSERT(dmu_tx_is_syncing(tx));
241 /* The DMU can only remap indirect vdevs. */
242 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
243 vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
/*
 * Allocate and initialize the in-core state for an in-progress condense:
 * one pending-entry list per txg slot, plus an open handle on the new
 * (destination) mapping object recorded in the on-disk scip.
 * Caller frees with spa_condensing_indirect_destroy().
 */
246 static spa_condensing_indirect_t *
247 spa_condensing_indirect_create(spa_t *spa)
249 spa_condensing_indirect_phys_t *scip =
250 &spa->spa_condensing_indirect_phys;
251 spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
252 objset_t *mos = spa->spa_meta_objset;
/* One list per txg so entries commit with the txg that created them. */
254 for (int i = 0; i < TXG_SIZE; i++) {
255 list_create(&sci->sci_new_mapping_entries[i],
256 sizeof (vdev_indirect_mapping_entry_t),
257 offsetof(vdev_indirect_mapping_entry_t, vime_node));
260 sci->sci_new_mapping =
261 vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
/*
 * Tear down the in-core condense state created by
 * spa_condensing_indirect_create(): destroy the per-txg lists, close the
 * new-mapping handle if still open, and free the structure.
 */
267 spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
269 for (int i = 0; i < TXG_SIZE; i++)
270 list_destroy(&sci->sci_new_mapping_entries[i]);
272 if (sci->sci_new_mapping != NULL)
273 vdev_indirect_mapping_close(sci->sci_new_mapping);
275 kmem_free(sci, sizeof (*sci));
/*
 * Decide whether this vdev's indirect mapping is worth condensing now.
 * Declines when: condensing is disabled, another condense is already in
 * progress (only one at a time), the spa is shutting down, the vdev is
 * still being removed, or nothing new has been marked obsolete.
 * Otherwise condenses when the obsolete percentage or the obsolete
 * space map's on-disk size crosses the tunables above.
 */
279 vdev_indirect_should_condense(vdev_t *vd)
281 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
282 spa_t *spa = vd->vdev_spa;
284 ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
286 if (!zfs_condense_indirect_vdevs_enable)
290 * We can only condense one indirect vdev at a time.
292 if (spa->spa_condensing_indirect != NULL)
295 if (spa_shutting_down(spa))
299 * The mapping object size must not change while we are
300 * condensing, so we can only condense indirect vdevs
301 * (not vdevs that are still in the middle of being removed).
303 if (vd->vdev_ops != &vdev_indirect_ops)
307 * If nothing new has been marked obsolete, there is no
308 * point in condensing.
310 if (vd->vdev_obsolete_sm == NULL) {
311 ASSERT0(vdev_obsolete_sm_object(vd));
315 ASSERT(vd->vdev_obsolete_sm != NULL);
317 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
318 space_map_object(vd->vdev_obsolete_sm));
320 uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
321 uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
322 uint64_t mapping_size = vdev_indirect_mapping_size(vim);
323 uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
325 ASSERT3U(bytes_obsolete, <=, bytes_mapped);
328 * If a high percentage of the bytes that are mapped have become
329 * obsolete, condense (unless the mapping is already small enough).
330 * This has a good chance of reducing the amount of memory used
333 if (bytes_obsolete * 100 / bytes_mapped >=
334 zfs_indirect_condense_obsolete_pct &&
335 mapping_size > zfs_condense_min_mapping_bytes) {
336 zfs_dbgmsg("should condense vdev %llu because obsolete "
337 "spacemap covers %d%% of %lluMB mapping",
338 (u_longlong_t)vd->vdev_id,
339 (int)(bytes_obsolete * 100 / bytes_mapped),
340 (u_longlong_t)bytes_mapped / 1024 / 1024);
345 * If the obsolete space map takes up too much space on disk,
346 * condense in order to free up this disk space.
348 if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
349 zfs_dbgmsg("should condense vdev %llu because obsolete sm "
350 "length %lluMB >= max size %lluMB",
351 (u_longlong_t)vd->vdev_id,
352 (u_longlong_t)obsolete_sm_size / 1024 / 1024,
353 (u_longlong_t)zfs_condense_max_obsolete_bytes /
362 * This sync task completes (finishes) a condense, deleting the old
363 * mapping and replacing it with the new one.
/*
 * Runs in syncing context (dsl sync task).  Swaps the vdev over to the
 * new mapping (under vdev_indirect_rwlock as writer), frees the old
 * mapping object and the previous obsolete space map, clears the on-disk
 * scip state, removes the CONDENSING_INDIRECT entry from the MOS
 * directory, and tears down the in-core condense state.
 */
366 spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
368 spa_condensing_indirect_t *sci = arg;
369 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
370 spa_condensing_indirect_phys_t *scip =
371 &spa->spa_condensing_indirect_phys;
372 vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
373 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
374 objset_t *mos = spa->spa_meta_objset;
375 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
376 uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
378 vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
380 ASSERT(dmu_tx_is_syncing(tx));
381 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
382 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
/* All pending entries must already have been committed by _commit_sync(). */
383 for (int i = 0; i < TXG_SIZE; i++) {
384 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
386 ASSERT(vic->vic_mapping_object != 0);
387 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
388 ASSERT(scip->scip_next_mapping_object != 0);
389 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
392 * Reset vdev_indirect_mapping to refer to the new object.
394 rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
395 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
396 vd->vdev_indirect_mapping = sci->sci_new_mapping;
397 rw_exit(&vd->vdev_indirect_rwlock);
/* Ownership of sci_new_mapping transferred to the vdev above. */
399 sci->sci_new_mapping = NULL;
400 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
401 vic->vic_mapping_object = scip->scip_next_mapping_object;
402 scip->scip_next_mapping_object = 0;
404 space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
405 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
406 scip->scip_prev_obsolete_sm_object = 0;
410 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
411 DMU_POOL_CONDENSING_INDIRECT, tx));
412 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
413 spa->spa_condensing_indirect = NULL;
415 zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
416 "new mapping object %llu has %llu entries "
417 "(was %llu entries)",
418 vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
419 new_count, old_count);
421 vdev_config_dirty(spa->spa_root_vdev);
425 * This sync task appends entries to the new mapping object.
/*
 * Syncing-context half of the commit path: drains this txg's pending
 * entry list (filled in open context by _commit_entry()) into the new
 * on-disk mapping object.
 */
428 spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
430 spa_condensing_indirect_t *sci = arg;
431 uint64_t txg = dmu_tx_get_txg(tx);
432 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
434 ASSERT(dmu_tx_is_syncing(tx));
435 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
437 vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
438 &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
/* add_entries consumes the list; it must be empty afterwards. */
439 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
443 * Open-context function to add one entry to the new mapping. The new
444 * entry will be remembered and written from syncing context.
/*
 * Assigns a tx (TXG_WAIT) to pick a txg, queues the entry on that txg's
 * pending list, and — if this is the first entry for the txg — registers
 * spa_condense_indirect_commit_sync() to flush the list at sync time.
 */
447 spa_condense_indirect_commit_entry(spa_t *spa,
448 vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
450 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
/* Fully-obsolete entries are dropped by the caller, never committed. */
452 ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
454 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
455 dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
456 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
457 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
460 * If we are the first entry committed this txg, kick off the sync
461 * task to write to the MOS on our behalf.
463 if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
464 dsl_sync_task_nowait(dmu_tx_pool(tx),
465 spa_condense_indirect_commit_sync, sci,
466 0, ZFS_SPACE_CHECK_NONE, tx);
469 vdev_indirect_mapping_entry_t *vime =
470 kmem_alloc(sizeof (*vime), KM_SLEEP);
471 vime->vime_mapping = *vimep;
472 vime->vime_obsolete_count = count;
473 list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
/*
 * Walk the old mapping from start_index, committing to the new mapping
 * every entry that is not completely obsolete (obsolete count < entry
 * size).  Checks the zthr for cancellation each iteration so the
 * condense can be paused and resumed (e.g. across pool export/import).
 */
479 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
480 uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
482 spa_t *spa = vd->vdev_spa;
483 uint64_t mapi = start_index;
484 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
485 uint64_t old_num_entries =
486 vdev_indirect_mapping_num_entries(old_mapping);
488 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
489 ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
491 zfs_dbgmsg("starting condense of vdev %llu from index %llu",
492 (u_longlong_t)vd->vdev_id,
495 while (mapi < old_num_entries) {
497 if (zthr_iscancelled(zthr)) {
498 zfs_dbgmsg("pausing condense of vdev %llu "
499 "at index %llu", (u_longlong_t)vd->vdev_id,
504 vdev_indirect_mapping_entry_phys_t *entry =
505 &old_mapping->vim_entries[mapi];
506 uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
507 ASSERT3U(obsolete_counts[mapi], <=, entry_size);
/* Keep the entry only if some of it is still referenced. */
508 if (obsolete_counts[mapi] < entry_size) {
509 spa_condense_indirect_commit_entry(spa, entry,
510 obsolete_counts[mapi]);
513 * This delay may be requested for testing, debugging,
514 * or performance reasons.
516 delay(zfs_condense_indirect_commit_entry_delay_ticks);
/* zthr "checkfunc": the condense thread should run iff a condense is active. */
525 spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
529 return (spa->spa_condensing_indirect != NULL);
/*
 * Open-context zthr body that performs (or resumes) a condense:
 * rebuild precise obsolete counts from the saved counts + previous
 * obsolete space map, determine where to resume in the old mapping from
 * the new mapping's max offset, generate the new mapping, and finally
 * (unless cancelled) run the completion sync task.
 */
534 spa_condense_indirect_thread(void *arg, zthr_t *zthr)
539 ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
540 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
541 vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
542 ASSERT3P(vd, !=, NULL);
543 spa_config_exit(spa, SCL_VDEV, FTAG);
545 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
546 spa_condensing_indirect_phys_t *scip =
547 &spa->spa_condensing_indirect_phys;
549 uint64_t start_index;
550 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
551 space_map_t *prev_obsolete_sm = NULL;
553 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
554 ASSERT(scip->scip_next_mapping_object != 0);
555 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
556 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
558 for (int i = 0; i < TXG_SIZE; i++) {
560 * The list must start out empty in order for the
561 * _commit_sync() sync task to be properly registered
562 * on the first call to _commit_entry(); so it's wise
563 * to double check and ensure we actually are starting
566 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
/* Fold the (read-only) previous obsolete space map into the counts. */
569 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
570 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
571 space_map_update(prev_obsolete_sm);
572 counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
573 if (prev_obsolete_sm != NULL) {
574 vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
575 counts, prev_obsolete_sm);
577 space_map_close(prev_obsolete_sm);
580 * Generate new mapping. Determine what index to continue from
581 * based on the max offset that we've already written in the
584 uint64_t max_offset =
585 vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
586 if (max_offset == 0) {
587 /* We haven't written anything to the new mapping yet. */
591 * Pick up from where we left off. _entry_for_offset()
592 * returns a pointer into the vim_entries array. If
593 * max_offset is greater than any of the mappings
594 * contained in the table NULL will be returned and
595 * that indicates we've exhausted our iteration of the
599 vdev_indirect_mapping_entry_phys_t *entry =
600 vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
605 * We've already written the whole new mapping.
606 * This special value will cause us to skip the
607 * generate_new_mapping step and just do the sync
608 * task to complete the condense.
610 start_index = UINT64_MAX;
612 start_index = entry - old_mapping->vim_entries;
613 ASSERT3U(start_index, <,
614 vdev_indirect_mapping_num_entries(old_mapping));
618 spa_condense_indirect_generate_new_mapping(vd, counts,
621 vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
624 * If the zthr has received a cancellation signal while running
625 * in generate_new_mapping() or at any point after that, then bail
626 * early. We don't want to complete the condense if the spa is
629 if (zthr_iscancelled(zthr))
632 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
633 spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
640 * Sync task to begin the condensing process.
/*
 * Initializes the on-disk scip (vdev id, new mapping object, previous
 * obsolete space map), detaches the vdev's current obsolete space map so
 * newly-obsoleted DVAs go to a fresh one, persists the scip in the MOS
 * directory, creates the in-core state, and wakes the condense zthr.
 */
643 spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
645 spa_t *spa = vd->vdev_spa;
646 spa_condensing_indirect_phys_t *scip =
647 &spa->spa_condensing_indirect_phys;
/* No condense may already be in progress. */
649 ASSERT0(scip->scip_next_mapping_object);
650 ASSERT0(scip->scip_prev_obsolete_sm_object);
651 ASSERT0(scip->scip_vdev);
652 ASSERT(dmu_tx_is_syncing(tx));
653 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
654 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
655 ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
657 uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
658 ASSERT(obsolete_sm_obj != 0);
660 scip->scip_vdev = vd->vdev_id;
661 scip->scip_next_mapping_object =
662 vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
/* The current obsolete sm becomes the read-only "previous" one. */
664 scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
667 * We don't need to allocate a new space map object, since
668 * vdev_indirect_sync_obsolete will allocate one when needed.
670 space_map_close(vd->vdev_obsolete_sm);
671 vd->vdev_obsolete_sm = NULL;
672 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
673 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
675 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
676 DMU_POOL_DIRECTORY_OBJECT,
677 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
678 sizeof (*scip) / sizeof (uint64_t), scip, tx));
680 ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
681 spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
683 zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
685 vd->vdev_id, dmu_tx_get_txg(tx),
686 (u_longlong_t)scip->scip_prev_obsolete_sm_object,
687 (u_longlong_t)scip->scip_next_mapping_object;
689 zthr_wakeup(spa->spa_condense_zthr);
693 * Sync to the given vdev's obsolete space map any segments that are no longer
694 * referenced as of the given txg.
696 * If the obsolete space map doesn't exist yet, create and open it.
699 vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
701 spa_t *spa = vd->vdev_spa;
702 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
704 ASSERT3U(vic->vic_mapping_object, !=, 0);
/* Only called when mark_obsolete accumulated something this txg. */
705 ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
706 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
707 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
709 if (vdev_obsolete_sm_object(vd) == 0) {
/* First use: allocate the space map and record it in the top-level ZAP. */
710 uint64_t obsolete_sm_object =
711 space_map_alloc(spa->spa_meta_objset, tx);
713 ASSERT(vd->vdev_top_zap != 0);
714 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
715 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
716 sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
717 ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
719 spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
720 VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
721 spa->spa_meta_objset, obsolete_sm_object,
722 0, vd->vdev_asize, 0));
723 space_map_update(vd->vdev_obsolete_sm);
726 ASSERT(vd->vdev_obsolete_sm != NULL);
727 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
728 space_map_object(vd->vdev_obsolete_sm));
/* Append the accumulated segments as ALLOC records, then empty the tree. */
730 space_map_write(vd->vdev_obsolete_sm,
731 vd->vdev_obsolete_segments, SM_ALLOC, tx);
732 space_map_update(vd->vdev_obsolete_sm);
733 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
/*
 * Called at pool open/import: look up any persisted condense state
 * (CONDENSING_INDIRECT in the MOS directory).  If found and the pool is
 * writeable, recreate the in-core state so the zthr can resume the
 * condense.  ENOENT means no condense was in progress.
 */
737 spa_condense_init(spa_t *spa)
739 int error = zap_lookup(spa->spa_meta_objset,
740 DMU_POOL_DIRECTORY_OBJECT,
741 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
742 sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
743 &spa->spa_condensing_indirect_phys);
745 if (spa_writeable(spa)) {
746 spa->spa_condensing_indirect =
747 spa_condensing_indirect_create(spa);
750 } else if (error == ENOENT) {
/* Called at pool close: release any in-core condense state. */
758 spa_condense_fini(spa_t *spa)
760 if (spa->spa_condensing_indirect != NULL) {
761 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
762 spa->spa_condensing_indirect = NULL;
/* Create the condense zthr (check func decides whether it should run). */
767 spa_start_indirect_condensing_thread(spa_t *spa)
769 ASSERT3P(spa->spa_condense_zthr, ==, NULL);
770 spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
771 spa_condense_indirect_thread, spa);
775 * Gets the obsolete spacemap object from the vdev's ZAP.
776 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
780 vdev_obsolete_sm_object(vdev_t *vd)
782 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
/* No top-level ZAP at all means no obsolete space map. */
783 if (vd->vdev_top_zap == 0) {
788 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
789 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
/* Only a missing entry is acceptable; any other error is a bug. */
791 ASSERT(err == 0 || err == ENOENT);
/*
 * Query the vdev's top-level ZAP for the OBSOLETE_COUNTS_ARE_PRECISE
 * flag.  Missing ZAP or missing entry means the counts are not precise.
 */
797 vdev_obsolete_counts_are_precise(vdev_t *vd)
799 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
800 if (vd->vdev_top_zap == 0) {
805 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
806 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
808 ASSERT(err == 0 || err == ENOENT);
/* vdev_ops close hook — nothing to do for an indirect vdev. */
815 vdev_indirect_close(vdev_t *vd)
/* vdev_ops io_done hook — child zios handle completion themselves. */
821 vdev_indirect_io_done(zio_t *zio)
/*
 * vdev_ops open hook: report sizes/ashifts from the stored vdev state;
 * there is no physical device to probe behind an indirect vdev.
 */
827 vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
828 uint64_t *logical_ashift, uint64_t *physical_ashift)
830 *psize = *max_psize = vd->vdev_asize +
831 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
832 *logical_ashift = vd->vdev_ashift;
833 *physical_ashift = vd->vdev_physical_ashift;
/*
 * Work-stack element for vdev_indirect_remap(): one still-to-be-remapped
 * extent, plus the running offset (rs_split_offset) of its data within
 * the original (pre-split) request.
 */
837 typedef struct remap_segment {
841 uint64_t rs_split_offset;
/* Allocate and fill a remap_segment_t (KM_SLEEP; caller frees). */
846 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
848 remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
850 rs->rs_offset = offset;
851 rs->rs_asize = asize;
852 rs->rs_split_offset = split_offset;
857 * Given an indirect vdev and an extent on that vdev, it duplicates the
858 * physical entries of the indirect mapping that correspond to the extent
859 * to a new array and returns a pointer to it. In addition, copied_entries
860 * is populated with the number of mapping entries that were duplicated.
862 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
863 * This ensures that the mapping won't change due to condensing as we
864 * copy over its contents.
866 * Finally, since we are doing an allocation, it is up to the caller to
867 * free the array allocated in this function.
869 vdev_indirect_mapping_entry_phys_t *
870 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
871 uint64_t asize, uint64_t *copied_entries)
873 vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
874 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
875 uint64_t entries = 0;
877 ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
879 vdev_indirect_mapping_entry_phys_t *first_mapping =
880 vdev_indirect_mapping_entry_for_offset(vim, offset);
881 ASSERT3P(first_mapping, !=, NULL);
/* Walk consecutive entries until the whole [offset, offset+asize) is covered. */
883 vdev_indirect_mapping_entry_phys_t *m = first_mapping;
885 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
887 ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
888 ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
890 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
891 uint64_t inner_size = MIN(asize, size - inner_offset);
893 offset += inner_size;
/* Entries are contiguous in vim_entries, so one bcopy covers them all. */
899 size_t copy_length = entries * sizeof (*first_mapping);
900 duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
901 bcopy(first_mapping, duplicate_mappings, copy_length);
902 *copied_entries = entries;
904 return (duplicate_mappings);
908 * Goes through the relevant indirect mappings until it hits a concrete vdev
909 * and issues the callback. On the way to the concrete vdev, if any other
910 * indirect vdevs are encountered, then the callback will also be called on
911 * each of those indirect vdevs. For example, if the segment is mapped to
912 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
913 * mapped to segment B on concrete vdev 2, then the callback will be called on
914 * both vdev 1 and vdev 2.
916 * While the callback passed to vdev_indirect_remap() is called on every vdev
917 * the function encounters, certain callbacks only care about concrete vdevs.
918 * These types of callbacks should return immediately and explicitly when they
919 * are called on an indirect vdev.
921 * Because there is a possibility that a DVA section in the indirect device
922 * has been split into multiple sections in our mapping, we keep track
923 * of the relevant contiguous segments of the new location (remap_segment_t)
924 * in a stack. This way we can call the callback for each of the new sections
925 * created by a single section of the indirect device. Note though, that in
926 * this scenario the callbacks in each split block won't occur in-order in
927 * terms of offset, so callers should not make any assumptions about that.
929 * For callbacks that don't handle split blocks and immediately return when
930 * they encounter them (as is the case for remap_blkptr_cb), the caller can
931 * assume that its callback will be applied from the first indirect vdev
932 * encountered to the last one and then the concrete vdev, in that order.
935 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
936 void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
939 spa_t *spa = vd->vdev_spa;
941 list_create(&stack, sizeof (remap_segment_t),
942 offsetof(remap_segment_t, rs_node));
/* Iterative DFS: seed with the original extent, pop until the stack drains. */
944 for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
945 rs != NULL; rs = list_remove_head(&stack)) {
946 vdev_t *v = rs->rs_vd;
947 uint64_t num_entries = 0;
949 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
950 ASSERT(rs->rs_asize > 0);
953 * Note: As this function can be called from open context
954 * (e.g. zio_read()), we need the following rwlock to
955 * prevent the mapping from being changed by condensing.
957 * So we grab the lock and we make a copy of the entries
958 * that are relevant to the extent that we are working on.
959 * Once that is done, we drop the lock and iterate over
960 * our copy of the mapping. Once we are done with the with
961 * the remap segment and we free it, we also free our copy
962 * of the indirect mapping entries that are relevant to it.
964 * This way we don't need to wait until the function is
965 * finished with a segment, to condense it. In addition, we
966 * don't need a recursive rwlock for the case that a call to
967 * vdev_indirect_remap() needs to call itself (through the
968 * codepath of its callback) for the same vdev in the middle
971 rw_enter(&v->vdev_indirect_rwlock, RW_READER);
972 vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
973 ASSERT3P(vim, !=, NULL);
975 vdev_indirect_mapping_entry_phys_t *mapping =
976 vdev_indirect_mapping_duplicate_adjacent_entries(v,
977 rs->rs_offset, rs->rs_asize, &num_entries);
978 ASSERT3P(mapping, !=, NULL);
979 ASSERT3U(num_entries, >, 0);
980 rw_exit(&v->vdev_indirect_rwlock);
982 for (uint64_t i = 0; i < num_entries; i++) {
984 * Note: the vdev_indirect_mapping can not change
985 * while we are running. It only changes while the
986 * removal is in progress, and then only from syncing
987 * context. While a removal is in progress, this
988 * function is only called for frees, which also only
989 * happen from syncing context.
991 vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
993 ASSERT3P(m, !=, NULL);
994 ASSERT3U(rs->rs_asize, >, 0);
996 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
997 uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
998 uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
1000 ASSERT3U(rs->rs_offset, >=,
1001 DVA_MAPPING_GET_SRC_OFFSET(m));
1002 ASSERT3U(rs->rs_offset, <,
1003 DVA_MAPPING_GET_SRC_OFFSET(m) + size);
/* A mapping entry never points back at the same indirect vdev. */
1004 ASSERT3U(dst_vdev, !=, v->vdev_id);
1006 uint64_t inner_offset = rs->rs_offset -
1007 DVA_MAPPING_GET_SRC_OFFSET(m);
1008 uint64_t inner_size =
1009 MIN(rs->rs_asize, size - inner_offset);
1011 vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
1012 ASSERT3P(dst_v, !=, NULL);
/* Destination still indirect: push for another round of translation. */
1014 if (dst_v->vdev_ops == &vdev_indirect_ops) {
1015 list_insert_head(&stack,
1016 rs_alloc(dst_v, dst_offset + inner_offset,
1017 inner_size, rs->rs_split_offset));
1021 if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
1022 IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
1024 * Note: This clause exists only solely for
1025 * testing purposes. We use it to ensure that
1026 * split blocks work and that the callbacks
1027 * using them yield the same result if issued
1030 uint64_t inner_half = inner_size / 2;
1032 func(rs->rs_split_offset + inner_half, dst_v,
1033 dst_offset + inner_offset + inner_half,
1036 func(rs->rs_split_offset, dst_v,
1037 dst_offset + inner_offset,
1040 func(rs->rs_split_offset, dst_v,
1041 dst_offset + inner_offset,
/* Advance past the portion of the segment this entry covered. */
1045 rs->rs_offset += inner_size;
1046 rs->rs_asize -= inner_size;
1047 rs->rs_split_offset += inner_size;
1049 VERIFY0(rs->rs_asize);
1051 kmem_free(mapping, num_entries * sizeof (*mapping));
1052 kmem_free(rs, sizeof (remap_segment_t));
1054 list_destroy(&stack);
/*
 * Completion callback for each child zio created by the remap: fold the
 * child's error into the parent (under the parent's io_lock) and release
 * the abd slice taken with abd_get_offset().
 */
1058 vdev_indirect_child_io_done(zio_t *zio)
1060 zio_t *pio = zio->io_private;
1062 mutex_enter(&pio->io_lock);
1063 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
1064 mutex_exit(&pio->io_lock);
1066 abd_put(zio->io_abd);
/*
 * vdev_indirect_remap() callback for I/O: issue a child zio against each
 * concrete segment, using a slice of the parent's abd at split_offset.
 * Intermediate indirect vdevs are skipped (only concrete ones do I/O).
 */
1070 vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
1071 uint64_t size, void *arg)
1075 ASSERT3P(vd, !=, NULL);
1077 if (vd->vdev_ops == &vdev_indirect_ops)
1080 zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
1081 abd_get_offset(zio->io_abd, split_offset),
1082 size, zio->io_type, zio->io_priority,
1083 0, vdev_indirect_child_io_done, zio));
/*
 * vdev_ops io_start hook: remap the request onto concrete vdevs and fan
 * out child zios.  Writes are only allowed here for self-heal / induced
 * damage (repair) — normal allocations never target an indirect vdev.
 */
1087 vdev_indirect_io_start(zio_t *zio)
1089 spa_t *spa = zio->io_spa;
1091 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1092 if (zio->io_type != ZIO_TYPE_READ) {
1093 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
1094 ASSERT((zio->io_flags &
1095 (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
1098 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
1099 vdev_indirect_io_start_cb, zio);
/* Operations vector registering the indirect vdev type (a top-level, non-leaf vdev). */
1104 vdev_ops_t vdev_indirect_ops = {
1106 vdev_indirect_close,
1108 vdev_indirect_io_start,
1109 vdev_indirect_io_done,
1113 vdev_indirect_remap,
1114 VDEV_TYPE_INDIRECT, /* name of this vdev type */
1115 B_FALSE /* leaf vdev */