4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
28 #include <sys/dsl_dataset.h>
30 #include <sys/refcount.h>
32 #include <sys/zfs_context.h>
33 #include <sys/dsl_pool.h>
36 * Deadlist concurrency:
38 * Deadlists can only be modified from the syncing thread.
40 * Except for dsl_deadlist_insert(), it can only be modified with the
41 * dp_config_rwlock held with RW_WRITER.
43 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
44 * be called concurrently, from open context, with the dp_config_rwlock held
47 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
48 * the accessors, protecting:
49 * dl_phys->dl_used,comp,uncomp
50 * and protecting the dl_tree from being loaded.
51 * The locking is provided by dl_lock. Note that locking on the bpobj_t
52 * provides its own locking, and dl_oldfmt is immutable.
/*
 * AVL comparator for deadlist entries: orders dsl_deadlist_entry_t nodes
 * by dle_mintxg (ascending), so the tree is sorted by birth-txg range.
 */
56 dsl_deadlist_compare(const void *arg1, const void *arg2)
58 const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1;
59 const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2;
/* AVL_CMP yields -1/0/+1 for the avl framework. */
61 return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
/*
 * Populate the in-core AVL tree (dl_tree) from the on-disk ZAP: each ZAP
 * entry maps a mintxg (encoded as the string name) to a bpobj object number.
 * Caller must hold dl_lock; old-format (plain bpobj) deadlists have no tree.
 * NOTE(review): this view appears to be missing the dl_havetree early-return
 * guard and the zap_cursor_fini() call -- confirm against the full file.
 */
65 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
70 ASSERT(MUTEX_HELD(&dl->dl_lock));
/* Old-format deadlists are a single bpobj; they must not reach here. */
72 ASSERT(!dl->dl_oldfmt);
76 avl_create(&dl->dl_tree, dsl_deadlist_compare,
77 sizeof (dsl_deadlist_entry_t),
78 offsetof(dsl_deadlist_entry_t, dle_node));
/* Walk every ZAP entry, opening its bpobj and inserting into the tree. */
79 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
80 zap_cursor_retrieve(&zc, &za) == 0;
81 zap_cursor_advance(&zc)) {
82 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
/* ZAP name is the mintxg rendered as a decimal string. */
83 dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
84 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
85 za.za_first_integer));
86 avl_add(&dl->dl_tree, dle);
/* Mark the tree as loaded so subsequent calls can skip the rebuild. */
89 dl->dl_havetree = B_TRUE;
/*
 * Open the deadlist identified by 'object' in objset 'os', initializing
 * the in-core dsl_deadlist_t.  Detects the on-disk format: a DMU_OT_BPOBJ
 * object is an old-format deadlist (dl_oldfmt, accessed via dl_bpobj);
 * otherwise it is a new-format ZAP whose bonus buffer holds
 * dsl_deadlist_phys_t (dl_phys).
 * NOTE(review): the assignment of dl->dl_os and the return after the
 * old-format branch are not visible in this view -- confirm.
 */
93 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
95 dmu_object_info_t doi;
/* Must not already be open (dl_os is the open/closed indicator). */
97 ASSERT(!dsl_deadlist_is_open(dl));
99 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
101 dl->dl_object = object;
/* Hold the bonus buffer so dl_phys stays valid while open. */
102 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
103 dmu_object_info_from_db(dl->dl_dbuf, &doi);
104 if (doi.doi_type == DMU_OT_BPOBJ) {
/* Old format: release the bonus hold and open it as a bare bpobj. */
105 dmu_buf_rele(dl->dl_dbuf, dl);
107 dl->dl_oldfmt = B_TRUE;
108 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
/* New format: keep the bonus hold; tree is loaded lazily. */
112 dl->dl_oldfmt = B_FALSE;
113 dl->dl_phys = dl->dl_dbuf->db_data;
114 dl->dl_havetree = B_FALSE;
/* Return whether this deadlist has been opened (dl_os is set on open). */
118 dsl_deadlist_is_open(dsl_deadlist_t *dl)
120 return (dl->dl_os != NULL);
/*
 * Close an open deadlist: tear down the in-core AVL tree (closing each
 * entry's bpobj and freeing the entry), release the bonus hold, and
 * destroy the lock.  Old-format deadlists just close the single bpobj.
 */
124 dsl_deadlist_close(dsl_deadlist_t *dl)
127 dsl_deadlist_entry_t *dle;
129 ASSERT(dsl_deadlist_is_open(dl));
/* Old-format path: only the bare bpobj needs closing. */
132 dl->dl_oldfmt = B_FALSE;
133 bpobj_close(&dl->dl_bpobj);
/* Tree exists only if dsl_deadlist_load_tree() ran since open. */
139 if (dl->dl_havetree) {
140 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
142 bpobj_close(&dle->dle_bpobj);
143 kmem_free(dle, sizeof (*dle));
145 avl_destroy(&dl->dl_tree);
147 dmu_buf_rele(dl->dl_dbuf, dl);
148 mutex_destroy(&dl->dl_lock);
/*
 * Allocate a new on-disk deadlist object and return its object number.
 * Pools older than SPA_VERSION_DEADLISTS get the old format (a plain
 * bpobj); newer pools get a ZAP with a dsl_deadlist_phys_t bonus buffer.
 */
156 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
158 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
159 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
160 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
161 sizeof (dsl_deadlist_phys_t), tx));
/*
 * Free the on-disk deadlist 'dlobj' and everything it references.
 * Old format: free the single bpobj.  New format: walk the ZAP, freeing
 * each sub-bpobj (the shared empty bpobj is only decremented, never
 * freed), then free the ZAP object itself.
 * NOTE(review): the 'else' pairing bpobj_free() against the empty-bpobj
 * check is not visible in this view -- confirm against the full file.
 */
165 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
167 dmu_object_info_t doi;
171 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
172 if (doi.doi_type == DMU_OT_BPOBJ) {
173 bpobj_free(os, dlobj, tx);
177 for (zap_cursor_init(&zc, os, dlobj);
178 zap_cursor_retrieve(&zc, &za) == 0;
179 zap_cursor_advance(&zc)) {
180 uint64_t obj = za.za_first_integer;
/* dp_empty_bpobj is shared pool-wide; drop a reference instead. */
181 if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
182 bpobj_decr_empty(os, tx);
184 bpobj_free(os, obj, tx);
186 zap_cursor_fini(&zc);
187 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
/*
 * Append bp to the entry's bpobj.  If the entry still points at the
 * pool's shared empty bpobj, copy-on-write it first: allocate a private
 * bpobj, drop the empty-bpobj reference, and update the ZAP mapping for
 * this entry's mintxg to the new object.  Caller must hold dl_lock.
 */
191 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
192 const blkptr_t *bp, dmu_tx_t *tx)
194 ASSERT(MUTEX_HELD(&dl->dl_lock));
195 if (dle->dle_bpobj.bpo_object ==
196 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
/* CoW away from the shared empty bpobj before the first insert. */
197 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
198 bpobj_close(&dle->dle_bpobj);
199 bpobj_decr_empty(dl->dl_os, tx);
200 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
/* Keep the on-disk ZAP in sync with the new private bpobj. */
201 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
202 dle->dle_mintxg, obj, tx));
204 bpobj_enqueue(&dle->dle_bpobj, bp, tx);
/*
 * Attach an entire sub-bpobj 'obj' to the entry.  If the entry already
 * has a private (non-empty) bpobj, nest obj inside it; otherwise adopt
 * obj directly as the entry's bpobj, dropping the empty-bpobj reference
 * and updating the ZAP mapping.  Caller must hold dl_lock.
 */
208 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
209 uint64_t obj, dmu_tx_t *tx)
211 ASSERT(MUTEX_HELD(&dl->dl_lock));
212 if (dle->dle_bpobj.bpo_object !=
213 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
214 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
/* Entry was empty: take ownership of obj instead of nesting it. */
216 bpobj_close(&dle->dle_bpobj);
217 bpobj_decr_empty(dl->dl_os, tx);
218 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
219 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
220 dle->dle_mintxg, obj, tx));
/*
 * Insert a dead block pointer into the deadlist.  Old-format lists get a
 * direct bpobj_enqueue.  New-format lists update the cached space totals
 * in dl_phys under dl_lock, then route bp to the entry whose mintxg range
 * covers bp's birth txg (the nearest entry at or before blk_birth).
 * NOTE(review): the old-format guard around the first bpobj_enqueue()
 * is not visible in this view -- confirm.
 */
225 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
227 dsl_deadlist_entry_t dle_tofind;
228 dsl_deadlist_entry_t *dle;
232 bpobj_enqueue(&dl->dl_bpobj, bp, tx);
/* dl_lock protects dl_phys totals and the lazily-loaded tree. */
236 mutex_enter(&dl->dl_lock);
237 dsl_deadlist_load_tree(dl);
239 dmu_buf_will_dirty(dl->dl_dbuf, tx);
/* Accumulate deduped-allocated, physical, and logical sizes. */
240 dl->dl_phys->dl_used +=
241 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
242 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
243 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
/* Find the entry covering blk_birth: exact key or nearest one before. */
245 dle_tofind.dle_mintxg = bp->blk_birth;
246 dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
248 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
250 dle = AVL_PREV(&dl->dl_tree, dle);
251 dle_enqueue(dl, dle, bp, tx);
252 mutex_exit(&dl->dl_lock);
256 * Insert new key in deadlist, which must be > all current entries.
257 * mintxg is not inclusive.
/*
 * Add a new key (snapshot boundary) to the deadlist: allocate a new
 * entry backed by an empty bpobj, insert it into the in-core tree, and
 * record the mintxg -> bpobj mapping in the on-disk ZAP.  Per the
 * comment above, mintxg must be greater than all current entries.
 */
260 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
263 dsl_deadlist_entry_t *dle;
268 dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
269 dle->dle_mintxg = mintxg;
271 mutex_enter(&dl->dl_lock);
272 dsl_deadlist_load_tree(dl);
/* New entries start out referencing the shared empty bpobj. */
274 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
275 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
276 avl_add(&dl->dl_tree, dle);
/* Persist the mapping; continuation arguments not visible in this view. */
278 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
280 mutex_exit(&dl->dl_lock);
284 * Remove this key, merging its entries into the previous key.
/*
 * Remove the entry keyed by mintxg, folding its bpobj into the previous
 * entry (per the comment above) so no dead blocks are lost, then delete
 * the in-core node and the on-disk ZAP mapping.
 */
287 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
289 dsl_deadlist_entry_t dle_tofind;
290 dsl_deadlist_entry_t *dle, *dle_prev;
295 mutex_enter(&dl->dl_lock);
296 dsl_deadlist_load_tree(dl);
298 dle_tofind.dle_mintxg = mintxg;
/* mintxg must be an existing key; NULL 'where' since no nearest lookup. */
299 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
300 dle_prev = AVL_PREV(&dl->dl_tree, dle);
/* Merge this entry's blocks into the previous key's bpobj. */
302 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
304 avl_remove(&dl->dl_tree, dle);
305 bpobj_close(&dle->dle_bpobj);
306 kmem_free(dle, sizeof (*dle));
308 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
309 mutex_exit(&dl->dl_lock);
313 * Walk ds's snapshots to regenerate the ZAP & AVL.
/*
 * Rebuild the key set of deadlist 'dlobj' by walking the snapshot chain
 * backwards from 'mrs_obj' (most recent snapshot object), adding each
 * snapshot's ds_prev_snap_txg as a key.  Uses a stack-local deadlist.
 * NOTE(review): the old-format early-out between the first close (L of
 * orig 324) and the loop is not visible in this view -- confirm.
 */
316 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
317 uint64_t mrs_obj, dmu_tx_t *tx)
319 dsl_deadlist_t dl = { 0 };
320 dsl_pool_t *dp = dmu_objset_pool(os);
322 dsl_deadlist_open(&dl, os, dlobj);
324 dsl_deadlist_close(&dl);
/* Walk snapshots newest-to-oldest via ds_prev_snap_obj. */
328 while (mrs_obj != 0) {
330 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
331 dsl_deadlist_add_key(&dl,
332 dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
333 mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
334 dsl_dataset_rele(ds, FTAG);
336 dsl_deadlist_close(&dl);
/*
 * Create a new deadlist object that has the same keys as 'dl' for
 * entries with mintxg < maxtxg, each mapped to a fresh empty bpobj
 * (keys only -- no block pointers are copied).  For an old-format
 * source, the keys are regenerated from the snapshot chain instead.
 * Returns the new object number (return statement not visible here).
 */
340 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
341 uint64_t mrs_obj, dmu_tx_t *tx)
343 dsl_deadlist_entry_t *dle;
346 newobj = dsl_deadlist_alloc(dl->dl_os, tx);
/* Old-format source: rebuild keys by walking snapshots. */
349 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
353 mutex_enter(&dl->dl_lock);
354 dsl_deadlist_load_tree(dl);
356 for (dle = avl_first(&dl->dl_tree); dle;
357 dle = AVL_NEXT(&dl->dl_tree, dle)) {
/* Only keys strictly below maxtxg are cloned. */
360 if (dle->dle_mintxg >= maxtxg)
363 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
364 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
365 dle->dle_mintxg, obj, tx));
367 mutex_exit(&dl->dl_lock);
/*
 * Report the deadlist's total used/compressed/uncompressed space.
 * Old format: computed from the bpobj.  New format: read from the
 * cached dl_phys totals under dl_lock (see the locking comment at the
 * top of this file).
 * NOTE(review): the old-format guard before the bpobj_space() call is
 * not visible in this view -- confirm.
 */
372 dsl_deadlist_space(dsl_deadlist_t *dl,
373 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
375 ASSERT(dsl_deadlist_is_open(dl))
377 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
378 usedp, compp, uncompp));
/* dl_lock serializes against dsl_deadlist_insert() updating dl_phys. */
382 mutex_enter(&dl->dl_lock);
383 *usedp = dl->dl_phys->dl_used;
384 *compp = dl->dl_phys->dl_comp;
385 *uncompp = dl->dl_phys->dl_uncomp;
386 mutex_exit(&dl->dl_lock);
390 * return space used in the range (mintxg, maxtxg].
391 * Includes maxtxg, does not include mintxg.
392 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
393 * larger than any bp in the deadlist (eg. UINT64_MAX)).
/*
 * Sum used/comp/uncomp for blocks in the range (mintxg, maxtxg]; per the
 * comment above, both bounds must be existing keys (unless maxtxg exceeds
 * every bp, e.g. UINT64_MAX).  Old format delegates to bpobj_space_range;
 * new format walks the per-key bpobjs from mintxg up to (not including)
 * maxtxg and accumulates each one's space.
 */
396 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
397 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
399 dsl_deadlist_entry_t *dle;
400 dsl_deadlist_entry_t dle_tofind;
404 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
405 mintxg, maxtxg, usedp, compp, uncompp));
409 *usedp = *compp = *uncompp = 0;
411 mutex_enter(&dl->dl_lock);
412 dsl_deadlist_load_tree(dl);
413 dle_tofind.dle_mintxg = mintxg;
414 dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
416 * If we don't find this mintxg, there shouldn't be anything
/* (i.e. no entries after 'where' -- the range is empty). */
419 ASSERT(dle != NULL ||
420 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
/* Accumulate every per-key bpobj with mintxg <= key < maxtxg. */
422 for (; dle && dle->dle_mintxg < maxtxg;
423 dle = AVL_NEXT(&dl->dl_tree, dle)) {
424 uint64_t used, comp, uncomp;
426 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
427 &used, &comp, &uncomp));
433 mutex_exit(&dl->dl_lock);
/*
 * Merge an entire bpobj 'obj' (whose blocks all died at txg 'birth')
 * into this deadlist: add its space to the cached dl_phys totals and
 * attach it as a sub-bpobj of the entry covering 'birth'.  Caller must
 * hold dl_lock.
 * NOTE(review): the bpobj_close(&bpo) after the space query is not
 * visible in this view -- confirm the hold is released.
 */
437 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
440 dsl_deadlist_entry_t dle_tofind;
441 dsl_deadlist_entry_t *dle;
443 uint64_t used, comp, uncomp;
446 ASSERT(MUTEX_HELD(&dl->dl_lock));
/* Open obj just long enough to read its space accounting. */
448 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
449 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
452 dsl_deadlist_load_tree(dl);
454 dmu_buf_will_dirty(dl->dl_dbuf, tx);
455 dl->dl_phys->dl_used += used;
456 dl->dl_phys->dl_comp += comp;
457 dl->dl_phys->dl_uncomp += uncomp;
/* Route to the entry at or nearest before 'birth', as in insert(). */
459 dle_tofind.dle_mintxg = birth;
460 dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
462 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
463 dle_enqueue_subobj(dl, dle, obj, tx);
/* bpobj_iterate() callback: forward each bp into the deadlist in 'arg'. */
467 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
469 dsl_deadlist_t *dl = arg;
470 dsl_deadlist_insert(dl, bp, tx);
475 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
/*
 * Merge deadlist object 'obj' into 'dl' (obj is left empty, per the
 * comment above).  An old-format obj (plain bpobj) is merged bp-by-bp
 * via dsl_deadlist_insert_cb; a new-format obj has each mintxg->bpobj
 * entry moved over with dsl_deadlist_insert_bpobj, after which obj's
 * ZAP entries are removed and its phys totals are zeroed.
 */
479 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
484 dsl_deadlist_phys_t *dlp;
485 dmu_object_info_t doi;
487 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
488 if (doi.doi_type == DMU_OT_BPOBJ) {
/* Old format: replay every bp through the normal insert path. */
490 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
491 VERIFY3U(0, ==, bpobj_iterate(&bpo,
492 dsl_deadlist_insert_cb, dl, tx));
497 mutex_enter(&dl->dl_lock);
498 for (zap_cursor_init(&zc, dl->dl_os, obj);
499 zap_cursor_retrieve(&zc, &za) == 0;
500 zap_cursor_advance(&zc)) {
501 uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
502 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
/* The sub-bpobj now belongs to dl; drop obj's mapping to it. */
503 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
505 zap_cursor_fini(&zc);
/* Zero obj's cached space totals: it no longer owns any blocks. */
507 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
508 dlp = bonus->db_data;
509 dmu_buf_will_dirty(bonus, tx);
510 bzero(dlp, sizeof (*dlp));
511 dmu_buf_rele(bonus, FTAG);
512 mutex_exit(&dl->dl_lock);
516 * Remove entries on dl that are >= mintxg, and put them on the bpobj.
/*
 * Move all entries with key >= mintxg out of this deadlist and into the
 * given bpobj (per the comment above): nest each entry's bpobj under
 * 'bpo', subtract its space from the dl_phys totals, and delete the
 * entry from both the ZAP and the in-core tree.  New-format only.
 * NOTE(review): the loop header iterating dle via dle_next, and the
 * closing brace, fall outside this view -- confirm against the full file.
 */
519 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
522 dsl_deadlist_entry_t dle_tofind;
523 dsl_deadlist_entry_t *dle;
526 ASSERT(!dl->dl_oldfmt);
528 mutex_enter(&dl->dl_lock);
529 dmu_buf_will_dirty(dl->dl_dbuf, tx);
530 dsl_deadlist_load_tree(dl);
/* Start at the exact key, or the nearest entry after it. */
532 dle_tofind.dle_mintxg = mintxg;
533 dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
535 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
537 uint64_t used, comp, uncomp;
538 dsl_deadlist_entry_t *dle_next;
/* Hand the entry's whole bpobj over to the destination. */
540 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
542 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
543 &used, &comp, &uncomp));
/* The moved space must not underflow the cached totals. */
544 ASSERT3U(dl->dl_phys->dl_used, >=, used);
545 ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
546 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
547 dl->dl_phys->dl_used -= used;
548 dl->dl_phys->dl_comp -= comp;
549 dl->dl_phys->dl_uncomp -= uncomp;
551 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
552 dle->dle_mintxg, tx));
/* Capture the successor before removing and freeing this node. */
554 dle_next = AVL_NEXT(&dl->dl_tree, dle);
555 avl_remove(&dl->dl_tree, dle);
556 bpobj_close(&dle->dle_bpobj);
557 kmem_free(dle, sizeof (*dle));
560 mutex_exit(&dl->dl_lock);