/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        spa_t *spa = dmu_objset_spa(os);
        dsl_pool_t *dp = dmu_objset_pool(os);

        if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
                            bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
                            &dp->dp_empty_bpobj, tx) == 0);
                }
                spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
                ASSERT(dp->dp_empty_bpobj != 0);
                return (dp->dp_empty_bpobj);
        } else {
                return (bpobj_alloc(os, blocksize, tx));
        }
}
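
/*
 * Illustrative sketch (not part of the original file): a caller in syncing
 * context that needs a bpobj which may stay empty can take the shared dummy
 * object, then release it with bpobj_decr_empty() rather than freeing it.
 * The variable name "obj" and the surrounding sync task are assumptions:
 *
 *     uint64_t obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
 *     ...
 *     if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
 *             bpobj_decr_empty(os, tx);
 */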

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dmu_objset_pool(os);

        spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
        if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_EMPTY_BPOBJ)) {
                VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_EMPTY_BPOBJ, tx));
                VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
                dp->dp_empty_bpobj = 0;
        }
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        int size;

        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
                size = BPOBJ_SIZE_V0;
        else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
                size = BPOBJ_SIZE_V1;
        else if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_LIVELIST))
                size = BPOBJ_SIZE_V2;
        else
                size = sizeof (bpobj_phys_t);

        return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
            DMU_OT_BPOBJ_HDR, size, tx));
}

void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
        int64_t i;
        bpobj_t bpo;
        dmu_object_info_t doi;
        int epb;
        dmu_buf_t *dbuf = NULL;

        ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
        VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

        mutex_enter(&bpo.bpo_lock);

        if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
                goto out;

        VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
        epb = doi.doi_data_block_size / sizeof (uint64_t);

        for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
                uint64_t *objarray;
                uint64_t offset, blkoff;

                offset = i * sizeof (uint64_t);
                blkoff = P2PHASE(i, epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        VERIFY3U(0, ==, dmu_buf_hold(os,
                            bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                objarray = dbuf->db_data;
                bpobj_free(os, objarray[blkoff], tx);
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
        mutex_exit(&bpo.bpo_lock);
        bpobj_close(&bpo);

        VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
        dmu_object_info_t doi;
        int err;

        err = dmu_object_info(os, object, &doi);
        if (err)
                return (err);

        bzero(bpo, sizeof (*bpo));
        mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

        ASSERT(bpo->bpo_dbuf == NULL);
        ASSERT(bpo->bpo_phys == NULL);
        ASSERT(object != 0);
        ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
        ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

        err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
        if (err)
                return (err);

        bpo->bpo_os = os;
        bpo->bpo_object = object;
        bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
        bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
        bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
        bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
        bpo->bpo_phys = bpo->bpo_dbuf->db_data;
        return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
        return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
        /* Lame workaround for closing a bpobj that was never opened. */
        if (bpo->bpo_object == 0)
                return;

        dmu_buf_rele(bpo->bpo_dbuf, bpo);
        if (bpo->bpo_cached_dbuf != NULL)
                dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
        bpo->bpo_dbuf = NULL;
        bpo->bpo_phys = NULL;
        bpo->bpo_cached_dbuf = NULL;
        bpo->bpo_object = 0;

        mutex_destroy(&bpo->bpo_lock);
}
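
/*
 * Illustrative sketch (not part of the original file): the usual
 * open/use/close lifecycle for a bpobj handle on the stack.  Error
 * handling is elided, and "obj" is assumed to name an existing bpobj:
 *
 *     bpobj_t bpo;
 *     VERIFY0(bpobj_open(&bpo, os, obj));
 *     ... use bpo (e.g. bpobj_space(&bpo, ...)) ...
 *     bpobj_close(&bpo);
 */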

static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
        ASSERT(MUTEX_HELD(&bpo->bpo_lock));
        return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
            (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
        mutex_enter(&bpo->bpo_lock);
        boolean_t is_empty = bpobj_is_empty_impl(bpo);
        mutex_exit(&bpo->bpo_lock);
        return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here, but we run the
 * risk of overflowing function stack space.  Instead, find each subobj and
 * add it to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
        bpobj_t *bpi_bpo;
        /*
         * This object is a subobj of bpi_parent,
         * at bpi_index in its subobj array.
         */
        struct bpobj_info *bpi_parent;
        uint64_t bpi_index;
        /* How many of our subobjs are left to process. */
        uint64_t bpi_unprocessed_subobjs;
        /* True after having visited this bpo's directly referenced BPs. */
        boolean_t bpi_visited;
        list_node_t bpi_node;
} bpobj_info_t;
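
/*
 * Illustrative sketch (not part of the original file): the explicit-stack
 * pattern used by bpobj_iterate_impl() below, in miniature.  A LIFO list_t
 * stands in for the call stack, so nesting depth is bounded by memory
 * rather than by kernel stack space ("root_bpo" is a placeholder name):
 *
 *     list_t stack;
 *     bpobj_info_t *bpi;
 *     list_create(&stack, sizeof (bpobj_info_t),
 *         offsetof(bpobj_info_t, bpi_node));
 *     list_insert_head(&stack, bpi_alloc(root_bpo, NULL, 0));
 *     while ((bpi = list_head(&stack)) != NULL) {
 *             if (bpi->bpi_unprocessed_subobjs == 0) {
 *                     list_remove_head(&stack);
 *                     kmem_free(bpi, sizeof (bpobj_info_t));
 *             } else {
 *                     ... push the deepest unprocessed subobj ...
 *             }
 *     }
 *     list_destroy(&stack);
 */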

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
        bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
        bpi->bpi_bpo = bpo;
        bpi->bpi_parent = parent;
        bpi->bpi_index = index;
        if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
                bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
        }
        return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
        for (; bpi != NULL; bpi = bpi->bpi_parent) {
                bpobj_t *p = bpi->bpi_bpo;
                ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
                p->bpo_phys->bpo_bytes -= freed;
                ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
                if (p->bpo_havecomp) {
                        p->bpo_phys->bpo_comp -= comp_freed;
                        p->bpo_phys->bpo_uncomp -= uncomp_freed;
                }
        }
}

static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
        int err = 0;
        int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
        dmu_buf_t *dbuf = NULL;
        bpobj_t *bpo = bpi->bpi_bpo;

        for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
                uint64_t offset = i * sizeof (blkptr_t);
                uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                            offset, FTAG, &dbuf, 0);
                        if (err)
                                break;
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                blkptr_t *bparray = dbuf->db_data;
                blkptr_t *bp = &bparray[blkoff];

                boolean_t bp_freed = BP_GET_FREE(bp);
                err = func(arg, bp, bp_freed, tx);
                if (err)
                        break;

                if (free) {
                        int sign = bp_freed ? -1 : +1;
                        spa_t *spa = dmu_objset_spa(bpo->bpo_os);
                        freed += sign * bp_get_dsize_sync(spa, bp);
                        comp_freed += sign * BP_GET_PSIZE(bp);
                        uncomp_freed += sign * BP_GET_UCSIZE(bp);
                        ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
                        bpo->bpo_phys->bpo_num_blkptrs--;
                        ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
                        if (bp_freed) {
                                ASSERT(bpo->bpo_havefreed);
                                bpo->bpo_phys->bpo_num_freed--;
                                ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
                        }
                }
        }
        if (free) {
                propagate_space_reduction(bpi, freed, comp_freed,
                    uncomp_freed, tx);
                VERIFY0(dmu_free_range(bpo->bpo_os,
                    bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    DMU_OBJECT_END, tx));
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        return (err);
}

/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
        list_t stack;
        bpobj_info_t *bpi;
        int err = 0;

        /*
         * Create a "stack" for us to work with without worrying about
         * stack overflows. Initialize it with the initial_bpo.
         */
        list_create(&stack, sizeof (bpobj_info_t),
            offsetof(bpobj_info_t, bpi_node));
        mutex_enter(&initial_bpo->bpo_lock);

        if (bpobj_size != NULL)
                *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

        list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

        while ((bpi = list_head(&stack)) != NULL) {
                bpobj_t *bpo = bpi->bpi_bpo;

                ASSERT3P(bpo, !=, NULL);
                ASSERT(MUTEX_HELD(&bpo->bpo_lock));
                ASSERT(bpobj_is_open(bpo));

                if (free)
                        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

                if (bpi->bpi_visited == B_FALSE) {
                        err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
                            free);
                        bpi->bpi_visited = B_TRUE;
                        if (err != 0)
                                break;
                }
                /*
                 * We've finished with this bpo's directly-referenced BP's and
                 * it has no more unprocessed subobjs. We can free its
                 * bpobj_info_t (unless it is the topmost, initial_bpo).
                 * If we are freeing from disk, we can also do that.
                 */
                if (bpi->bpi_unprocessed_subobjs == 0) {
                        /*
                         * If there are no entries, there should
                         * be no bytes.
                         */
                        if (bpobj_is_empty_impl(bpo)) {
                                ASSERT0(bpo->bpo_phys->bpo_bytes);
                                ASSERT0(bpo->bpo_phys->bpo_comp);
                                ASSERT0(bpo->bpo_phys->bpo_uncomp);
                        }

                        /* The initial_bpo has no parent and is not closed. */
                        if (bpi->bpi_parent != NULL) {
                                if (free) {
                                        bpobj_t *p = bpi->bpi_parent->bpi_bpo;

                                        ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
                                        ASSERT3U(p->bpo_phys->bpo_num_subobjs,
                                            >, 0);
                                        ASSERT3U(bpi->bpi_index, ==,
                                            p->bpo_phys->bpo_num_subobjs - 1);
                                        ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
                                            tx));

                                        p->bpo_phys->bpo_num_subobjs--;

                                        VERIFY0(dmu_free_range(p->bpo_os,
                                            p->bpo_phys->bpo_subobjs,
                                            bpi->bpi_index * sizeof (uint64_t),
                                            sizeof (uint64_t), tx));

                                        /* eliminate the empty subobj list */
                                        if (bpo->bpo_havesubobj &&
                                            bpo->bpo_phys->bpo_subobjs != 0) {
                                                ASSERT0(bpo->bpo_phys->
                                                    bpo_num_subobjs);
                                                err = dmu_object_free(
                                                    bpo->bpo_os,
                                                    bpo->bpo_phys->bpo_subobjs,
                                                    tx);
                                                if (err)
                                                        break;
                                                bpo->bpo_phys->bpo_subobjs = 0;
                                        }
                                        err = dmu_object_free(p->bpo_os,
                                            bpo->bpo_object, tx);
                                        if (err)
                                                break;
                                }

                                mutex_exit(&bpo->bpo_lock);
                                bpobj_close(bpo);
                                kmem_free(bpo, sizeof (bpobj_t));
                        } else {
                                mutex_exit(&bpo->bpo_lock);
                        }

                        /*
                         * Finished processing this bpo. Unlock, and free
                         * our "stack" info.
                         */
                        list_remove_head(&stack);
                        kmem_free(bpi, sizeof (bpobj_info_t));
                } else {
                        /*
                         * We have unprocessed subobjs. Process the next one.
                         */
                        ASSERT(bpo->bpo_havecomp);
                        ASSERT3P(bpobj_size, ==, NULL);

                        /* Add the last subobj to stack. */
                        int64_t i = bpi->bpi_unprocessed_subobjs - 1;
                        uint64_t offset = i * sizeof (uint64_t);

                        uint64_t obj_from_sublist;
                        err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                            offset, sizeof (uint64_t), &obj_from_sublist,
                            DMU_READ_PREFETCH);
                        if (err)
                                break;
                        bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
                            KM_SLEEP);

                        err = bpobj_open(sublist, bpo->bpo_os,
                            obj_from_sublist);
                        if (err)
                                break;

                        list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
                        mutex_enter(&sublist->bpo_lock);
                        bpi->bpi_unprocessed_subobjs--;
                }
        }
        /*
         * Cleanup anything left on the "stack" after we left the loop.
         * Every bpo on the stack is locked so we must remember to undo
         * that now (in LIFO order).
         */
        while ((bpi = list_remove_head(&stack)) != NULL) {
                bpobj_t *bpo = bpi->bpi_bpo;
                ASSERT(err != 0);
                ASSERT3P(bpo, !=, NULL);

                mutex_exit(&bpo->bpo_lock);

                /* do not free the initial_bpo */
                if (bpi->bpi_parent != NULL) {
                        bpobj_close(bpi->bpi_bpo);
                        kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
                }
                kmem_free(bpi, sizeof (bpobj_info_t));
        }

        list_destroy(&stack);

        return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
        return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
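
/*
 * Illustrative sketch (not part of the original file): draining a bpobj in
 * syncing context.  The callback name "free_blkptr_cb" is a placeholder; a
 * real caller supplies its own bpobj_itor_t that disposes of each BP:
 *
 *     static int
 *     free_blkptr_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 *         dmu_tx_t *tx)
 *     {
 *             ... dispose of bp here ...
 *             return (0);
 *     }
 *
 *     VERIFY0(bpobj_iterate(bpo, free_blkptr_cb, NULL, tx));
 */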

/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
        return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
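
/*
 * Illustrative sketch (not part of the original file): a read-only walk
 * that also records how many BPs the bpobj held when the scan began, so a
 * later livelist_bpobj_iterate_from_nofree() can resume from that point.
 * "count_cb" and "ctx" are placeholder names:
 *
 *     uint64_t size;
 *     int err = bpobj_iterate_nofree(bpo, count_cb, &ctx, &size);
 */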

/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
        if (bpo->bpo_havesubobj)
                VERIFY0(bpo->bpo_phys->bpo_subobjs);
        bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
        int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
        kmem_free(bpi, sizeof (bpobj_info_t));
        return (err);
}
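
/*
 * Illustrative sketch (not part of the original file): combining the two
 * nofree iterators to pick up entries appended after an initial scan of a
 * subobj-free livelist bpobj, per the comment above bpobj_iterate_nofree().
 * "cb" and "ctx" are placeholder names:
 *
 *     uint64_t size;
 *     VERIFY0(bpobj_iterate_nofree(bpo, cb, &ctx, &size));
 *     ... the bpobj may grow here ...
 *     VERIFY0(livelist_bpobj_iterate_from_nofree(bpo, cb, &ctx,
 *         (int64_t)size));
 */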

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |              +--------------+  |
 * | sub-bpobj    |---------^------------> | subsubobj    |  ^
 * +----+----+----+         |              +-----+-----+--+  |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
        bpobj_t subbpo;
        uint64_t used, comp, uncomp, subsubobjs;
        boolean_t copy_subsub = B_TRUE;
        boolean_t copy_bps = B_TRUE;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(subobj != 0);
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
                bpobj_decr_empty(bpo->bpo_os, tx);
                return;
        }

        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

        if (bpobj_is_empty(&subbpo)) {
                /* No point in having an empty subobj. */
                bpobj_close(&subbpo);
                bpobj_free(bpo->bpo_os, subobj, tx);
                return;
        }

        mutex_enter(&bpo->bpo_lock);
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

        dmu_object_info_t doi;

        if (bpo->bpo_phys->bpo_subobjs != 0) {
                ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    &doi));
                ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
        }

        /*
         * If subobj has only one block of subobjs, then move subobj's
         * subobjs to bpo's subobj list directly.  This reduces recursion in
         * bpobj_iterate due to nested subobjs.
         */
        subsubobjs = subbpo.bpo_phys->bpo_subobjs;
        if (subsubobjs != 0) {
                VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
                if (doi.doi_max_offset > doi.doi_data_block_size) {
                        copy_subsub = B_FALSE;
                }
        }

        /*
         * If, in addition to having only one block of subobj's, subobj has
         * only one block of bp's, then move subobj's bp's to bpo's bp list
         * directly. This reduces recursion in bpobj_iterate due to nested
         * subobjs.
         */
        VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
        if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
                copy_bps = B_FALSE;
        }

        if (copy_subsub && subsubobjs != 0) {
                dmu_buf_t *subdb;
                uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

                VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
                    0, FTAG, &subdb, 0));
                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(subdb->db_size, >=,
                    numsubsub * sizeof (subobj));
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }
                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    numsubsub * sizeof (subobj), subdb->db_data, tx);
                dmu_buf_rele(subdb, FTAG);
                bpo->bpo_phys->bpo_num_subobjs += numsubsub;

                dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
                subbpo.bpo_phys->bpo_subobjs = 0;
                VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
        }

        if (copy_bps) {
                dmu_buf_t *bps;
                uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

                ASSERT(copy_subsub);
                VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
                    0, FTAG, &bps, 0));

                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
                dmu_write(bpo->bpo_os, bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    numbps * sizeof (blkptr_t),
                    bps->db_data, tx);
                dmu_buf_rele(bps, FTAG);
                bpo->bpo_phys->bpo_num_blkptrs += numbps;

                bpobj_close(&subbpo);
                VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
        } else {
                bpobj_close(&subbpo);
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }

                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    sizeof (subobj), &subobj, tx);
                bpo->bpo_phys->bpo_num_subobjs++;
        }

        bpo->bpo_phys->bpo_bytes += used;
        bpo->bpo_phys->bpo_comp += comp;
        bpo->bpo_phys->bpo_uncomp += uncomp;
        mutex_exit(&bpo->bpo_lock);
}
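
/*
 * Illustrative sketch (not part of the original file): merging one bpobj
 * into a pool-wide one during a sync pass, as deadlist-style consumers do.
 * "ds_obj" is a placeholder for an existing non-empty bpobj:
 *
 *     bpobj_enqueue_subobj(&dp->dp_free_bpobj, ds_obj, tx);
 *
 * After the call, ds_obj's space is accounted for in the parent and, in the
 * small-subobj cases diagrammed above, ds_obj itself may already have been
 * destroyed, so the caller must not touch it again.
 */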

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        blkptr_t stored_bp = *bp;
        uint64_t offset;
        int blkoff;
        blkptr_t *bparray;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (BP_IS_EMBEDDED(bp)) {
                /*
                 * The bpobj will compress better without the payload.
                 *
                 * Note that we store EMBEDDED bp's because they have an
                 * uncompressed size, which must be accounted for.  An
                 * alternative would be to add their size to bpo_uncomp
                 * without storing the bp, but that would create additional
                 * complications: bpo_uncomp would be inconsistent with the
                 * set of BP's stored, and bpobj_iterate() wouldn't visit
                 * all the space accounted for in the bpobj.
                 */
                bzero(&stored_bp, sizeof (stored_bp));
                stored_bp.blk_prop = bp->blk_prop;
                stored_bp.blk_birth = bp->blk_birth;
        } else if (!BP_GET_DEDUP(bp)) {
                /* The bpobj will compress better without the checksum */
                bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
        }

        stored_bp.blk_fill = 0;
        BP_SET_FREE(&stored_bp, bp_freed);

        mutex_enter(&bpo->bpo_lock);

        offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
        blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

        if (bpo->bpo_cached_dbuf == NULL ||
            offset < bpo->bpo_cached_dbuf->db_offset ||
            offset >= bpo->bpo_cached_dbuf->db_offset +
            bpo->bpo_cached_dbuf->db_size) {
                if (bpo->bpo_cached_dbuf)
                        dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
                VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                    offset, bpo, &bpo->bpo_cached_dbuf, 0));
        }

        dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
        bparray = bpo->bpo_cached_dbuf->db_data;
        bparray[blkoff] = stored_bp;

        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        bpo->bpo_phys->bpo_num_blkptrs++;
        int sign = bp_freed ? -1 : +1;
        bpo->bpo_phys->bpo_bytes += sign *
            bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
        if (bpo->bpo_havecomp) {
                bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
                bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
        }
        if (bp_freed) {
                ASSERT(bpo->bpo_havefreed);
                bpo->bpo_phys->bpo_num_freed++;
        }
        mutex_exit(&bpo->bpo_lock);
}
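
/*
 * Illustrative sketch (not part of the original file): appending a block
 * pointer to an open bpobj from syncing context.  "bp" is a placeholder
 * for a valid, non-hole block pointer:
 *
 *     bpobj_enqueue(bpo, bp, B_FALSE, tx);
 *
 * Passing bp_freed == B_TRUE instead records the entry as a FREE and
 * subtracts, rather than adds, its size in the space accounting (used by
 * livelists to cancel out earlier ALLOC entries).
 */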

struct space_range_arg {
        spa_t *spa;
        uint64_t mintxg;
        uint64_t maxtxg;
        uint64_t used;
        uint64_t comp;
        uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        (void) bp_freed, (void) tx;
        struct space_range_arg *sra = arg;

        if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
                if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
                        sra->used += bp_get_dsize_sync(sra->spa, bp);
                else
                        sra->used += bp_get_dsize(sra->spa, bp);
                sra->comp += BP_GET_PSIZE(bp);
                sra->uncomp += BP_GET_UCSIZE(bp);
        }
        return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        ASSERT(bpobj_is_open(bpo));
        mutex_enter(&bpo->bpo_lock);

        *usedp = bpo->bpo_phys->bpo_bytes;
        if (bpo->bpo_havecomp) {
                *compp = bpo->bpo_phys->bpo_comp;
                *uncompp = bpo->bpo_phys->bpo_uncomp;
                mutex_exit(&bpo->bpo_lock);
                return (0);
        } else {
                mutex_exit(&bpo->bpo_lock);
                return (bpobj_space_range(bpo, 0, UINT64_MAX,
                    usedp, compp, uncompp));
        }
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        struct space_range_arg sra = { 0 };
        int err;

        ASSERT(bpobj_is_open(bpo));

        /*
         * As an optimization, if they want the whole txg range, just
         * get bpo_bytes rather than iterating over the bps.
         */
        if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
                return (bpobj_space(bpo, usedp, compp, uncompp));

        sra.spa = dmu_objset_spa(bpo->bpo_os);
        sra.mintxg = mintxg;
        sra.maxtxg = maxtxg;

        err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
        *usedp = sra.used;
        *compp = sra.comp;
        *uncompp = sra.uncomp;
        return (err);
}
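
/*
 * Illustrative sketch (not part of the original file): querying how much
 * space in a bpobj was born in the half-open txg window (snap_txg, now].
 * "snap_txg" is a placeholder; deadlist code uses queries like this to
 * attribute space to txg ranges between snapshots:
 *
 *     uint64_t used, comp, uncomp;
 *     VERIFY0(bpobj_space_range(bpo, snap_txg, UINT64_MAX,
 *         &used, &comp, &uncomp));
 */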

/*
 * A bpobj_itor_t to append blkptrs to a bplist.  Note that while blkptrs in
 * a bpobj are designated as free or allocated, that information is not
 * preserved in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        (void) bp_freed, (void) tx;
        bplist_t *bpl = arg;
        bplist_append(bpl, bp);
        return (0);
}
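
/*
 * Illustrative sketch (not part of the original file): capturing a bpobj's
 * contents in an in-memory bplist without modifying the bpobj, using the
 * callback above:
 *
 *     bplist_t bpl;
 *     bplist_create(&bpl);
 *     VERIFY0(bpobj_iterate_nofree(bpo, bplist_append_cb, &bpl, NULL));
 *     ... consume the bplist ...
 *     bplist_destroy(&bpl);
 */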