/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        spa_t *spa = dmu_objset_spa(os);
        dsl_pool_t *dp = dmu_objset_pool(os);

        if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
                            bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
                            &dp->dp_empty_bpobj, tx) == 0);
                }
                spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
                ASSERT(dp->dp_empty_bpobj != 0);
                return (dp->dp_empty_bpobj);
        } else {
                return (bpobj_alloc(os, blocksize, tx));
        }
}

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dmu_objset_pool(os);

        spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
        if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_EMPTY_BPOBJ)) {
                VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_EMPTY_BPOBJ, tx));
                VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
                dp->dp_empty_bpobj = 0;
        }
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        int size;

        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
                size = BPOBJ_SIZE_V0;
        else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
                size = BPOBJ_SIZE_V1;
        else if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_LIVELIST))
                size = BPOBJ_SIZE_V2;
        else
                size = sizeof (bpobj_phys_t);

        return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
            DMU_OT_BPOBJ_HDR, size, tx));
}

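/*
 * Illustrative sketch (not part of the original file): the typical
 * lifecycle of a bpobj object number inside an assigned transaction.
 * "os" and "tx" are assumed to come from the caller's syncing context.
 *
 *	uint64_t obj = bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
 *	...				(record obj on disk, enqueue BPs)
 *	bpobj_free(os, obj, tx);	(destroys obj and any subobj tree)
 */
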
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
        int64_t i;
        bpobj_t bpo;
        dmu_object_info_t doi;
        int epb;
        dmu_buf_t *dbuf = NULL;

        ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
        VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

        mutex_enter(&bpo.bpo_lock);

        if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
                goto out;

        VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
        epb = doi.doi_data_block_size / sizeof (uint64_t);

        for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
                uint64_t *objarray;
                uint64_t offset, blkoff;

                offset = i * sizeof (uint64_t);
                blkoff = P2PHASE(i, epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        VERIFY3U(0, ==, dmu_buf_hold(os,
                            bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                objarray = dbuf->db_data;
                bpobj_free(os, objarray[blkoff], tx);
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
        mutex_exit(&bpo.bpo_lock);
        bpobj_close(&bpo);

        VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
        dmu_object_info_t doi;
        int err;

        err = dmu_object_info(os, object, &doi);
        if (err)
                return (err);

        memset(bpo, 0, sizeof (*bpo));
        mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

        ASSERT(bpo->bpo_dbuf == NULL);
        ASSERT(bpo->bpo_phys == NULL);
        ASSERT(object != 0);
        ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
        ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

        err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
        if (err)
                return (err);

        bpo->bpo_os = os;
        bpo->bpo_object = object;
        bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
        bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
        bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
        bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
        bpo->bpo_phys = bpo->bpo_dbuf->db_data;
        return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
        return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
        /* Lame workaround for closing a bpobj that was never opened. */
        if (bpo->bpo_object == 0)
                return;

        dmu_buf_rele(bpo->bpo_dbuf, bpo);
        if (bpo->bpo_cached_dbuf != NULL)
                dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
        bpo->bpo_dbuf = NULL;
        bpo->bpo_phys = NULL;
        bpo->bpo_cached_dbuf = NULL;
        bpo->bpo_object = 0;

        mutex_destroy(&bpo->bpo_lock);
}

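/*
 * Illustrative sketch (not part of the original file): open/close pairing.
 * A bpobj_t is a caller-allocated handle; bpobj_open() fills it in from the
 * on-disk object and bpobj_close() releases the held bonus dbuf.
 *
 *	bpobj_t bpo;
 *	uint64_t used, comp, uncomp;
 *	int err = bpobj_open(&bpo, os, obj);
 *	if (err == 0) {
 *		err = bpobj_space(&bpo, &used, &comp, &uncomp);
 *		bpobj_close(&bpo);
 *	}
 */
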
static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
        ASSERT(MUTEX_HELD(&bpo->bpo_lock));
        return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
            (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
        mutex_enter(&bpo->bpo_lock);
        boolean_t is_empty = bpobj_is_empty_impl(bpo);
        mutex_exit(&bpo->bpo_lock);
        return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
        bpobj_t *bpi_bpo;
        /*
         * This object is a subobj of bpi_parent,
         * at bpi_index in its subobj array.
         */
        struct bpobj_info *bpi_parent;
        uint64_t bpi_index;
        /* How many of our subobj's are left to process. */
        uint64_t bpi_unprocessed_subobjs;
        /* True after having visited this bpo's directly referenced BPs. */
        boolean_t bpi_visited;
        list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
        bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
        bpi->bpi_bpo = bpo;
        bpi->bpi_parent = parent;
        bpi->bpi_index = index;
        if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
                bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
        }
        return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
        for (; bpi != NULL; bpi = bpi->bpi_parent) {
                bpobj_t *p = bpi->bpi_bpo;
                ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
                p->bpo_phys->bpo_bytes -= freed;
                ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
                if (p->bpo_havecomp) {
                        p->bpo_phys->bpo_comp -= comp_freed;
                        p->bpo_phys->bpo_uncomp -= uncomp_freed;
                }
        }
}

static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
        int err = 0;
        int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
        dmu_buf_t *dbuf = NULL;
        bpobj_t *bpo = bpi->bpi_bpo;

        int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
        uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
            sizeof (blkptr_t);
        uint64_t ps = start * sizeof (blkptr_t);
        uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
            ps);
        if (pe > pb) {
                dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
                    ZIO_PRIORITY_ASYNC_READ);
        }
        for (; i >= start; i--) {
                uint64_t offset = i * sizeof (blkptr_t);
                uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                            offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
                        if (err)
                                break;
                        pe = pb;
                        pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
                            dbuf->db_offset - dmu_prefetch_max : 0, ps);
                        if (pe > pb) {
                                dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
                                    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
                        }
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                blkptr_t *bparray = dbuf->db_data;
                blkptr_t *bp = &bparray[blkoff];

                boolean_t bp_freed = BP_GET_FREE(bp);
                err = func(arg, bp, bp_freed, tx);
                if (err)
                        break;

                if (free) {
                        int sign = bp_freed ? -1 : +1;
                        spa_t *spa = dmu_objset_spa(bpo->bpo_os);
                        freed += sign * bp_get_dsize_sync(spa, bp);
                        comp_freed += sign * BP_GET_PSIZE(bp);
                        uncomp_freed += sign * BP_GET_UCSIZE(bp);
                        ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
                        bpo->bpo_phys->bpo_num_blkptrs--;
                        ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
                        if (bp_freed) {
                                ASSERT(bpo->bpo_havefreed);
                                bpo->bpo_phys->bpo_num_freed--;
                                ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
                        }
                }
        }
        if (free) {
                propagate_space_reduction(bpi, freed, comp_freed,
                    uncomp_freed, tx);
                VERIFY0(dmu_free_range(bpo->bpo_os,
                    bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    DMU_OBJECT_END, tx));
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        return (err);
}

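/*
 * Illustrative sketch (not part of the original file): a minimal callback
 * of the bpobj_itor_t form that bpobj_iterate_blkptrs() invokes for each
 * entry.  This hypothetical callback just counts entries; returning
 * nonzero would stop the iteration early.
 *
 *	static int
 *	count_bps_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 *	    dmu_tx_t *tx)
 *	{
 *		(void) bp, (void) bp_freed, (void) tx;
 *		uint64_t *count = arg;
 *		(*count)++;
 *		return (0);
 *	}
 */
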
/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
        list_t stack;
        bpobj_info_t *bpi;
        int err = 0;

        /*
         * Create a "stack" for us to work with without worrying about
         * stack overflows. Initialize it with the initial_bpo.
         */
        list_create(&stack, sizeof (bpobj_info_t),
            offsetof(bpobj_info_t, bpi_node));
        mutex_enter(&initial_bpo->bpo_lock);

        if (bpobj_size != NULL)
                *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

        list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

        while ((bpi = list_head(&stack)) != NULL) {
                bpobj_t *bpo = bpi->bpi_bpo;

                ASSERT3P(bpo, !=, NULL);
                ASSERT(MUTEX_HELD(&bpo->bpo_lock));
                ASSERT(bpobj_is_open(bpo));

                if (free)
                        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

                if (bpi->bpi_visited == B_FALSE) {
                        err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
                            free);
                        bpi->bpi_visited = B_TRUE;
                        if (err != 0)
                                break;
                }
                /*
                 * We've finished with this bpo's directly-referenced BP's and
                 * it has no more unprocessed subobjs. We can free its
                 * bpobj_info_t (unless it is the topmost, initial_bpo).
                 * If we are freeing from disk, we can also do that.
                 */
                if (bpi->bpi_unprocessed_subobjs == 0) {
                        /*
                         * If there are no entries, there should
                         * be no bytes.
                         */
                        if (bpobj_is_empty_impl(bpo)) {
                                ASSERT0(bpo->bpo_phys->bpo_bytes);
                                ASSERT0(bpo->bpo_phys->bpo_comp);
                                ASSERT0(bpo->bpo_phys->bpo_uncomp);
                        }

                        /* The initial_bpo has no parent and is not closed. */
                        if (bpi->bpi_parent != NULL) {
                                if (free) {
                                        bpobj_t *p = bpi->bpi_parent->bpi_bpo;

                                        ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
                                        ASSERT3U(p->bpo_phys->bpo_num_subobjs,
                                            >, 0);
                                        ASSERT3U(bpi->bpi_index, ==,
                                            p->bpo_phys->bpo_num_subobjs - 1);
                                        ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
                                            tx));

                                        p->bpo_phys->bpo_num_subobjs--;

                                        VERIFY0(dmu_free_range(p->bpo_os,
                                            p->bpo_phys->bpo_subobjs,
                                            bpi->bpi_index * sizeof (uint64_t),
                                            sizeof (uint64_t), tx));

                                        /* eliminate the empty subobj list */
                                        if (bpo->bpo_havesubobj &&
                                            bpo->bpo_phys->bpo_subobjs != 0) {
                                                ASSERT0(bpo->bpo_phys->
                                                    bpo_num_subobjs);
                                                err = dmu_object_free(
                                                    bpo->bpo_os,
                                                    bpo->bpo_phys->bpo_subobjs,
                                                    tx);
                                                if (err)
                                                        break;
                                                bpo->bpo_phys->bpo_subobjs = 0;
                                        }
                                        err = dmu_object_free(p->bpo_os,
                                            bpo->bpo_object, tx);
                                        if (err)
                                                break;
                                }

                                mutex_exit(&bpo->bpo_lock);
                                bpobj_close(bpo);
                                kmem_free(bpo, sizeof (bpobj_t));
                        } else {
                                mutex_exit(&bpo->bpo_lock);
                        }

                        /*
                         * Finished processing this bpo. Unlock, and free
                         * our "stack" info.
                         */
                        list_remove_head(&stack);
                        kmem_free(bpi, sizeof (bpobj_info_t));
                } else {
                        /*
                         * We have unprocessed subobjs. Process the next one.
                         */
                        ASSERT(bpo->bpo_havecomp);
                        ASSERT3P(bpobj_size, ==, NULL);

                        /* Add the last subobj to stack. */
                        int64_t i = bpi->bpi_unprocessed_subobjs - 1;
                        uint64_t offset = i * sizeof (uint64_t);

                        uint64_t subobj;
                        err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                            offset, sizeof (uint64_t), &subobj,
                            DMU_READ_NO_PREFETCH);
                        if (err)
                                break;

                        bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
                            KM_SLEEP);
                        err = bpobj_open(subbpo, bpo->bpo_os, subobj);
                        if (err) {
                                kmem_free(subbpo, sizeof (bpobj_t));
                                break;
                        }

                        if (subbpo->bpo_havesubobj &&
                            subbpo->bpo_phys->bpo_subobjs != 0) {
                                dmu_prefetch(subbpo->bpo_os,
                                    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
                                    ZIO_PRIORITY_ASYNC_READ);
                        }

                        list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
                        mutex_enter(&subbpo->bpo_lock);
                        bpi->bpi_unprocessed_subobjs--;
                }
        }
        /*
         * Cleanup anything left on the "stack" after we left the loop.
         * Every bpo on the stack is locked so we must remember to undo
         * that now (in LIFO order).
         */
        while ((bpi = list_remove_head(&stack)) != NULL) {
                bpobj_t *bpo = bpi->bpi_bpo;
                ASSERT(err != 0);
                ASSERT3P(bpo, !=, NULL);

                mutex_exit(&bpo->bpo_lock);

                /* do not free the initial_bpo */
                if (bpi->bpi_parent != NULL) {
                        bpobj_close(bpi->bpi_bpo);
                        kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
                }
                kmem_free(bpi, sizeof (bpobj_info_t));
        }

        list_destroy(&stack);

        return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
        return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}

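/*
 * Illustrative sketch (not part of the original file): draining a bpobj in
 * syncing context, in the spirit of how the pool's free list is processed
 * elsewhere in the tree.  "free_cb" and "arg" are hypothetical.
 *
 *	VERIFY0(bpobj_iterate(&dp->dp_free_bpobj, free_cb, arg, tx));
 */
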
/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
        return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}

/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
        if (bpo->bpo_havesubobj)
                VERIFY0(bpo->bpo_phys->bpo_subobjs);
        bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
        int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
        kmem_free(bpi, sizeof (bpobj_info_t));
        return (err);
}

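/*
 * Illustrative sketch (not part of the original file): resuming iteration
 * after new entries were appended, using the size returned by
 * bpobj_iterate_nofree() as the starting index, per the comment above
 * bpobj_iterate_nofree().  "func" and "arg" are hypothetical.
 *
 *	uint64_t size;
 *	VERIFY0(bpobj_iterate_nofree(bpo, func, arg, &size));
 *	...	(concurrent bpobj_enqueue() calls append more entries)
 *	VERIFY0(livelist_bpobj_iterate_from_nofree(bpo, func, arg, size));
 */
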
/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |              +--------------+  |
 * | sub-bpobj    |---------^------------> | subsubobj    |  ^
 * +----+----+----+         |              +-----+-----+--+  |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
        bpobj_t subbpo;
        uint64_t used, comp, uncomp, subsubobjs;
        boolean_t copy_subsub = B_TRUE;
        boolean_t copy_bps = B_TRUE;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(subobj != 0);
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
                bpobj_decr_empty(bpo->bpo_os, tx);
                return;
        }

        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        if (bpobj_is_empty(&subbpo)) {
                /* No point in having an empty subobj. */
                bpobj_close(&subbpo);
                bpobj_free(bpo->bpo_os, subobj, tx);
                return;
        }
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

        mutex_enter(&bpo->bpo_lock);
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

        dmu_object_info_t doi;

        if (bpo->bpo_phys->bpo_subobjs != 0) {
                ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    &doi));
                ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
        }

        /*
         * If subobj has only one block of subobjs, then move subobj's
         * subobjs to bpo's subobj list directly.  This reduces recursion in
         * bpobj_iterate due to nested subobjs.
         */
        subsubobjs = subbpo.bpo_phys->bpo_subobjs;
        if (subsubobjs != 0) {
                VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
                if (doi.doi_max_offset > doi.doi_data_block_size) {
                        copy_subsub = B_FALSE;
                }
        }

        /*
         * If, in addition to having only one block of subobj's, subobj has
         * only one block of bp's, then move subobj's bp's to bpo's bp list
         * directly. This reduces recursion in bpobj_iterate due to nested
         * subobjs.
         */
        VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
        if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
                copy_bps = B_FALSE;
        }

        if (copy_subsub && subsubobjs != 0) {
                dmu_buf_t *subdb;
                uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

                VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
                    0, FTAG, &subdb, 0));
                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(subdb->db_size, >=,
                    numsubsub * sizeof (subobj));
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }
                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    numsubsub * sizeof (subobj), subdb->db_data, tx);
                dmu_buf_rele(subdb, FTAG);
                bpo->bpo_phys->bpo_num_subobjs += numsubsub;

                dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
                subbpo.bpo_phys->bpo_subobjs = 0;
                VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
        }

        if (copy_bps) {
                dmu_buf_t *bps;
                uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

                ASSERT(copy_subsub);
                VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
                    0, FTAG, &bps, 0));

                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
                dmu_write(bpo->bpo_os, bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    numbps * sizeof (blkptr_t),
                    bps->db_data, tx);
                dmu_buf_rele(bps, FTAG);
                bpo->bpo_phys->bpo_num_blkptrs += numbps;

                bpobj_close(&subbpo);
                VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
        } else {
                bpobj_close(&subbpo);
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }

                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    sizeof (subobj), &subobj, tx);
                bpo->bpo_phys->bpo_num_subobjs++;
        }

        bpo->bpo_phys->bpo_bytes += used;
        bpo->bpo_phys->bpo_comp += comp;
        bpo->bpo_phys->bpo_uncomp += uncomp;
        mutex_exit(&bpo->bpo_lock);
}

/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
        dmu_object_info_t doi;
        bpobj_t subbpo;
        uint64_t subsubobjs;
        boolean_t copy_subsub = B_TRUE;
        boolean_t copy_bps = B_TRUE;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(subobj != 0);

        if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
                return;

        if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
                return;
        if (bpobj_is_empty(&subbpo)) {
                bpobj_close(&subbpo);
                return;
        }
        subsubobjs = subbpo.bpo_phys->bpo_subobjs;
        bpobj_close(&subbpo);

        if (subsubobjs != 0) {
                if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
                        return;
                if (doi.doi_max_offset > doi.doi_data_block_size)
                        copy_subsub = B_FALSE;
        }

        if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
                return;
        if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
                copy_bps = B_FALSE;

        if (copy_subsub && subsubobjs != 0) {
                if (bpo->bpo_phys->bpo_subobjs) {
                        dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
                            bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
                            ZIO_PRIORITY_ASYNC_READ);
                }
                dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
                    ZIO_PRIORITY_ASYNC_READ);
        }

        if (copy_bps) {
                dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
                    ZIO_PRIORITY_ASYNC_READ);
                dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
                    ZIO_PRIORITY_ASYNC_READ);
        } else if (bpo->bpo_phys->bpo_subobjs) {
                dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
                    ZIO_PRIORITY_ASYNC_READ);
        }
}

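/*
 * Illustrative sketch (not part of the original file): issuing the prefetch
 * ahead of the enqueue so that the dmu_read()s and dmu_write()s performed
 * by bpobj_enqueue_subobj() find their metadata already cached.
 *
 *	bpobj_prefetch_subobj(bpo, subobj);
 *	...	(other work while the async reads complete)
 *	bpobj_enqueue_subobj(bpo, subobj, tx);
 */
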
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        blkptr_t stored_bp = *bp;
        uint64_t offset;
        int blkoff;
        blkptr_t *bparray;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (BP_IS_EMBEDDED(bp)) {
                /*
                 * The bpobj will compress better without the payload.
                 *
                 * Note that we store EMBEDDED bp's because they have an
                 * uncompressed size, which must be accounted for.  An
                 * alternative would be to add their size to bpo_uncomp
                 * without storing the bp, but that would create additional
                 * complications: bpo_uncomp would be inconsistent with the
                 * set of BP's stored, and bpobj_iterate() wouldn't visit
                 * all the space accounted for in the bpobj.
                 */
                memset(&stored_bp, 0, sizeof (stored_bp));
                stored_bp.blk_prop = bp->blk_prop;
                stored_bp.blk_birth = bp->blk_birth;
        } else if (!BP_GET_DEDUP(bp)) {
                /* The bpobj will compress better without the checksum */
                memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
        }

        stored_bp.blk_fill = 0;
        BP_SET_FREE(&stored_bp, bp_freed);

        mutex_enter(&bpo->bpo_lock);

        offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
        blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

        if (bpo->bpo_cached_dbuf == NULL ||
            offset < bpo->bpo_cached_dbuf->db_offset ||
            offset >= bpo->bpo_cached_dbuf->db_offset +
            bpo->bpo_cached_dbuf->db_size) {
                if (bpo->bpo_cached_dbuf)
                        dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
                VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                    offset, bpo, &bpo->bpo_cached_dbuf, 0));
                ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
        }

        dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
        bparray = bpo->bpo_cached_dbuf->db_data;
        bparray[blkoff] = stored_bp;

        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        bpo->bpo_phys->bpo_num_blkptrs++;
        int sign = bp_freed ? -1 : +1;
        bpo->bpo_phys->bpo_bytes += sign *
            bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
        if (bpo->bpo_havecomp) {
                bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
                bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
        }
        if (bp_freed) {
                ASSERT(bpo->bpo_havefreed);
                bpo->bpo_phys->bpo_num_freed++;
        }
        mutex_exit(&bpo->bpo_lock);
}

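/*
 * Illustrative sketch (not part of the original file): appending a single
 * block pointer.  Passing bp_freed = B_TRUE records the BP as a "free"
 * entry and subtracts its size from the space accounting, which requires a
 * bpobj new enough to have bpo_havefreed set.
 *
 *	bpobj_enqueue(bpo, bp, B_FALSE, tx);	(an allocated BP)
 *	bpobj_enqueue(bpo, bp, B_TRUE, tx);	(a freed BP)
 */
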
struct space_range_arg {
        spa_t *spa;
        uint64_t mintxg;
        uint64_t maxtxg;
        uint64_t used;
        uint64_t comp;
        uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        (void) bp_freed, (void) tx;
        struct space_range_arg *sra = arg;

        if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
                if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
                        sra->used += bp_get_dsize_sync(sra->spa, bp);
                else
                        sra->used += bp_get_dsize(sra->spa, bp);
                sra->comp += BP_GET_PSIZE(bp);
                sra->uncomp += BP_GET_UCSIZE(bp);
        }
        return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        ASSERT(bpobj_is_open(bpo));
        mutex_enter(&bpo->bpo_lock);

        *usedp = bpo->bpo_phys->bpo_bytes;
        if (bpo->bpo_havecomp) {
                *compp = bpo->bpo_phys->bpo_comp;
                *uncompp = bpo->bpo_phys->bpo_uncomp;
                mutex_exit(&bpo->bpo_lock);
                return (0);
        } else {
                mutex_exit(&bpo->bpo_lock);
                return (bpobj_space_range(bpo, 0, UINT64_MAX,
                    usedp, compp, uncompp));
        }
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        struct space_range_arg sra = { 0 };
        int err;

        ASSERT(bpobj_is_open(bpo));

        /*
         * As an optimization, if they want the whole txg range, just
         * get bpo_bytes rather than iterating over the bps.
         */
        if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
                return (bpobj_space(bpo, usedp, compp, uncompp));

        sra.spa = dmu_objset_spa(bpo->bpo_os);
        sra.mintxg = mintxg;
        sra.maxtxg = maxtxg;

        err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
        *usedp = sra.used;
        *compp = sra.comp;
        *uncompp = sra.uncomp;
        return (err);
}

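/*
 * Illustrative sketch (not part of the original file): querying the space
 * born in a txg window, e.g. strictly after one snapshot's birth txg and
 * up through another's ("txg1" and "txg2" are hypothetical).
 *
 *	uint64_t used, comp, uncomp;
 *	VERIFY0(bpobj_space_range(bpo, txg1, txg2, &used, &comp, &uncomp));
 */
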
/*
 * A bpobj_itor_t to append blkptrs to a bplist.  Note that while blkptrs in
 * a bpobj are designated as free or allocated, that information is not
 * preserved in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        (void) bp_freed, (void) tx;
        bplist_t *bpl = arg;
        bplist_append(bpl, bp);
        return (0);
}
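
/*
 * Illustrative sketch (not part of the original file): using
 * bplist_append_cb with bpobj_iterate_nofree() to copy a bpobj's entries
 * into an in-memory bplist without modifying the bpobj.
 *
 *	bplist_t bpl;
 *	bplist_create(&bpl);
 *	VERIFY0(bpobj_iterate_nofree(bpo, bplist_append_cb, &bpl, NULL));
 */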