/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}

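/*
 * Release one reference on the shared empty bpobj.  When the last
 * reference is dropped, remove the empty bpobj from the pool directory
 * and free it.
 */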
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}

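/*
 * Allocate a new bpobj object.  The bonus (header) size depends on the
 * pool version: V0 headers lack comp/uncomp accounting, and V1 headers
 * lack subobj support.
 */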
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}

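/*
 * Recursively free a bpobj: free each of its subobjs, then the subobj
 * array itself, and finally the bpobj object.  Must never be called on
 * the pool's shared empty bpobj.
 */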
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

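/*
 * Open an existing bpobj: take a hold on its bonus buffer and derive
 * the comp/uncomp and subobj capabilities from the bonus (header) size.
 */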
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	bzero(bpo, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}

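/*
 * Return B_TRUE if this bpobj_t has been opened (and not yet closed).
 */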
boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}

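/*
 * Close a bpobj: release the bonus and cached data buffer holds and
 * tear down the lock.  Safe to call on a bpobj that was never opened.
 */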
void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}

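/*
 * Return B_TRUE if the bpobj contains no block pointers and no subobjs.
 */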
boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

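/*
 * Walk every block pointer in the bpobj, and recursively in each of its
 * subobjs, invoking func on each one.  If free is set, each entry is
 * removed after it is visited and the space accounting (bytes, comp,
 * uncomp, entry counts) is updated; a nonzero return from func stops
 * the iteration early.
 */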
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
    boolean_t free)
{
	dmu_object_info_t doi;
	int epb;
	int64_t i;
	int err = 0;
	dmu_buf_t *dbuf = NULL;

	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	if (free)
		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
		blkptr_t *bparray;
		blkptr_t *bp;
		uint64_t offset, blkoff;

		offset = i * sizeof (blkptr_t);
		blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
			    FTAG, &dbuf, 0);
			if (err)
				break;
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		bparray = dbuf->db_data;
		bp = &bparray[blkoff];
		err = func(arg, bp, tx);
		if (err)
			break;
		if (free) {
			bpo->bpo_phys->bpo_bytes -=
			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
			if (bpo->bpo_havecomp) {
				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
			}
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
		}
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	if (free) {
		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
		    (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
	}
	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
		goto out;

	ASSERT(bpo->bpo_havecomp);
	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
	if (err) {
		mutex_exit(&bpo->bpo_lock);
		return (err);
	}
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;
		bpobj_t sublist;
		uint64_t used_before, comp_before, uncomp_before;
		uint64_t used_after, comp_after, uncomp_after;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
			if (err)
				break;
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
		if (err)
			break;
		if (free) {
			err = bpobj_space(&sublist,
			    &used_before, &comp_before, &uncomp_before);
			if (err != 0) {
				bpobj_close(&sublist);
				break;
			}
		}
		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
		if (free) {
			VERIFY3U(0, ==, bpobj_space(&sublist,
			    &used_after, &comp_after, &uncomp_after));
			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
			bpo->bpo_phys->bpo_uncomp -=
			    uncomp_before - uncomp_after;
		}

		bpobj_close(&sublist);
		if (err)
			break;
		if (free) {
			err = dmu_object_free(bpo->bpo_os,
			    objarray[blkoff], tx);
			if (err)
				break;
			bpo->bpo_phys->bpo_num_subobjs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
		}
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	if (free) {
		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
		    bpo->bpo_phys->bpo_subobjs,
		    (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
	}

out:
	/* If there are no entries, there should be no bytes. */
	if (bpobj_is_empty(bpo)) {
		ASSERT0(bpo->bpo_phys->bpo_bytes);
		ASSERT0(bpo->bpo_phys->bpo_comp);
		ASSERT0(bpo->bpo_phys->bpo_uncomp);
	}

	mutex_exit(&bpo->bpo_lock);
	return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
}

/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
}

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |              +--------------+  |
 * | sub-bpobj    |---------^------------> | subsubobj    |  ^
 * +----+----+----+         |              +-----+-----+--+  |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.  This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly.  This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		ASSERT(copy_subsub);
		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));

		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);
}

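/*
 * Append a single block pointer to the bpobj.  The copy stored on disk
 * is stripped of fields that are not needed (the fill count; for
 * non-dedup bp's the checksum; and for embedded bp's everything except
 * blk_prop and blk_birth) so that the bpobj compresses better.
 */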
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for.  An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		bzero(&stored_bp, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		stored_bp.blk_birth = bp->blk_birth;
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
	}

	/* We never need the fill count. */
	stored_bp.blk_fill = 0;

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	bpo->bpo_phys->bpo_bytes +=
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
	}
	mutex_exit(&bpo->bpo_lock);
}

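/*
 * Callback state for bpobj_space_range(): accumulates the used,
 * compressed, and uncompressed space of all bp's whose birth txg falls
 * in (mintxg, maxtxg].
 */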
struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct space_range_arg *sra = arg;

	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}

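/*
 * Return the total used, compressed, and uncompressed space in the
 * bpobj.  Old (V0) bpobjs lack comp/uncomp accounting, so fall back to
 * iterating over every bp via bpobj_space_range().
 */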
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}