]> CyberLeo.Net >> Repos - FreeBSD/stable/8.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
MFC r228103, r228104:
[FreeBSD/stable/8.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / bpobj.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 by Delphix. All rights reserved.
24  */
25
26 #include <sys/bpobj.h>
27 #include <sys/zfs_context.h>
28 #include <sys/refcount.h>
29 #include <sys/dsl_pool.h>
30
31 uint64_t
32 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
33 {
34         int size;
35
36         if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
37                 size = BPOBJ_SIZE_V0;
38         else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
39                 size = BPOBJ_SIZE_V1;
40         else
41                 size = sizeof (bpobj_phys_t);
42
43         return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
44             DMU_OT_BPOBJ_HDR, size, tx));
45 }
46
47 void
48 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
49 {
50         int64_t i;
51         bpobj_t bpo;
52         dmu_object_info_t doi;
53         int epb;
54         dmu_buf_t *dbuf = NULL;
55
56         VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
57
58         mutex_enter(&bpo.bpo_lock);
59
60         if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
61                 goto out;
62
63         VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
64         epb = doi.doi_data_block_size / sizeof (uint64_t);
65
66         for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
67                 uint64_t *objarray;
68                 uint64_t offset, blkoff;
69
70                 offset = i * sizeof (uint64_t);
71                 blkoff = P2PHASE(i, epb);
72
73                 if (dbuf == NULL || dbuf->db_offset > offset) {
74                         if (dbuf)
75                                 dmu_buf_rele(dbuf, FTAG);
76                         VERIFY3U(0, ==, dmu_buf_hold(os,
77                             bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
78                 }
79
80                 ASSERT3U(offset, >=, dbuf->db_offset);
81                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
82
83                 objarray = dbuf->db_data;
84                 bpobj_free(os, objarray[blkoff], tx);
85         }
86         if (dbuf) {
87                 dmu_buf_rele(dbuf, FTAG);
88                 dbuf = NULL;
89         }
90         VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
91
92 out:
93         mutex_exit(&bpo.bpo_lock);
94         bpobj_close(&bpo);
95
96         VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
97 }
98
99 int
100 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
101 {
102         dmu_object_info_t doi;
103         int err;
104
105         err = dmu_object_info(os, object, &doi);
106         if (err)
107                 return (err);
108
109         bzero(bpo, sizeof (*bpo));
110         mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
111
112         ASSERT(bpo->bpo_dbuf == NULL);
113         ASSERT(bpo->bpo_phys == NULL);
114         ASSERT(object != 0);
115         ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
116         ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
117
118         err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
119         if (err)
120                 return (err);
121
122         bpo->bpo_os = os;
123         bpo->bpo_object = object;
124         bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
125         bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
126         bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
127         bpo->bpo_phys = bpo->bpo_dbuf->db_data;
128         return (0);
129 }
130
131 void
132 bpobj_close(bpobj_t *bpo)
133 {
134         /* Lame workaround for closing a bpobj that was never opened. */
135         if (bpo->bpo_object == 0)
136                 return;
137
138         dmu_buf_rele(bpo->bpo_dbuf, bpo);
139         if (bpo->bpo_cached_dbuf != NULL)
140                 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
141         bpo->bpo_dbuf = NULL;
142         bpo->bpo_phys = NULL;
143         bpo->bpo_cached_dbuf = NULL;
144         bpo->bpo_object = 0;
145
146         mutex_destroy(&bpo->bpo_lock);
147 }
148
149 static int
150 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
151     boolean_t free)
152 {
153         dmu_object_info_t doi;
154         int epb;
155         int64_t i;
156         int err = 0;
157         dmu_buf_t *dbuf = NULL;
158
159         mutex_enter(&bpo->bpo_lock);
160
161         if (free)
162                 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
163
164         for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
165                 blkptr_t *bparray;
166                 blkptr_t *bp;
167                 uint64_t offset, blkoff;
168
169                 offset = i * sizeof (blkptr_t);
170                 blkoff = P2PHASE(i, bpo->bpo_epb);
171
172                 if (dbuf == NULL || dbuf->db_offset > offset) {
173                         if (dbuf)
174                                 dmu_buf_rele(dbuf, FTAG);
175                         err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
176                             FTAG, &dbuf, 0);
177                         if (err)
178                                 break;
179                 }
180
181                 ASSERT3U(offset, >=, dbuf->db_offset);
182                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
183
184                 bparray = dbuf->db_data;
185                 bp = &bparray[blkoff];
186                 err = func(arg, bp, tx);
187                 if (err)
188                         break;
189                 if (free) {
190                         bpo->bpo_phys->bpo_bytes -=
191                             bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
192                         ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
193                         if (bpo->bpo_havecomp) {
194                                 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
195                                 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
196                         }
197                         bpo->bpo_phys->bpo_num_blkptrs--;
198                         ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
199                 }
200         }
201         if (dbuf) {
202                 dmu_buf_rele(dbuf, FTAG);
203                 dbuf = NULL;
204         }
205         if (free) {
206                 i++;
207                 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
208                     i * sizeof (blkptr_t), -1ULL, tx));
209         }
210         if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
211                 goto out;
212
213         ASSERT(bpo->bpo_havecomp);
214         err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
215         if (err) {
216                 mutex_exit(&bpo->bpo_lock);
217                 return (err);
218         }
219         epb = doi.doi_data_block_size / sizeof (uint64_t);
220
221         for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
222                 uint64_t *objarray;
223                 uint64_t offset, blkoff;
224                 bpobj_t sublist;
225                 uint64_t used_before, comp_before, uncomp_before;
226                 uint64_t used_after, comp_after, uncomp_after;
227
228                 offset = i * sizeof (uint64_t);
229                 blkoff = P2PHASE(i, epb);
230
231                 if (dbuf == NULL || dbuf->db_offset > offset) {
232                         if (dbuf)
233                                 dmu_buf_rele(dbuf, FTAG);
234                         err = dmu_buf_hold(bpo->bpo_os,
235                             bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
236                         if (err)
237                                 break;
238                 }
239
240                 ASSERT3U(offset, >=, dbuf->db_offset);
241                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
242
243                 objarray = dbuf->db_data;
244                 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
245                 if (err)
246                         break;
247                 if (free) {
248                         err = bpobj_space(&sublist,
249                             &used_before, &comp_before, &uncomp_before);
250                         if (err)
251                                 break;
252                 }
253                 err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
254                 if (free) {
255                         VERIFY3U(0, ==, bpobj_space(&sublist,
256                             &used_after, &comp_after, &uncomp_after));
257                         bpo->bpo_phys->bpo_bytes -= used_before - used_after;
258                         ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
259                         bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
260                         bpo->bpo_phys->bpo_uncomp -=
261                             uncomp_before - uncomp_after;
262                 }
263
264                 bpobj_close(&sublist);
265                 if (err)
266                         break;
267                 if (free) {
268                         err = dmu_object_free(bpo->bpo_os,
269                             objarray[blkoff], tx);
270                         if (err)
271                                 break;
272                         bpo->bpo_phys->bpo_num_subobjs--;
273                         ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
274                 }
275         }
276         if (dbuf) {
277                 dmu_buf_rele(dbuf, FTAG);
278                 dbuf = NULL;
279         }
280         if (free) {
281                 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
282                     bpo->bpo_phys->bpo_subobjs,
283                     (i + 1) * sizeof (uint64_t), -1ULL, tx));
284         }
285
286 out:
287         /* If there are no entries, there should be no bytes. */
288         ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
289             (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
290             bpo->bpo_phys->bpo_bytes == 0);
291
292         mutex_exit(&bpo->bpo_lock);
293         return (err);
294 }
295
296 /*
297  * Iterate and remove the entries.  If func returns nonzero, iteration
298  * will stop and that entry will not be removed.
299  */
300 int
301 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
302 {
303         return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
304 }
305
306 /*
307  * Iterate the entries.  If func returns nonzero, iteration will stop.
308  */
309 int
310 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
311 {
312         return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
313 }
314
315 void
316 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
317 {
318         bpobj_t subbpo;
319         uint64_t used, comp, uncomp, subsubobjs;
320
321         ASSERT(bpo->bpo_havesubobj);
322         ASSERT(bpo->bpo_havecomp);
323
324         VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
325         VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
326
327         if (used == 0) {
328                 /* No point in having an empty subobj. */
329                 bpobj_close(&subbpo);
330                 bpobj_free(bpo->bpo_os, subobj, tx);
331                 return;
332         }
333
334         dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
335         if (bpo->bpo_phys->bpo_subobjs == 0) {
336                 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
337                     DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
338         }
339
340         mutex_enter(&bpo->bpo_lock);
341         dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
342             bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
343             sizeof (subobj), &subobj, tx);
344         bpo->bpo_phys->bpo_num_subobjs++;
345
346         /*
347          * If subobj has only one block of subobjs, then move subobj's
348          * subobjs to bpo's subobj list directly.  This reduces
349          * recursion in bpobj_iterate due to nested subobjs.
350          */
351         subsubobjs = subbpo.bpo_phys->bpo_subobjs;
352         if (subsubobjs != 0) {
353                 dmu_object_info_t doi;
354
355                 VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
356                 if (doi.doi_max_offset == doi.doi_data_block_size) {
357                         dmu_buf_t *subdb;
358                         uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
359
360                         VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
361                             0, FTAG, &subdb, 0));
362                         dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
363                             bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
364                             numsubsub * sizeof (subobj), subdb->db_data, tx);
365                         dmu_buf_rele(subdb, FTAG);
366                         bpo->bpo_phys->bpo_num_subobjs += numsubsub;
367
368                         dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
369                         subbpo.bpo_phys->bpo_subobjs = 0;
370                         VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
371                             subsubobjs, tx));
372                 }
373         }
374         bpo->bpo_phys->bpo_bytes += used;
375         bpo->bpo_phys->bpo_comp += comp;
376         bpo->bpo_phys->bpo_uncomp += uncomp;
377         mutex_exit(&bpo->bpo_lock);
378
379         bpobj_close(&subbpo);
380 }
381
382 void
383 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
384 {
385         blkptr_t stored_bp = *bp;
386         uint64_t offset;
387         int blkoff;
388         blkptr_t *bparray;
389
390         ASSERT(!BP_IS_HOLE(bp));
391
392         /* We never need the fill count. */
393         stored_bp.blk_fill = 0;
394
395         /* The bpobj will compress better if we can leave off the checksum */
396         if (!BP_GET_DEDUP(bp))
397                 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
398
399         mutex_enter(&bpo->bpo_lock);
400
401         offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
402         blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
403
404         if (bpo->bpo_cached_dbuf == NULL ||
405             offset < bpo->bpo_cached_dbuf->db_offset ||
406             offset >= bpo->bpo_cached_dbuf->db_offset +
407             bpo->bpo_cached_dbuf->db_size) {
408                 if (bpo->bpo_cached_dbuf)
409                         dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
410                 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
411                     offset, bpo, &bpo->bpo_cached_dbuf, 0));
412         }
413
414         dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
415         bparray = bpo->bpo_cached_dbuf->db_data;
416         bparray[blkoff] = stored_bp;
417
418         dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
419         bpo->bpo_phys->bpo_num_blkptrs++;
420         bpo->bpo_phys->bpo_bytes +=
421             bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
422         if (bpo->bpo_havecomp) {
423                 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
424                 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
425         }
426         mutex_exit(&bpo->bpo_lock);
427 }
428
429 struct space_range_arg {
430         spa_t *spa;
431         uint64_t mintxg;
432         uint64_t maxtxg;
433         uint64_t used;
434         uint64_t comp;
435         uint64_t uncomp;
436 };
437
438 /* ARGSUSED */
439 static int
440 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
441 {
442         struct space_range_arg *sra = arg;
443
444         if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
445                 if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
446                         sra->used += bp_get_dsize_sync(sra->spa, bp);
447                 else
448                         sra->used += bp_get_dsize(sra->spa, bp);
449                 sra->comp += BP_GET_PSIZE(bp);
450                 sra->uncomp += BP_GET_UCSIZE(bp);
451         }
452         return (0);
453 }
454
455 int
456 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
457 {
458         mutex_enter(&bpo->bpo_lock);
459
460         *usedp = bpo->bpo_phys->bpo_bytes;
461         if (bpo->bpo_havecomp) {
462                 *compp = bpo->bpo_phys->bpo_comp;
463                 *uncompp = bpo->bpo_phys->bpo_uncomp;
464                 mutex_exit(&bpo->bpo_lock);
465                 return (0);
466         } else {
467                 mutex_exit(&bpo->bpo_lock);
468                 return (bpobj_space_range(bpo, 0, UINT64_MAX,
469                     usedp, compp, uncompp));
470         }
471 }
472
473 /*
474  * Return the amount of space in the bpobj which is:
475  * mintxg < blk_birth <= maxtxg
476  */
477 int
478 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
479     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
480 {
481         struct space_range_arg sra = { 0 };
482         int err;
483
484         /*
485          * As an optimization, if they want the whole txg range, just
486          * get bpo_bytes rather than iterating over the bps.
487          */
488         if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
489                 return (bpobj_space(bpo, usedp, compp, uncompp));
490
491         sra.spa = dmu_objset_spa(bpo->bpo_os);
492         sra.mintxg = mintxg;
493         sra.maxtxg = maxtxg;
494
495         err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
496         *usedp = sra.used;
497         *compp = sra.comp;
498         *uncompp = sra.uncomp;
499         return (err);
500 }