]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
Copy stable/9 to releng/9.0 as part of the FreeBSD 9.0-RELEASE release
[FreeBSD/releng/9.0.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / bpobj.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24
25 #include <sys/bpobj.h>
26 #include <sys/zfs_context.h>
27 #include <sys/refcount.h>
28
29 uint64_t
30 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
31 {
32         int size;
33
34         if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
35                 size = BPOBJ_SIZE_V0;
36         else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
37                 size = BPOBJ_SIZE_V1;
38         else
39                 size = sizeof (bpobj_phys_t);
40
41         return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
42             DMU_OT_BPOBJ_HDR, size, tx));
43 }
44
45 void
46 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
47 {
48         int64_t i;
49         bpobj_t bpo;
50         dmu_object_info_t doi;
51         int epb;
52         dmu_buf_t *dbuf = NULL;
53
54         VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
55
56         mutex_enter(&bpo.bpo_lock);
57
58         if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
59                 goto out;
60
61         VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
62         epb = doi.doi_data_block_size / sizeof (uint64_t);
63
64         for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
65                 uint64_t *objarray;
66                 uint64_t offset, blkoff;
67
68                 offset = i * sizeof (uint64_t);
69                 blkoff = P2PHASE(i, epb);
70
71                 if (dbuf == NULL || dbuf->db_offset > offset) {
72                         if (dbuf)
73                                 dmu_buf_rele(dbuf, FTAG);
74                         VERIFY3U(0, ==, dmu_buf_hold(os,
75                             bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
76                 }
77
78                 ASSERT3U(offset, >=, dbuf->db_offset);
79                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
80
81                 objarray = dbuf->db_data;
82                 bpobj_free(os, objarray[blkoff], tx);
83         }
84         if (dbuf) {
85                 dmu_buf_rele(dbuf, FTAG);
86                 dbuf = NULL;
87         }
88         VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
89
90 out:
91         mutex_exit(&bpo.bpo_lock);
92         bpobj_close(&bpo);
93
94         VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
95 }
96
97 int
98 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
99 {
100         dmu_object_info_t doi;
101         int err;
102
103         err = dmu_object_info(os, object, &doi);
104         if (err)
105                 return (err);
106
107         bzero(bpo, sizeof (*bpo));
108         mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
109
110         ASSERT(bpo->bpo_dbuf == NULL);
111         ASSERT(bpo->bpo_phys == NULL);
112         ASSERT(object != 0);
113         ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
114         ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
115
116         err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
117         if (err)
118                 return (err);
119
120         bpo->bpo_os = os;
121         bpo->bpo_object = object;
122         bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
123         bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
124         bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
125         bpo->bpo_phys = bpo->bpo_dbuf->db_data;
126         return (0);
127 }
128
129 void
130 bpobj_close(bpobj_t *bpo)
131 {
132         /* Lame workaround for closing a bpobj that was never opened. */
133         if (bpo->bpo_object == 0)
134                 return;
135
136         dmu_buf_rele(bpo->bpo_dbuf, bpo);
137         if (bpo->bpo_cached_dbuf != NULL)
138                 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
139         bpo->bpo_dbuf = NULL;
140         bpo->bpo_phys = NULL;
141         bpo->bpo_cached_dbuf = NULL;
142         bpo->bpo_object = 0;
143
144         mutex_destroy(&bpo->bpo_lock);
145 }
146
147 static int
148 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
149     boolean_t free)
150 {
151         dmu_object_info_t doi;
152         int epb;
153         int64_t i;
154         int err = 0;
155         dmu_buf_t *dbuf = NULL;
156
157         mutex_enter(&bpo->bpo_lock);
158
159         if (free)
160                 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
161
162         for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
163                 blkptr_t *bparray;
164                 blkptr_t *bp;
165                 uint64_t offset, blkoff;
166
167                 offset = i * sizeof (blkptr_t);
168                 blkoff = P2PHASE(i, bpo->bpo_epb);
169
170                 if (dbuf == NULL || dbuf->db_offset > offset) {
171                         if (dbuf)
172                                 dmu_buf_rele(dbuf, FTAG);
173                         err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
174                             FTAG, &dbuf, 0);
175                         if (err)
176                                 break;
177                 }
178
179                 ASSERT3U(offset, >=, dbuf->db_offset);
180                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
181
182                 bparray = dbuf->db_data;
183                 bp = &bparray[blkoff];
184                 err = func(arg, bp, tx);
185                 if (err)
186                         break;
187                 if (free) {
188                         bpo->bpo_phys->bpo_bytes -=
189                             bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
190                         ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
191                         if (bpo->bpo_havecomp) {
192                                 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
193                                 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
194                         }
195                         bpo->bpo_phys->bpo_num_blkptrs--;
196                         ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
197                 }
198         }
199         if (dbuf) {
200                 dmu_buf_rele(dbuf, FTAG);
201                 dbuf = NULL;
202         }
203         if (free) {
204                 i++;
205                 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
206                     i * sizeof (blkptr_t), -1ULL, tx));
207         }
208         if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
209                 goto out;
210
211         ASSERT(bpo->bpo_havecomp);
212         err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
213         if (err) {
214                 mutex_exit(&bpo->bpo_lock);
215                 return (err);
216         }
217         epb = doi.doi_data_block_size / sizeof (uint64_t);
218
219         for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
220                 uint64_t *objarray;
221                 uint64_t offset, blkoff;
222                 bpobj_t sublist;
223                 uint64_t used_before, comp_before, uncomp_before;
224                 uint64_t used_after, comp_after, uncomp_after;
225
226                 offset = i * sizeof (uint64_t);
227                 blkoff = P2PHASE(i, epb);
228
229                 if (dbuf == NULL || dbuf->db_offset > offset) {
230                         if (dbuf)
231                                 dmu_buf_rele(dbuf, FTAG);
232                         err = dmu_buf_hold(bpo->bpo_os,
233                             bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
234                         if (err)
235                                 break;
236                 }
237
238                 ASSERT3U(offset, >=, dbuf->db_offset);
239                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
240
241                 objarray = dbuf->db_data;
242                 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
243                 if (err)
244                         break;
245                 if (free) {
246                         err = bpobj_space(&sublist,
247                             &used_before, &comp_before, &uncomp_before);
248                         if (err)
249                                 break;
250                 }
251                 err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
252                 if (free) {
253                         VERIFY3U(0, ==, bpobj_space(&sublist,
254                             &used_after, &comp_after, &uncomp_after));
255                         bpo->bpo_phys->bpo_bytes -= used_before - used_after;
256                         ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
257                         bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
258                         bpo->bpo_phys->bpo_uncomp -=
259                             uncomp_before - uncomp_after;
260                 }
261
262                 bpobj_close(&sublist);
263                 if (err)
264                         break;
265                 if (free) {
266                         err = dmu_object_free(bpo->bpo_os,
267                             objarray[blkoff], tx);
268                         if (err)
269                                 break;
270                         bpo->bpo_phys->bpo_num_subobjs--;
271                         ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
272                 }
273         }
274         if (dbuf) {
275                 dmu_buf_rele(dbuf, FTAG);
276                 dbuf = NULL;
277         }
278         if (free) {
279                 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
280                     bpo->bpo_phys->bpo_subobjs,
281                     (i + 1) * sizeof (uint64_t), -1ULL, tx));
282         }
283
284 out:
285         /* If there are no entries, there should be no bytes. */
286         ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
287             (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
288             bpo->bpo_phys->bpo_bytes == 0);
289
290         mutex_exit(&bpo->bpo_lock);
291         return (err);
292 }
293
294 /*
295  * Iterate and remove the entries.  If func returns nonzero, iteration
296  * will stop and that entry will not be removed.
297  */
298 int
299 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
300 {
301         return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
302 }
303
304 /*
305  * Iterate the entries.  If func returns nonzero, iteration will stop.
306  */
307 int
308 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
309 {
310         return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
311 }
312
313 void
314 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
315 {
316         bpobj_t subbpo;
317         uint64_t used, comp, uncomp, subsubobjs;
318
319         ASSERT(bpo->bpo_havesubobj);
320         ASSERT(bpo->bpo_havecomp);
321
322         VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
323         VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
324
325         if (used == 0) {
326                 /* No point in having an empty subobj. */
327                 bpobj_close(&subbpo);
328                 bpobj_free(bpo->bpo_os, subobj, tx);
329                 return;
330         }
331
332         dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
333         if (bpo->bpo_phys->bpo_subobjs == 0) {
334                 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
335                     DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
336         }
337
338         mutex_enter(&bpo->bpo_lock);
339         dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
340             bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
341             sizeof (subobj), &subobj, tx);
342         bpo->bpo_phys->bpo_num_subobjs++;
343
344         /*
345          * If subobj has only one block of subobjs, then move subobj's
346          * subobjs to bpo's subobj list directly.  This reduces
347          * recursion in bpobj_iterate due to nested subobjs.
348          */
349         subsubobjs = subbpo.bpo_phys->bpo_subobjs;
350         if (subsubobjs != 0) {
351                 dmu_object_info_t doi;
352
353                 VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
354                 if (doi.doi_max_offset == doi.doi_data_block_size) {
355                         dmu_buf_t *subdb;
356                         uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
357
358                         VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
359                             0, FTAG, &subdb, 0));
360                         dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
361                             bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
362                             numsubsub * sizeof (subobj), subdb->db_data, tx);
363                         dmu_buf_rele(subdb, FTAG);
364                         bpo->bpo_phys->bpo_num_subobjs += numsubsub;
365
366                         dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
367                         subbpo.bpo_phys->bpo_subobjs = 0;
368                         VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
369                             subsubobjs, tx));
370                 }
371         }
372         bpo->bpo_phys->bpo_bytes += used;
373         bpo->bpo_phys->bpo_comp += comp;
374         bpo->bpo_phys->bpo_uncomp += uncomp;
375         mutex_exit(&bpo->bpo_lock);
376
377         bpobj_close(&subbpo);
378 }
379
380 void
381 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
382 {
383         blkptr_t stored_bp = *bp;
384         uint64_t offset;
385         int blkoff;
386         blkptr_t *bparray;
387
388         ASSERT(!BP_IS_HOLE(bp));
389
390         /* We never need the fill count. */
391         stored_bp.blk_fill = 0;
392
393         /* The bpobj will compress better if we can leave off the checksum */
394         if (!BP_GET_DEDUP(bp))
395                 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
396
397         mutex_enter(&bpo->bpo_lock);
398
399         offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
400         blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
401
402         if (bpo->bpo_cached_dbuf == NULL ||
403             offset < bpo->bpo_cached_dbuf->db_offset ||
404             offset >= bpo->bpo_cached_dbuf->db_offset +
405             bpo->bpo_cached_dbuf->db_size) {
406                 if (bpo->bpo_cached_dbuf)
407                         dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
408                 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
409                     offset, bpo, &bpo->bpo_cached_dbuf, 0));
410         }
411
412         dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
413         bparray = bpo->bpo_cached_dbuf->db_data;
414         bparray[blkoff] = stored_bp;
415
416         dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
417         bpo->bpo_phys->bpo_num_blkptrs++;
418         bpo->bpo_phys->bpo_bytes +=
419             bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
420         if (bpo->bpo_havecomp) {
421                 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
422                 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
423         }
424         mutex_exit(&bpo->bpo_lock);
425 }
426
427 struct space_range_arg {
428         spa_t *spa;
429         uint64_t mintxg;
430         uint64_t maxtxg;
431         uint64_t used;
432         uint64_t comp;
433         uint64_t uncomp;
434 };
435
436 /* ARGSUSED */
437 static int
438 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
439 {
440         struct space_range_arg *sra = arg;
441
442         if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
443                 sra->used += bp_get_dsize_sync(sra->spa, bp);
444                 sra->comp += BP_GET_PSIZE(bp);
445                 sra->uncomp += BP_GET_UCSIZE(bp);
446         }
447         return (0);
448 }
449
450 int
451 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
452 {
453         mutex_enter(&bpo->bpo_lock);
454
455         *usedp = bpo->bpo_phys->bpo_bytes;
456         if (bpo->bpo_havecomp) {
457                 *compp = bpo->bpo_phys->bpo_comp;
458                 *uncompp = bpo->bpo_phys->bpo_uncomp;
459                 mutex_exit(&bpo->bpo_lock);
460                 return (0);
461         } else {
462                 mutex_exit(&bpo->bpo_lock);
463                 return (bpobj_space_range(bpo, 0, UINT64_MAX,
464                     usedp, compp, uncompp));
465         }
466 }
467
468 /*
469  * Return the amount of space in the bpobj which is:
470  * mintxg < blk_birth <= maxtxg
471  */
472 int
473 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
474     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
475 {
476         struct space_range_arg sra = { 0 };
477         int err;
478
479         /*
480          * As an optimization, if they want the whole txg range, just
481          * get bpo_bytes rather than iterating over the bps.
482          */
483         if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
484                 return (bpobj_space(bpo, usedp, compp, uncompp));
485
486         sra.spa = dmu_objset_spa(bpo->bpo_os);
487         sra.mintxg = mintxg;
488         sra.maxtxg = maxtxg;
489
490         err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
491         *usedp = sra.used;
492         *compp = sra.comp;
493         *uncompp = sra.uncomp;
494         return (err);
495 }