/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/zvol.h>

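/*
 * For example, with 16K indirect blocks (dn_indblkshift == 14) and
 * 128-byte block pointers (SPA_BLKPTRSHIFT == 7), the width is
 * wshift == 14 - 7 == 7, i.e. 128 block pointers per indirect block.
 * A block pointer at level 2 then spans 2^BP_SPAN_SHIFT(2, 7) == 2^14
 * level-0 blocks, so its blkoff (see below) is blkid << 14.
 */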
#define BP_SPAN_SHIFT(level, width)     ((level) * (width))

#define BP_EQUAL(b1, b2)                                \
        (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
        (b1)->blk_birth == (b2)->blk_birth)

/*
 * Compare two bookmarks.
 *
 * For ADVANCE_PRE, the visitation order is:
 *
 *      objset 0, 1, 2, ..., ZB_MAXOBJSET.
 *      object 0, 1, 2, ..., ZB_MAXOBJECT.
 *      blkoff 0, 1, 2, ...
 *      level ZB_MAXLEVEL, ..., 2, 1, 0.
 *
 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
 * ordering vector is:
 *
 *      < objset, object, blkoff, -level >
 *
 * For ADVANCE_POST, the starting offsets aren't sequential but ending
 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
 * The visitation order is:
 *
 *      objset 1, 2, ..., ZB_MAXOBJSET, 0.
 *      object 1, 2, ..., ZB_MAXOBJECT, 0.
 *      blkoff 1, 2, ...
 *      level 0, 1, 2, ..., ZB_MAXLEVEL.
 *
 * and thus a valid ordering vector is:
 *
 *      < objset - 1, object - 1, blkoff, level >
 *
 * Both orderings can be expressed as:
 *
 *      < objset + bias, object + bias, blkoff, level ^ bias >
 *
 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
 *
 * Special case: an objset's osphys is represented as level -1 of object 0.
 * It is always either the very first or very last block we visit in an objset.
 * Therefore, if either bookmark's level is -1, level alone determines order.
 */
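/*
 * Worked example (ADVANCE_PRE, bias == 0, wshift == 7): the level-1
 * bookmark <objset 5, object 10, level 1, blkid 3> and its first
 * level-0 child <5, 10, 0, 384> have the same blkoff (3 << 7 == 384 << 0),
 * so the final tie-breaker ((elevel ^ bias) - (slevel ^ bias)) sorts
 * the level-1 parent first, as pre-order visitation requires.
 */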
static int
compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
    int advance)
{
        int bias = (advance & ADVANCE_PRE) ? 0 : -1;
        uint64_t sblkoff, eblkoff;
        int slevel, elevel, wshift;

        if (szb->zb_objset + bias < ezb->zb_objset + bias)
                return (-1);

        if (szb->zb_objset + bias > ezb->zb_objset + bias)
                return (1);

        slevel = szb->zb_level;
        elevel = ezb->zb_level;

        if ((slevel | elevel) < 0)
                return ((slevel ^ bias) - (elevel ^ bias));

        if (szb->zb_object + bias < ezb->zb_object + bias)
                return (-1);

        if (szb->zb_object + bias > ezb->zb_object + bias)
                return (1);

        if (dnp == NULL)
                return (0);

        wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;

        sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
        eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);

        if (sblkoff < eblkoff)
                return (-1);

        if (sblkoff > eblkoff)
                return (1);

        return ((elevel ^ bias) - (slevel ^ bias));
}

#define SET_BOOKMARK(zb, objset, object, level, blkid)  \
{                                                       \
        (zb)->zb_objset = objset;                       \
        (zb)->zb_object = object;                       \
        (zb)->zb_level = level;                         \
        (zb)->zb_blkid = blkid;                         \
}

#define SET_BOOKMARK_LB(zb, level, blkid)               \
{                                                       \
        (zb)->zb_level = level;                         \
        (zb)->zb_blkid = blkid;                         \
}

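/*
 * The advance_* functions below step a segment's seg_start to the next
 * bookmark in visitation order.  Each returns EAGAIN if traversal of
 * this segment should continue from the new bookmark, or ERANGE once
 * the new bookmark would fall past seg_end (or past the end of the
 * objset, object, or block range being stepped).
 */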
static int
advance_objset(zseg_t *zseg, uint64_t objset, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;

        if (advance & ADVANCE_PRE) {
                if (objset >= ZB_MAXOBJSET)
                        return (ERANGE);
                SET_BOOKMARK(zb, objset, 0, -1, 0);
        } else {
                if (objset >= ZB_MAXOBJSET)
                        objset = 0;
                SET_BOOKMARK(zb, objset, 1, 0, 0);
        }

        if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

static int
advance_object(zseg_t *zseg, uint64_t object, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;

        if (advance & ADVANCE_PRE) {
                if (object >= ZB_MAXOBJECT) {
                        SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
                } else {
                        SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
                }
        } else {
                if (zb->zb_object == 0) {
                        SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
                } else {
                        if (object >= ZB_MAXOBJECT)
                                object = 0;
                        SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
                }
        }

        if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

static int
advance_from_osphys(zseg_t *zseg, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;

        ASSERT(zb->zb_object == 0);
        ASSERT(zb->zb_level == -1);
        ASSERT(zb->zb_blkid == 0);

        if (advance & ADVANCE_PRE) {
                SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
        } else {
                if (zb->zb_objset == 0)
                        return (ERANGE);
                SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
        }

        if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

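/*
 * Worked example (ADVANCE_PRE, wshift == 7, maxlevel >= 1, dn_maxblkid
 * large enough): after successfully visiting <level 1, blkid 3>,
 * advance_block() descends to <level 0, blkid 3 << 7 == 384>.  After
 * <level 0, blkid 511> it increments to blkid 512, a multiple of 128,
 * so it pops up one level and continues at <level 1, blkid 4>.
 */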
static int
advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;
        int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
        int maxlevel = dnp->dn_nlevels - 1;
        int level = zb->zb_level;
        uint64_t blkid = zb->zb_blkid;

        if (advance & ADVANCE_PRE) {
                if (level > 0 && rc == 0) {
                        level--;
                        blkid <<= wshift;
                } else {
                        blkid++;

                        if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
                            dnp->dn_maxblkid)
                                return (ERANGE);

                        while (level < maxlevel) {
                                if (P2PHASE(blkid, 1ULL << wshift))
                                        break;
                                blkid >>= wshift;
                                level++;
                        }
                }
        } else {
                if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
                        blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
                        level = 0;
                } else {
                        blkid >>= wshift;
                        level++;
                }

                while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
                    dnp->dn_maxblkid) {
                        if (level == maxlevel)
                                return (ERANGE);
                        blkid >>= wshift;
                        level++;
                }
        }
        SET_BOOKMARK_LB(zb, level, blkid);

        if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

/*
 * The traverse_callback function will call the function specified in th_func.
 * In the event of an error the callee, specified by th_func, must return
 * one of the following errors:
 *
 *      EINTR           - Indicates that the callee wants the traversal to
 *                        abort immediately.
 *      ERESTART        - The callee has acknowledged the error and would
 *                        like to continue.
 */
static int
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
{
        /*
         * Before we issue the callback, prune against maxtxg.
         *
         * We prune against mintxg before we get here because it's a big win.
         * If a given block was born in txg 37, then we know that the entire
         * subtree below that block must have been born in txg 37 or earlier.
         * We can therefore lop off huge branches of the tree as we go.
         *
         * There's no corresponding optimization for maxtxg because knowing
         * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
         * children.  In fact, the copy-on-write design of ZFS ensures that
         * top-level blocks will pretty much always be new.
         *
         * Therefore, in the name of simplicity we don't prune against
         * maxtxg until the last possible moment -- that being right now.
         */
        if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
                return (0);

        /*
         * Debugging: verify that the order we visit things agrees with the
         * order defined by compare_bookmark().  We don't check this for
         * log blocks because there's no defined ordering for them; they're
         * always visited (or not) as part of visiting the objset_phys_t.
         */
        if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
                zbookmark_t *zb = &bc->bc_bookmark;
                zbookmark_t *szb = &zseg->seg_start;
                zbookmark_t *ezb = &zseg->seg_end;
                zbookmark_t *lzb = &th->th_lastcb;
                dnode_phys_t *dnp = bc->bc_dnode;

                ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
                ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
                ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
                    lzb->zb_level == ZB_NO_LEVEL);
                *lzb = *zb;
        }

        th->th_callbacks++;
        return (th->th_func(bc, th->th_spa, th->th_arg));
}

static int
traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
        dnode_phys_t *dnp)
{
        zbookmark_t *zb = &bc->bc_bookmark;
        int error;

        th->th_hits++;

        bc->bc_dnode = dnp;
        bc->bc_errno = 0;

        if (BP_EQUAL(&bc->bc_blkptr, bp))
                return (0);

        bc->bc_blkptr = *bp;

        if (bc->bc_data == NULL)
                return (0);

        if (BP_IS_HOLE(bp)) {
                ASSERT(th->th_advance & ADVANCE_HOLES);
                return (0);
        }

        if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
                error = EIO;
        } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
                error = 0;
                th->th_arc_hits++;
        } else {
                error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
                    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
                    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));

                if (BP_SHOULD_BYTESWAP(bp) && error == 0)
                        (zb->zb_level > 0 ? byteswap_uint64_array :
                            dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
                            BP_GET_LSIZE(bp));
                th->th_reads++;
        }

        if (error) {
                bc->bc_errno = error;
                error = traverse_callback(th, NULL, bc);
                ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
                bc->bc_blkptr.blk_birth = -1ULL;
        }

        dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
            bc - &th->th_cache[0][0], error,
            zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

        return (error);
}

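/*
 * Walk from the dnode's top level of indirection down to zb_level,
 * caching one block per level in th_cache[depth][].  Returns 0 once the
 * target block (or a hole, if ADVANCE_HOLES) is reached, ENOTBLK after
 * skipping block pointers whose subtrees were born at or before
 * seg_mintxg, ERANGE if the bookmark lies beyond this dnode, or
 * whatever error traverse_read() reported.
 */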
static int
find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
{
        zbookmark_t *zb = &zseg->seg_start;
        traverse_blk_cache_t *bc;
        blkptr_t *bp = dnp->dn_blkptr;
        int i, first, level;
        int nbp = dnp->dn_nblkptr;
        int minlevel = zb->zb_level;
        int maxlevel = dnp->dn_nlevels - 1;
        int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
        int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
        uint64_t blkid = zb->zb_blkid >> bp_shift;
        int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
        int rc;

        if (minlevel > maxlevel || blkid >= nbp)
                return (ERANGE);

        for (level = maxlevel; level >= minlevel; level--) {
                first = P2PHASE(blkid, 1ULL << wshift);

                for (i = first; i < nbp; i++)
                        if (bp[i].blk_birth > zseg->seg_mintxg ||
                            BP_IS_HOLE(&bp[i]) && do_holes)
                                break;

                if (i != first) {
                        i--;
                        SET_BOOKMARK_LB(zb, level, blkid + (i - first));
                        return (ENOTBLK);
                }

                bc = &th->th_cache[depth][level];

                SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
                    level, blkid);

                if (rc = traverse_read(th, bc, bp + i, dnp)) {
                        if (rc != EAGAIN) {
                                SET_BOOKMARK_LB(zb, level, blkid);
                        }
                        return (rc);
                }

                if (BP_IS_HOLE(&bp[i])) {
                        SET_BOOKMARK_LB(zb, level, blkid);
                        th->th_lastcb.zb_level = ZB_NO_LEVEL;
                        return (0);
                }

                nbp = 1 << wshift;
                bp = bc->bc_data;
                bp_shift -= wshift;
                blkid = zb->zb_blkid >> bp_shift;
        }

        return (0);
}

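/*
 * Scan objset 'objset' via its meta-dnode 'mdn' for the next object at
 * or beyond *objectp whose dnode is allocated, matches 'type' (any type
 * if type == -1), and lives in a dnode block born after 'txg'.  On
 * success, returns 0 with *objectp and *dnpp set; returns ERANGE (and
 * sets *objectp to ZB_MAXOBJECT) when no such object remains.  EAGAIN
 * and EINTR from find_block() are passed through.
 */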
static int
get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
{
        zseg_t zseg;
        zbookmark_t *zb = &zseg.seg_start;
        uint64_t object = *objectp;
        int i, rc;

        SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
        SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);

        zseg.seg_mintxg = txg;
        zseg.seg_maxtxg = -1ULL;

        for (;;) {
                rc = find_block(th, &zseg, mdn, depth);

                if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
                        break;

                if (rc == 0 && zb->zb_level == 0) {
                        dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
                        for (i = 0; i < DNODES_PER_BLOCK; i++) {
                                object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
                                if (object >= *objectp &&
                                    dnp[i].dn_type != DMU_OT_NONE &&
                                    (type == -1 || dnp[i].dn_type == type)) {
                                        *objectp = object;
                                        *dnpp = &dnp[i];
                                        return (0);
                                }
                        }
                }

                rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);

                if (rc == ERANGE)
                        break;
        }

        if (rc == ERANGE)
                *objectp = ZB_MAXOBJECT;

        return (rc);
}

/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
        traverse_handle_t *th = arg;
        traverse_blk_cache_t *bc = &th->th_zil_cache;
        zbookmark_t *zb = &bc->bc_bookmark;
        zseg_t *zseg = list_head(&th->th_seglist);

        if (bp->blk_birth <= zseg->seg_mintxg)
                return;

        if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
                zb->zb_object = 0;
                zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
                bc->bc_blkptr = *bp;
                (void) traverse_callback(th, zseg, bc);
        }
}

/* ARGSUSED */
static void
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
        traverse_handle_t *th = arg;
        traverse_blk_cache_t *bc = &th->th_zil_cache;
        zbookmark_t *zb = &bc->bc_bookmark;
        zseg_t *zseg = list_head(&th->th_seglist);

        if (lrc->lrc_txtype == TX_WRITE) {
                lr_write_t *lr = (lr_write_t *)lrc;
                blkptr_t *bp = &lr->lr_blkptr;

                if (bp->blk_birth <= zseg->seg_mintxg)
                        return;

                if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
                        zb->zb_object = lr->lr_foid;
                        zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
                        bc->bc_blkptr = *bp;
                        (void) traverse_callback(th, zseg, bc);
                }
        }
}

static void
traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
{
        spa_t *spa = th->th_spa;
        dsl_pool_t *dp = spa_get_dsl(spa);
        objset_phys_t *osphys = bc->bc_data;
        zil_header_t *zh = &osphys->os_zil_header;
        uint64_t claim_txg = zh->zh_claim_txg;
        zilog_t *zilog;

        ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
        ASSERT(bc->bc_bookmark.zb_level == -1);

        /*
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
         */
        if (claim_txg == 0 && (spa_mode & FWRITE))
                return;

        th->th_zil_cache.bc_bookmark = bc->bc_bookmark;

        zilog = zil_alloc(dp->dp_meta_objset, zh);

        (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
            claim_txg);

        zil_free(zilog);
}

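/*
 * Advance one segment as far as a single pass allows: read the MOS from
 * 'mosbp', locate the segment's current dataset and dnode, and visit
 * blocks in order via traverse_callback().  Returns EINTR if the
 * callback aborted, ERANGE when the segment is exhausted, or EAGAIN to
 * yield (the work limit was reached, spa_sync() wants the traverse
 * lock, or the segment simply has more work left).
 */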
static int
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
{
        zbookmark_t *zb = &zseg->seg_start;
        traverse_blk_cache_t *bc;
        dnode_phys_t *dn, *dn_tmp;
        int worklimit = 100;
        int rc;

        dprintf("<%llu, %llu, %d, %llx>\n",
            zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

        bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
        dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

        SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);

        rc = traverse_read(th, bc, mosbp, dn);

        if (rc)         /* If we get ERESTART, we've got nowhere left to go */
                return (rc == ERESTART ? EINTR : rc);

        ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);

        if (zb->zb_objset != 0) {
                uint64_t objset = zb->zb_objset;
                dsl_dataset_phys_t *dsp;

                rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
                    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);

                if (objset != zb->zb_objset)
                        rc = advance_objset(zseg, objset, th->th_advance);

                if (rc != 0)
                        return (rc);

                dsp = DN_BONUS(dn_tmp);

                bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
                dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

                SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);

                /*
                 * If we're traversing an open snapshot, we know that it
                 * can't be deleted (because it's open) and it can't change
                 * (because it's a snapshot).  Therefore, once we've gotten
                 * from the uberblock down to the snapshot's objset_phys_t,
                 * we no longer need to synchronize with spa_sync(); we're
                 * traversing a completely static block tree from here on.
                 */
                if (th->th_advance & ADVANCE_NOLOCK) {
                        ASSERT(th->th_locked);
                        rw_exit(spa_traverse_rwlock(th->th_spa));
                        th->th_locked = 0;
                }

                if (BP_IS_HOLE(&dsp->ds_bp))
                        rc = ERESTART;
                else
                        rc = traverse_read(th, bc, &dsp->ds_bp, dn);

                if (rc != 0) {
                        if (rc == ERESTART)
                                rc = advance_objset(zseg, zb->zb_objset + 1,
                                    th->th_advance);
                        return (rc);
                }

                if (th->th_advance & ADVANCE_PRUNE)
                        zseg->seg_mintxg =
                            MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
        }

        if (zb->zb_level == -1) {
                ASSERT(zb->zb_object == 0);
                ASSERT(zb->zb_blkid == 0);
                ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);

                if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
                        rc = traverse_callback(th, zseg, bc);
                        if (rc) {
                                ASSERT(rc == EINTR);
                                return (rc);
                        }
                        if ((th->th_advance & ADVANCE_ZIL) &&
                            zb->zb_objset != 0)
                                traverse_zil(th, bc);
                }

                return (advance_from_osphys(zseg, th->th_advance));
        }

        if (zb->zb_object != 0) {
                uint64_t object = zb->zb_object;

                rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
                    zseg->seg_mintxg, -1, ZB_MDN_CACHE);

                if (object != zb->zb_object)
                        rc = advance_object(zseg, object, th->th_advance);

                if (rc != 0)
                        return (rc);

                dn = dn_tmp;
        }

        if (zb->zb_level == ZB_MAXLEVEL)
                zb->zb_level = dn->dn_nlevels - 1;

        for (;;) {
                rc = find_block(th, zseg, dn, ZB_DN_CACHE);

                if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
                        break;

                if (rc == 0) {
                        bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
                        ASSERT(bc->bc_dnode == dn);
                        ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
                        rc = traverse_callback(th, zseg, bc);
                        if (rc) {
                                ASSERT(rc == EINTR);
                                return (rc);
                        }
                        if (BP_IS_HOLE(&bc->bc_blkptr)) {
                                ASSERT(th->th_advance & ADVANCE_HOLES);
                                rc = ENOTBLK;
                        }
                }

                rc = advance_block(zseg, dn, rc, th->th_advance);

                if (rc == ERANGE)
                        break;

                /*
                 * Give spa_sync() a chance to run.
                 */
                if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
                        th->th_syncs++;
                        return (EAGAIN);
                }

                if (--worklimit == 0)
                        return (EAGAIN);
        }

        if (rc == ERANGE)
                rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);

        return (rc);
}

/*
 * It is the caller's responsibility to ensure that the dsl_dataset_t
 * doesn't go away during traversal.
 */
int
traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
    blkptr_cb_t func, void *arg)
{
        spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
        traverse_handle_t *th;
        int err;

        th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);

        traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);

        while ((err = traverse_more(th)) == EAGAIN)
                continue;

        traverse_fini(th);
        return (err);
}

int
traverse_zvol(objset_t *os, int advance,  blkptr_cb_t func, void *arg)
{
        spa_t *spa = dmu_objset_spa(os);
        traverse_handle_t *th;
        int err;

        th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);

        traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);

        while ((err = traverse_more(th)) == EAGAIN)
                continue;

        traverse_fini(th);
        return (err);
}

int
traverse_more(traverse_handle_t *th)
{
        zseg_t *zseg = list_head(&th->th_seglist);
        uint64_t save_txg;      /* XXX won't be necessary with real itinerary */
        krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
        blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
        int rc;

        if (zseg == NULL)
                return (0);

        th->th_restarts++;

        save_txg = zseg->seg_mintxg;

        rw_enter(rw, RW_READER);
        th->th_locked = 1;

        rc = traverse_segment(th, zseg, mosbp);
        ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);

        if (th->th_locked)
                rw_exit(rw);
        th->th_locked = 0;

        zseg->seg_mintxg = save_txg;

        if (rc == ERANGE) {
                list_remove(&th->th_seglist, zseg);
                kmem_free(zseg, sizeof (*zseg));
                return (EAGAIN);
        }

        return (rc);
}

/*
 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
 * are not included.  The blocks covered by this segment will all have
 * mintxg < birth < maxtxg.
 */
static void
traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
{
        zseg_t *zseg;

        zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);

        zseg->seg_mintxg = mintxg;
        zseg->seg_maxtxg = maxtxg;

        zseg->seg_start.zb_objset = sobjset;
        zseg->seg_start.zb_object = sobject;
        zseg->seg_start.zb_level = slevel;
        zseg->seg_start.zb_blkid = sblkid;

        zseg->seg_end.zb_objset = eobjset;
        zseg->seg_end.zb_object = eobject;
        zseg->seg_end.zb_level = elevel;
        zseg->seg_end.zb_blkid = eblkid;

        list_insert_tail(&th->th_seglist, zseg);
}

void
traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t objset, uint64_t object)
{
        if (th->th_advance & ADVANCE_PRE)
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, object, ZB_MAXLEVEL, 0,
                    objset, object, 0, ZB_MAXBLKID);
        else
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, object, 0, 0,
                    objset, object, 0, ZB_MAXBLKID);
}

void
traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t objset)
{
        if (th->th_advance & ADVANCE_PRE)
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, 0, -1, 0,
                    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
        else
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, 1, 0, 0,
                    objset, 0, -1, 0);
}

void
traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
{
        if (th->th_advance & ADVANCE_PRE)
                traverse_add_segment(th, mintxg, maxtxg,
                    0, 0, -1, 0,
                    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
        else
                traverse_add_segment(th, mintxg, maxtxg,
                    1, 1, 0, 0,
                    0, 0, -1, 0);
}

traverse_handle_t *
traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
    int zio_flags)
{
        traverse_handle_t *th;
        int d, l;

        th = kmem_zalloc(sizeof (*th), KM_SLEEP);

        th->th_spa = spa;
        th->th_func = func;
        th->th_arg = arg;
        th->th_advance = advance;
        th->th_lastcb.zb_level = ZB_NO_LEVEL;
        th->th_noread.zb_level = ZB_NO_LEVEL;
        th->th_zio_flags = zio_flags;

        list_create(&th->th_seglist, sizeof (zseg_t),
            offsetof(zseg_t, seg_node));

        for (d = 0; d < ZB_DEPTH; d++) {
                for (l = 0; l < ZB_MAXLEVEL; l++) {
                        if ((advance & ADVANCE_DATA) ||
                            l != 0 || d != ZB_DN_CACHE)
                                th->th_cache[d][l].bc_data =
                                    zio_buf_alloc(SPA_MAXBLOCKSIZE);
                }
        }

        return (th);
}

void
traverse_fini(traverse_handle_t *th)
{
        int d, l;
        zseg_t *zseg;

        for (d = 0; d < ZB_DEPTH; d++)
                for (l = 0; l < ZB_MAXLEVEL; l++)
                        if (th->th_cache[d][l].bc_data != NULL)
                                zio_buf_free(th->th_cache[d][l].bc_data,
                                    SPA_MAXBLOCKSIZE);

        while ((zseg = list_head(&th->th_seglist)) != NULL) {
                list_remove(&th->th_seglist, zseg);
                kmem_free(zseg, sizeof (*zseg));
        }

        list_destroy(&th->th_seglist);

        dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
            th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
            th->th_syncs, th->th_restarts);

        kmem_free(th, sizeof (*th));
}