/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>

#define BP_SPAN_SHIFT(level, width)     ((level) * (width))

#define BP_EQUAL(b1, b2)                                \
        (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
        (b1)->blk_birth == (b2)->blk_birth)

/*
 * Compare two bookmarks.
 *
 * For ADVANCE_PRE, the visitation order is:
 *
 *      objset 0, 1, 2, ..., ZB_MAXOBJSET.
 *      object 0, 1, 2, ..., ZB_MAXOBJECT.
 *      blkoff 0, 1, 2, ...
 *      level ZB_MAXLEVEL, ..., 2, 1, 0.
 *
 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
 * ordering vector is:
 *
 *      < objset, object, blkoff, -level >
 *
 * For ADVANCE_POST, the starting offsets aren't sequential but ending
 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
 * The visitation order is:
 *
 *      objset 1, 2, ..., ZB_MAXOBJSET, 0.
 *      object 1, 2, ..., ZB_MAXOBJECT, 0.
 *      blkoff 1, 2, ...
 *      level 0, 1, 2, ..., ZB_MAXLEVEL.
 *
 * and thus a valid ordering vector is:
 *
 *      < objset - 1, object - 1, blkoff, level >
 *
 * Both orderings can be expressed as:
 *
 *      < objset + bias, object + bias, blkoff, level ^ bias >
 *
 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
 *
 * Special case: an objset's osphys is represented as level -1 of object 0.
 * It is always either the very first or very last block we visit in an objset.
 * Therefore, if either bookmark's level is -1, level alone determines order.
 */
static int
compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
    int advance)
{
        int bias = (advance & ADVANCE_PRE) ? 0 : -1;
        uint64_t sblkoff, eblkoff;
        int slevel, elevel, wshift;

        if (szb->zb_objset + bias < ezb->zb_objset + bias)
                return (-1);

        if (szb->zb_objset + bias > ezb->zb_objset + bias)
                return (1);

        slevel = szb->zb_level;
        elevel = ezb->zb_level;

        if ((slevel | elevel) < 0)
                return ((slevel ^ bias) - (elevel ^ bias));

        if (szb->zb_object + bias < ezb->zb_object + bias)
                return (-1);

        if (szb->zb_object + bias > ezb->zb_object + bias)
                return (1);

        if (dnp == NULL)
                return (0);

        wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;

        sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
        eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);

        if (sblkoff < eblkoff)
                return (-1);

        if (sblkoff > eblkoff)
                return (1);

        return ((elevel ^ bias) - (slevel ^ bias));
}

#define SET_BOOKMARK(zb, objset, object, level, blkid)  \
{                                                       \
        (zb)->zb_objset = objset;                       \
        (zb)->zb_object = object;                       \
        (zb)->zb_level = level;                         \
        (zb)->zb_blkid = blkid;                         \
}

#define SET_BOOKMARK_LB(zb, level, blkid)               \
{                                                       \
        (zb)->zb_level = level;                         \
        (zb)->zb_blkid = blkid;                         \
}
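
/*
 * Illustrative sketch (not part of the original file): a worked example of
 * the ordering described above compare_bookmark().  It assumes a dnode with
 * 16K indirect blocks (dn_indblkshift = 14), so wshift = 14 - SPA_BLKPTRSHIFT
 * = 7, and a level-1 block at blkid 2 spans level-0 blkids [256, 384).  In
 * pre-order the indirect block sorts before every block beneath it; in
 * post-order (ADVANCE_PRE not set) it sorts after them.
 */
#if 0   /* example only, not compiled */
static void
compare_bookmark_example(void)
{
        dnode_phys_t dn = { 0 };
        zbookmark_t l1blk, l0blk;

        dn.dn_indblkshift = 14;                 /* wshift = 14 - 7 = 7 */

        SET_BOOKMARK(&l1blk, 1, 5, 1, 2);       /* level 1, blkid 2 */
        SET_BOOKMARK(&l0blk, 1, 5, 0, 256);     /* first child at level 0 */

        ASSERT(compare_bookmark(&l1blk, &l0blk, &dn, ADVANCE_PRE) < 0);
        ASSERT(compare_bookmark(&l1blk, &l0blk, &dn, 0) > 0);
}
#endif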

static int
advance_objset(zseg_t *zseg, uint64_t objset, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;

        if (advance & ADVANCE_PRE) {
                if (objset >= ZB_MAXOBJSET)
                        return (ERANGE);
                SET_BOOKMARK(zb, objset, 0, -1, 0);
        } else {
                if (objset >= ZB_MAXOBJSET)
                        objset = 0;
                SET_BOOKMARK(zb, objset, 1, 0, 0);
        }

        if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

static int
advance_object(zseg_t *zseg, uint64_t object, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;

        if (advance & ADVANCE_PRE) {
                if (object >= ZB_MAXOBJECT) {
                        SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
                } else {
                        SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
                }
        } else {
                if (zb->zb_object == 0) {
                        SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
                } else {
                        if (object >= ZB_MAXOBJECT)
                                object = 0;
                        SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
                }
        }

        if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

static int
advance_from_osphys(zseg_t *zseg, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;

        ASSERT(zb->zb_object == 0);
        ASSERT(zb->zb_level == -1);
        ASSERT(zb->zb_blkid == 0);

        if (advance & ADVANCE_PRE) {
                SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
        } else {
                if (zb->zb_objset == 0)
                        return (ERANGE);
                SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
        }

        if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

static int
advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
{
        zbookmark_t *zb = &zseg->seg_start;
        int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
        int maxlevel = dnp->dn_nlevels - 1;
        int level = zb->zb_level;
        uint64_t blkid = zb->zb_blkid;

        if (advance & ADVANCE_PRE) {
                if (level > 0 && rc == 0) {
                        level--;
                        blkid <<= wshift;
                } else {
                        blkid++;

                        if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
                            dnp->dn_maxblkid)
                                return (ERANGE);

                        while (level < maxlevel) {
                                if (P2PHASE(blkid, 1ULL << wshift))
                                        break;
                                blkid >>= wshift;
                                level++;
                        }
                }
        } else {
                if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
                        blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
                        level = 0;
                } else {
                        blkid >>= wshift;
                        level++;
                }

                while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
                    dnp->dn_maxblkid) {
                        if (level == maxlevel)
                                return (ERANGE);
                        blkid >>= wshift;
                        level++;
                }
        }
        SET_BOOKMARK_LB(zb, level, blkid);

        if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
                return (ERANGE);

        return (EAGAIN);
}

static int
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
{
        /*
         * Before we issue the callback, prune against maxtxg.
         *
         * We prune against mintxg before we get here because it's a big win.
         * If a given block was born in txg 37, then we know that the entire
         * subtree below that block must have been born in txg 37 or earlier.
         * We can therefore lop off huge branches of the tree as we go.
         *
         * There's no corresponding optimization for maxtxg because knowing
         * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
         * children.  In fact, the copy-on-write design of ZFS ensures that
         * top-level blocks will pretty much always be new.
         *
         * Therefore, in the name of simplicity we don't prune against
         * maxtxg until the last possible moment -- that being right now.
         */
        if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
                return (0);

        /*
         * Debugging: verify that the order we visit things agrees with the
         * order defined by compare_bookmark().  We don't check this for
         * log blocks because there's no defined ordering for them; they're
         * always visited (or not) as part of visiting the objset_phys_t.
         */
        if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
                zbookmark_t *zb = &bc->bc_bookmark;
                zbookmark_t *szb = &zseg->seg_start;
                zbookmark_t *ezb = &zseg->seg_end;
                zbookmark_t *lzb = &th->th_lastcb;
                dnode_phys_t *dnp = bc->bc_dnode;

                ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
                ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
                ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
                    lzb->zb_level == ZB_NO_LEVEL);
                *lzb = *zb;
        }

        th->th_callbacks++;
        return (th->th_func(bc, th->th_spa, th->th_arg));
}

static int
traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
        dnode_phys_t *dnp)
{
        zbookmark_t *zb = &bc->bc_bookmark;
        int error;

        th->th_hits++;

        bc->bc_dnode = dnp;
        bc->bc_errno = 0;

        if (BP_EQUAL(&bc->bc_blkptr, bp))
                return (0);

        bc->bc_blkptr = *bp;

        if (bc->bc_data == NULL)
                return (0);

        if (BP_IS_HOLE(bp)) {
                ASSERT(th->th_advance & ADVANCE_HOLES);
                return (0);
        }

        if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
                error = EIO;
        } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
                error = 0;
                th->th_arc_hits++;
        } else {
                error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
                    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
                    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));

                if (BP_SHOULD_BYTESWAP(bp) && error == 0)
                        (zb->zb_level > 0 ? byteswap_uint64_array :
                            dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
                            BP_GET_LSIZE(bp));
                th->th_reads++;
        }

        if (error) {
                bc->bc_errno = error;
                error = traverse_callback(th, NULL, bc);
                ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
                bc->bc_blkptr.blk_birth = -1ULL;
        }

        dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
            bc - &th->th_cache[0][0], error,
            zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

        return (error);
}

static int
find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
{
        zbookmark_t *zb = &zseg->seg_start;
        traverse_blk_cache_t *bc;
        blkptr_t *bp = dnp->dn_blkptr;
        int i, first, level;
        int nbp = dnp->dn_nblkptr;
        int minlevel = zb->zb_level;
        int maxlevel = dnp->dn_nlevels - 1;
        int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
        int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
        uint64_t blkid = zb->zb_blkid >> bp_shift;
        int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
        int rc;

        if (minlevel > maxlevel || blkid >= nbp)
                return (ERANGE);

        for (level = maxlevel; level >= minlevel; level--) {
                first = P2PHASE(blkid, 1ULL << wshift);

                for (i = first; i < nbp; i++)
                        if (bp[i].blk_birth > zseg->seg_mintxg ||
                            BP_IS_HOLE(&bp[i]) && do_holes)
                                break;

                if (i != first) {
                        i--;
                        SET_BOOKMARK_LB(zb, level, blkid + (i - first));
                        return (ENOTBLK);
                }

                bc = &th->th_cache[depth][level];

                SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
                    level, blkid);

                if (rc = traverse_read(th, bc, bp + i, dnp)) {
                        if (rc != EAGAIN) {
                                SET_BOOKMARK_LB(zb, level, blkid);
                        }
                        return (rc);
                }

                if (BP_IS_HOLE(&bp[i])) {
                        SET_BOOKMARK_LB(zb, level, blkid);
                        th->th_lastcb.zb_level = ZB_NO_LEVEL;
                        return (0);
                }

                nbp = 1 << wshift;
                bp = bc->bc_data;
                bp_shift -= wshift;
                blkid = zb->zb_blkid >> bp_shift;
        }

        return (0);
}

static int
get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
{
        zseg_t zseg;
        zbookmark_t *zb = &zseg.seg_start;
        uint64_t object = *objectp;
        int i, rc;

        SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
        SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);

        zseg.seg_mintxg = txg;
        zseg.seg_maxtxg = -1ULL;

        for (;;) {
                rc = find_block(th, &zseg, mdn, depth);

                if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
                        break;

                if (rc == 0 && zb->zb_level == 0) {
                        dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
                        for (i = 0; i < DNODES_PER_BLOCK; i++) {
                                object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
                                if (object >= *objectp &&
                                    dnp[i].dn_type != DMU_OT_NONE &&
                                    (type == -1 || dnp[i].dn_type == type)) {
                                        *objectp = object;
                                        *dnpp = &dnp[i];
                                        return (0);
                                }
                        }
                }

                rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);

                if (rc == ERANGE)
                        break;
        }

        if (rc == ERANGE)
                *objectp = ZB_MAXOBJECT;

        return (rc);
}

/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
        traverse_handle_t *th = arg;
        traverse_blk_cache_t *bc = &th->th_zil_cache;
        zbookmark_t *zb = &bc->bc_bookmark;
        zseg_t *zseg = list_head(&th->th_seglist);

        if (bp->blk_birth <= zseg->seg_mintxg)
                return;

        if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
                zb->zb_object = 0;
                zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
                bc->bc_blkptr = *bp;
                (void) traverse_callback(th, zseg, bc);
        }
}

/* ARGSUSED */
static void
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
        traverse_handle_t *th = arg;
        traverse_blk_cache_t *bc = &th->th_zil_cache;
        zbookmark_t *zb = &bc->bc_bookmark;
        zseg_t *zseg = list_head(&th->th_seglist);

        if (lrc->lrc_txtype == TX_WRITE) {
                lr_write_t *lr = (lr_write_t *)lrc;
                blkptr_t *bp = &lr->lr_blkptr;

                if (bp->blk_birth <= zseg->seg_mintxg)
                        return;

                if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
                        zb->zb_object = lr->lr_foid;
                        zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
                        bc->bc_blkptr = *bp;
                        (void) traverse_callback(th, zseg, bc);
                }
        }
}

static void
traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
{
        spa_t *spa = th->th_spa;
        dsl_pool_t *dp = spa_get_dsl(spa);
        objset_phys_t *osphys = bc->bc_data;
        zil_header_t *zh = &osphys->os_zil_header;
        uint64_t claim_txg = zh->zh_claim_txg;
        zilog_t *zilog;

        ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
        ASSERT(bc->bc_bookmark.zb_level == -1);

        /*
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
         */
        if (claim_txg == 0 && (spa_mode & FWRITE))
                return;

        th->th_zil_cache.bc_bookmark = bc->bc_bookmark;

        zilog = zil_alloc(dp->dp_meta_objset, zh);

        (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
            claim_txg);

        zil_free(zilog);
}

static int
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
{
        zbookmark_t *zb = &zseg->seg_start;
        traverse_blk_cache_t *bc;
        dnode_phys_t *dn, *dn_tmp;
        int worklimit = 100;
        int rc;

        dprintf("<%llu, %llu, %d, %llx>\n",
            zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

        bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
        dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

        SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);

        rc = traverse_read(th, bc, mosbp, dn);

        if (rc)         /* If we get ERESTART, we've got nowhere left to go */
                return (rc == ERESTART ? EINTR : rc);

        ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);

        if (zb->zb_objset != 0) {
                uint64_t objset = zb->zb_objset;
                dsl_dataset_phys_t *dsp;

                rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
                    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);

                if (objset != zb->zb_objset)
                        rc = advance_objset(zseg, objset, th->th_advance);

                if (rc != 0)
                        return (rc);

                dsp = DN_BONUS(dn_tmp);

                bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
                dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

                SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);

                /*
                 * If we're traversing an open snapshot, we know that it
                 * can't be deleted (because it's open) and it can't change
                 * (because it's a snapshot).  Therefore, once we've gotten
                 * from the uberblock down to the snapshot's objset_phys_t,
                 * we no longer need to synchronize with spa_sync(); we're
                 * traversing a completely static block tree from here on.
                 */
                if (th->th_advance & ADVANCE_NOLOCK) {
                        ASSERT(th->th_locked);
                        rw_exit(spa_traverse_rwlock(th->th_spa));
                        th->th_locked = 0;
                }

                rc = traverse_read(th, bc, &dsp->ds_bp, dn);

                if (rc != 0) {
                        if (rc == ERESTART)
                                rc = advance_objset(zseg, zb->zb_objset + 1,
                                    th->th_advance);
                        return (rc);
                }

                if (th->th_advance & ADVANCE_PRUNE)
                        zseg->seg_mintxg =
                            MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
        }

        if (zb->zb_level == -1) {
                ASSERT(zb->zb_object == 0);
                ASSERT(zb->zb_blkid == 0);
                ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);

                if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
                        rc = traverse_callback(th, zseg, bc);
                        if (rc) {
                                ASSERT(rc == EINTR);
                                return (rc);
                        }
                        if ((th->th_advance & ADVANCE_ZIL) &&
                            zb->zb_objset != 0)
                                traverse_zil(th, bc);
                }

                return (advance_from_osphys(zseg, th->th_advance));
        }

        if (zb->zb_object != 0) {
                uint64_t object = zb->zb_object;

                rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
                    zseg->seg_mintxg, -1, ZB_MDN_CACHE);

                if (object != zb->zb_object)
                        rc = advance_object(zseg, object, th->th_advance);

                if (rc != 0)
                        return (rc);

                dn = dn_tmp;
        }

        if (zb->zb_level == ZB_MAXLEVEL)
                zb->zb_level = dn->dn_nlevels - 1;

        for (;;) {
                rc = find_block(th, zseg, dn, ZB_DN_CACHE);

                if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
                        break;

                if (rc == 0) {
                        bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
                        ASSERT(bc->bc_dnode == dn);
                        ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
                        rc = traverse_callback(th, zseg, bc);
                        if (rc) {
                                ASSERT(rc == EINTR);
                                return (rc);
                        }
                        if (BP_IS_HOLE(&bc->bc_blkptr)) {
                                ASSERT(th->th_advance & ADVANCE_HOLES);
                                rc = ENOTBLK;
                        }
                }

                rc = advance_block(zseg, dn, rc, th->th_advance);

                if (rc == ERANGE)
                        break;

                /*
                 * Give spa_sync() a chance to run.
                 */
                if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
                        th->th_syncs++;
                        return (EAGAIN);
                }

                if (--worklimit == 0)
                        return (EAGAIN);
        }

        if (rc == ERANGE)
                rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);

        return (rc);
}

/*
 * It is the caller's responsibility to ensure that the dsl_dataset_t
 * doesn't go away during traversal.
 */
int
traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
    blkptr_cb_t func, void *arg)
{
        spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
        traverse_handle_t *th;
        int err;

        th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);

        traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);

        while ((err = traverse_more(th)) == EAGAIN)
                continue;

        traverse_fini(th);
        return (err);
}

int
traverse_more(traverse_handle_t *th)
{
        zseg_t *zseg = list_head(&th->th_seglist);
        uint64_t save_txg;      /* XXX won't be necessary with real itinerary */
        krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
        blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
        int rc;

        if (zseg == NULL)
                return (0);

        th->th_restarts++;

        save_txg = zseg->seg_mintxg;

        rw_enter(rw, RW_READER);
        th->th_locked = 1;

        rc = traverse_segment(th, zseg, mosbp);
        ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);

        if (th->th_locked)
                rw_exit(rw);
        th->th_locked = 0;

        zseg->seg_mintxg = save_txg;

        if (rc == ERANGE) {
                list_remove(&th->th_seglist, zseg);
                kmem_free(zseg, sizeof (*zseg));
                return (EAGAIN);
        }

        return (rc);
}

/*
 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
 * are not included.  The blocks covered by this segment will all have
 * mintxg < birth < maxtxg.
 */
static void
traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
{
        zseg_t *zseg;

        zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);

        zseg->seg_mintxg = mintxg;
        zseg->seg_maxtxg = maxtxg;

        zseg->seg_start.zb_objset = sobjset;
        zseg->seg_start.zb_object = sobject;
        zseg->seg_start.zb_level = slevel;
        zseg->seg_start.zb_blkid = sblkid;

        zseg->seg_end.zb_objset = eobjset;
        zseg->seg_end.zb_object = eobject;
        zseg->seg_end.zb_level = elevel;
        zseg->seg_end.zb_blkid = eblkid;

        list_insert_tail(&th->th_seglist, zseg);
}
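
/*
 * Illustrative sketch (not part of the original file): because the txg
 * interval is open, passing a snapshot's birth txg as mintxg covers only
 * blocks born strictly after that snapshot, which is the basis of
 * incremental traversal; maxtxg = -1ULL leaves the upper bound unlimited.
 * The pre-order segment below mirrors what traverse_add_objset() adds for
 * ADVANCE_PRE; the helper name is hypothetical.
 */
#if 0   /* example only, not compiled */
static void
traverse_add_incremental(traverse_handle_t *th, uint64_t os, uint64_t snap_txg)
{
        /* objset 'os', every object, blocks with snap_txg < birth */
        traverse_add_segment(th, snap_txg, -1ULL,
            os, 0, -1, 0,
            os, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
}
#endif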

void
traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t objset, uint64_t object)
{
        if (th->th_advance & ADVANCE_PRE)
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, object, ZB_MAXLEVEL, 0,
                    objset, object, 0, ZB_MAXBLKID);
        else
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, object, 0, 0,
                    objset, object, 0, ZB_MAXBLKID);
}

void
traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t objset)
{
        if (th->th_advance & ADVANCE_PRE)
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, 0, -1, 0,
                    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
        else
                traverse_add_segment(th, mintxg, maxtxg,
                    objset, 1, 0, 0,
                    objset, 0, -1, 0);
}

void
traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
{
        if (th->th_advance & ADVANCE_PRE)
                traverse_add_segment(th, mintxg, maxtxg,
                    0, 0, -1, 0,
                    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
        else
                traverse_add_segment(th, mintxg, maxtxg,
                    1, 1, 0, 0,
                    0, 0, -1, 0);
}
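
/*
 * Illustrative sketch (not part of the original file): a whole-pool
 * traversal in the style of traverse_dsl_dataset() above, counting every
 * block pointer visited.  It assumes blkptr_cb_t has the signature implied
 * by the call in traverse_callback():
 * int (*)(traverse_blk_cache_t *, spa_t *, void *).
 */
#if 0   /* example only, not compiled */
/* ARGSUSED */
static int
count_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
        uint64_t *countp = arg;

        if (bc->bc_errno == 0)
                (*countp)++;

        return (0);             /* keep traversing */
}

static int
count_pool_blocks(spa_t *spa, uint64_t *countp)
{
        traverse_handle_t *th;
        int err;

        *countp = 0;

        th = traverse_init(spa, count_blkptr_cb, countp, ADVANCE_PRE,
            ZIO_FLAG_MUSTSUCCEED);

        traverse_add_pool(th, 0, -1ULL);

        while ((err = traverse_more(th)) == EAGAIN)
                continue;

        traverse_fini(th);
        return (err);
}
#endif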

traverse_handle_t *
traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
    int zio_flags)
{
        traverse_handle_t *th;
        int d, l;

        th = kmem_zalloc(sizeof (*th), KM_SLEEP);

        th->th_spa = spa;
        th->th_func = func;
        th->th_arg = arg;
        th->th_advance = advance;
        th->th_lastcb.zb_level = ZB_NO_LEVEL;
        th->th_noread.zb_level = ZB_NO_LEVEL;
        th->th_zio_flags = zio_flags;

        list_create(&th->th_seglist, sizeof (zseg_t),
            offsetof(zseg_t, seg_node));

        for (d = 0; d < ZB_DEPTH; d++) {
                for (l = 0; l < ZB_MAXLEVEL; l++) {
                        if ((advance & ADVANCE_DATA) ||
                            l != 0 || d != ZB_DN_CACHE)
                                th->th_cache[d][l].bc_data =
                                    zio_buf_alloc(SPA_MAXBLOCKSIZE);
                }
        }

        return (th);
}

void
traverse_fini(traverse_handle_t *th)
{
        int d, l;
        zseg_t *zseg;

        for (d = 0; d < ZB_DEPTH; d++)
                for (l = 0; l < ZB_MAXLEVEL; l++)
                        if (th->th_cache[d][l].bc_data != NULL)
                                zio_buf_free(th->th_cache[d][l].bc_data,
                                    SPA_MAXBLOCKSIZE);

        while ((zseg = list_head(&th->th_seglist)) != NULL) {
                list_remove(&th->th_seglist, zseg);
                kmem_free(zseg, sizeof (*zseg));
        }

        list_destroy(&th->th_seglist);

        dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
            th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
            th->th_syncs, th->th_restarts);

        kmem_free(th, sizeof (*th));
}