FreeBSD/FreeBSD.git: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2016 Gary Mills
24  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
25  * Copyright (c) 2017 Datto Inc.
26  */
27
28 #include <sys/dsl_scan.h>
29 #include <sys/dsl_pool.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dsl_prop.h>
32 #include <sys/dsl_dir.h>
33 #include <sys/dsl_synctask.h>
34 #include <sys/dnode.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/dmu_objset.h>
37 #include <sys/arc.h>
38 #include <sys/zap.h>
39 #include <sys/zio.h>
40 #include <sys/zfs_context.h>
41 #include <sys/fs/zfs.h>
42 #include <sys/zfs_znode.h>
43 #include <sys/spa_impl.h>
44 #include <sys/vdev_impl.h>
45 #include <sys/zil_impl.h>
46 #include <sys/zio_checksum.h>
47 #include <sys/ddt.h>
48 #include <sys/sa.h>
49 #include <sys/sa_impl.h>
50 #include <sys/zfeature.h>
51 #include <sys/abd.h>
52 #ifdef _KERNEL
53 #include <sys/zfs_vfsops.h>
54 #endif
55
56 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
57     const zbookmark_phys_t *);
58
59 static scan_cb_t dsl_scan_scrub_cb;
60 static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
61 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
62 static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
63
64 unsigned int zfs_top_maxinflight = 32;  /* maximum I/Os per top-level */
65 unsigned int zfs_resilver_delay = 2;    /* number of ticks to delay resilver */
66 unsigned int zfs_scrub_delay = 4;       /* number of ticks to delay scrub */
67 unsigned int zfs_scan_idle = 50;        /* idle window in clock ticks */
68
69 unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
70 unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
71 unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
72                                                  per txg */
73 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
74 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
75
76 SYSCTL_DECL(_vfs_zfs);
77 SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
78     &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
79 SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
80     &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
81 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
82     &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
83 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
84     &zfs_scan_idle, 0, "Idle scan window in clock ticks");
85 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
86     &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
87 SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
88     &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
89 SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
90     &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
91 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
92     &zfs_no_scrub_io, 0, "Disable scrub I/O");
93 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
94     &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
95
96 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
97 /* max number of blocks to free in a single TXG */
98 uint64_t zfs_free_max_blocks = UINT64_MAX;
99 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
100     &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
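/*
 * Example usage, assuming the standard FreeBSD handling of CTLFLAG_RWTUN
 * OIDs under the vfs.zfs sysctl tree (sysctl(8) at runtime, loader.conf
 * tunables at boot):
 *
 *	sysctl vfs.zfs.scan_min_time_ms		# read the current value
 *	sysctl vfs.zfs.scan_min_time_ms=2000	# scrub >= 2000 ms per txg
 *	sysctl vfs.zfs.no_scrub_prefetch=1	# disable scrub prefetching
 */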
101
102
103 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
104         ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
105         (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
106
107 extern int zfs_txg_timeout;
108
109 /*
110  * Enable/disable the processing of the free_bpobj object.
111  */
112 boolean_t zfs_free_bpobj_enabled = B_TRUE;
113
114 SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
115     &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
116
117 /* the order has to match pool_scan_type */
118 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
119         NULL,
120         dsl_scan_scrub_cb,      /* POOL_SCAN_SCRUB */
121         dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
122 };
123
124 int
125 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
126 {
127         int err;
128         dsl_scan_t *scn;
129         spa_t *spa = dp->dp_spa;
130         uint64_t f;
131
132         scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
133         scn->scn_dp = dp;
134
135         /*
136          * It's possible that we're resuming a scan after a reboot so
137          * make sure that the scan_async_destroying flag is initialized
138          * appropriately.
139          */
140         ASSERT(!scn->scn_async_destroying);
141         scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
142             SPA_FEATURE_ASYNC_DESTROY);
143
144         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
145             "scrub_func", sizeof (uint64_t), 1, &f);
146         if (err == 0) {
147                 /*
148                  * There was an old-style scrub in progress.  Restart a
149                  * new-style scrub from the beginning.
150                  */
151                 scn->scn_restart_txg = txg;
152                 zfs_dbgmsg("old-style scrub was in progress; "
153                     "restarting new-style scrub in txg %llu",
154                     scn->scn_restart_txg);
155
156                 /*
157                  * Load the queue obj from the old location so that it
158                  * can be freed by dsl_scan_done().
159                  */
160                 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
161                     "scrub_queue", sizeof (uint64_t), 1,
162                     &scn->scn_phys.scn_queue_obj);
163         } else {
164                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
165                     DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
166                     &scn->scn_phys);
167                 if (err == ENOENT)
168                         return (0);
169                 else if (err)
170                         return (err);
171
172                 if (scn->scn_phys.scn_state == DSS_SCANNING &&
173                     spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
174                         /*
175                          * A new-type scrub was in progress on an old
176                          * pool, and the pool was accessed by old
177                          * software.  Restart from the beginning, since
178                          * the old software may have changed the pool in
179                          * the meantime.
180                          */
181                         scn->scn_restart_txg = txg;
182                         zfs_dbgmsg("new-style scrub was modified "
183                             "by old software; restarting in txg %llu",
184                             scn->scn_restart_txg);
185                 }
186         }
187
188         spa_scan_stat_init(spa);
189         return (0);
190 }
191
192 void
193 dsl_scan_fini(dsl_pool_t *dp)
194 {
195         if (dp->dp_scan) {
196                 kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
197                 dp->dp_scan = NULL;
198         }
199 }
200
201 /* ARGSUSED */
202 static int
203 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
204 {
205         dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
206
207         if (scn->scn_phys.scn_state == DSS_SCANNING)
208                 return (SET_ERROR(EBUSY));
209
210         return (0);
211 }
212
213 static void
214 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
215 {
216         dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
217         pool_scan_func_t *funcp = arg;
218         dmu_object_type_t ot = 0;
219         dsl_pool_t *dp = scn->scn_dp;
220         spa_t *spa = dp->dp_spa;
221
222         ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
223         ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
224         bzero(&scn->scn_phys, sizeof (scn->scn_phys));
225         scn->scn_phys.scn_func = *funcp;
226         scn->scn_phys.scn_state = DSS_SCANNING;
227         scn->scn_phys.scn_min_txg = 0;
228         scn->scn_phys.scn_max_txg = tx->tx_txg;
229         scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
230         scn->scn_phys.scn_start_time = gethrestime_sec();
231         scn->scn_phys.scn_errors = 0;
232         scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
233         scn->scn_restart_txg = 0;
234         scn->scn_done_txg = 0;
235         spa_scan_stat_init(spa);
236
237         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
238                 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
239
240                 /* rewrite all disk labels */
241                 vdev_config_dirty(spa->spa_root_vdev);
242
243                 if (vdev_resilver_needed(spa->spa_root_vdev,
244                     &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
245                         spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
246                 } else {
247                         spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
248                 }
249
250                 spa->spa_scrub_started = B_TRUE;
251                 /*
252                  * If this is an incremental scrub, limit the DDT scrub phase
253                  * to just the auto-ditto class (for correctness); the rest
254                  * of the scrub should go faster using top-down pruning.
255                  */
256                 if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
257                         scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
258
259         }
260
261         /* back to the generic stuff */
262
263         if (dp->dp_blkstats == NULL) {
264                 dp->dp_blkstats =
265                     kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
266         }
267         bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
268
269         if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
270                 ot = DMU_OT_ZAP_OTHER;
271
272         scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
273             ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
274
275         dsl_scan_sync_state(scn, tx);
276
277         spa_history_log_internal(spa, "scan setup", tx,
278             "func=%u mintxg=%llu maxtxg=%llu",
279             *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
280 }
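/*
 * For context (based on surrounding ZFS code not shown in this excerpt):
 * dsl_scan(), defined later in this file, runs dsl_scan_setup_check() and
 * dsl_scan_setup_sync() as a dsl_sync_task; spa_scan() is the usual caller
 * on behalf of "zpool scrub" or a resilver request.
 */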
281
282 /* ARGSUSED */
283 static void
284 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
285 {
286         static const char *old_names[] = {
287                 "scrub_bookmark",
288                 "scrub_ddt_bookmark",
289                 "scrub_ddt_class_max",
290                 "scrub_queue",
291                 "scrub_min_txg",
292                 "scrub_max_txg",
293                 "scrub_func",
294                 "scrub_errors",
295                 NULL
296         };
297
298         dsl_pool_t *dp = scn->scn_dp;
299         spa_t *spa = dp->dp_spa;
300         int i;
301
302         /* Remove any remnants of an old-style scrub. */
303         for (i = 0; old_names[i]; i++) {
304                 (void) zap_remove(dp->dp_meta_objset,
305                     DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
306         }
307
308         if (scn->scn_phys.scn_queue_obj != 0) {
309                 VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
310                     scn->scn_phys.scn_queue_obj, tx));
311                 scn->scn_phys.scn_queue_obj = 0;
312         }
313
314         scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
315
316         /*
317          * If we were "restarted" from a stopped state, don't bother
318          * with anything else.
319          */
320         if (scn->scn_phys.scn_state != DSS_SCANNING)
321                 return;
322
323         if (complete)
324                 scn->scn_phys.scn_state = DSS_FINISHED;
325         else
326                 scn->scn_phys.scn_state = DSS_CANCELED;
327
328         if (dsl_scan_restarting(scn, tx))
329                 spa_history_log_internal(spa, "scan aborted, restarting", tx,
330                     "errors=%llu", spa_get_errlog_size(spa));
331         else if (!complete)
332                 spa_history_log_internal(spa, "scan cancelled", tx,
333                     "errors=%llu", spa_get_errlog_size(spa));
334         else
335                 spa_history_log_internal(spa, "scan done", tx,
336                     "errors=%llu", spa_get_errlog_size(spa));
337
338         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
339                 mutex_enter(&spa->spa_scrub_lock);
340                 while (spa->spa_scrub_inflight > 0) {
341                         cv_wait(&spa->spa_scrub_io_cv,
342                             &spa->spa_scrub_lock);
343                 }
344                 mutex_exit(&spa->spa_scrub_lock);
345                 spa->spa_scrub_started = B_FALSE;
346                 spa->spa_scrub_active = B_FALSE;
347
348                 /*
349                  * If the scrub/resilver completed, update all DTLs to
350                  * reflect this.  Whether it succeeded or not, vacate
351                  * all temporary scrub DTLs.
352                  */
353                 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
354                     complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
355                 if (complete) {
356                         spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
357                             ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
358                 }
359                 spa_errlog_rotate(spa);
360
361                 /*
362                  * We may have finished replacing a device.
363                  * Let the async thread assess this and handle the detach.
364                  */
365                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
366         }
367
368         scn->scn_phys.scn_end_time = gethrestime_sec();
369 }
370
371 /* ARGSUSED */
372 static int
373 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
374 {
375         dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
376
377         if (scn->scn_phys.scn_state != DSS_SCANNING)
378                 return (SET_ERROR(ENOENT));
379         return (0);
380 }
381
382 /* ARGSUSED */
383 static void
384 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
385 {
386         dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
387
388         dsl_scan_done(scn, B_FALSE, tx);
389         dsl_scan_sync_state(scn, tx);
390 }
391
392 int
393 dsl_scan_cancel(dsl_pool_t *dp)
394 {
395         return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
396             dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
397 }
398
399 boolean_t
400 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
401 {
402         if (dsl_scan_scrubbing(scn->scn_dp) &&
403             scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED)
404                 return (B_TRUE);
405
406         return (B_FALSE);
407 }
408
409 static int
410 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
411 {
412         pool_scrub_cmd_t *cmd = arg;
413         dsl_pool_t *dp = dmu_tx_pool(tx);
414         dsl_scan_t *scn = dp->dp_scan;
415
416         if (*cmd == POOL_SCRUB_PAUSE) {
417                 /* can't pause a scrub when there is no in-progress scrub */
418                 if (!dsl_scan_scrubbing(dp))
419                         return (SET_ERROR(ENOENT));
420
421                 /* can't pause a paused scrub */
422                 if (dsl_scan_is_paused_scrub(scn))
423                         return (SET_ERROR(EBUSY));
424         } else if (*cmd != POOL_SCRUB_NORMAL) {
425                 return (SET_ERROR(ENOTSUP));
426         }
427
428         return (0);
429 }
430
431 static void
432 dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
433 {
434         pool_scrub_cmd_t *cmd = arg;
435         dsl_pool_t *dp = dmu_tx_pool(tx);
436         spa_t *spa = dp->dp_spa;
437         dsl_scan_t *scn = dp->dp_scan;
438
439         if (*cmd == POOL_SCRUB_PAUSE) {
440                 /* the check callback already verified an in-progress scrub */
441                 spa->spa_scan_pass_scrub_pause = gethrestime_sec();
442                 scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
443                 dsl_scan_sync_state(scn, tx);
444         } else {
445                 ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
446                 if (dsl_scan_is_paused_scrub(scn)) {
447                         /*
448                          * We need to keep track of how much time we spend
449                          * paused per pass so that we can adjust the scrub rate
450                          * shown in the output of 'zpool status'
451                          */
452                         spa->spa_scan_pass_scrub_spent_paused +=
453                             gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
454                         spa->spa_scan_pass_scrub_pause = 0;
455                         scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
456                         dsl_scan_sync_state(scn, tx);
457                 }
458         }
459 }
460
461 /*
462  * Set scrub pause/resume state if it makes sense to do so
463  */
464 int
465 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
466 {
467         return (dsl_sync_task(spa_name(dp->dp_spa),
468             dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
469             ZFS_SPACE_CHECK_RESERVED));
470 }
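/*
 * For context, assuming the matching userland support: "zpool scrub -p pool"
 * reaches this function with POOL_SCRUB_PAUSE, and a later "zpool scrub pool"
 * resumes the paused scrub by calling it again with POOL_SCRUB_NORMAL.
 */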
471
472 boolean_t
473 dsl_scan_scrubbing(const dsl_pool_t *dp)
474 {
475         dsl_scan_t *scn = dp->dp_scan;
476
477         if (scn->scn_phys.scn_state == DSS_SCANNING &&
478             scn->scn_phys.scn_func == POOL_SCAN_SCRUB)
479                 return (B_TRUE);
480
481         return (B_FALSE);
482 }
483
484 static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
485     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
486     dmu_objset_type_t ostype, dmu_tx_t *tx);
487 static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
488     dmu_objset_type_t ostype,
489     dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
490
491 void
492 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
493 {
494         zio_free(dp->dp_spa, txg, bp);
495 }
496
497 void
498 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
499 {
500         ASSERT(dsl_pool_sync_context(dp));
501         zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
502             pio->io_flags));
503 }
504
505 static uint64_t
506 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
507 {
508         uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
509         if (ds->ds_is_snapshot)
510                 return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
511         return (smt);
512 }
513
514 static void
515 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
516 {
517         VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
518             DMU_POOL_DIRECTORY_OBJECT,
519             DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
520             &scn->scn_phys, tx));
521 }
522
523 extern int zfs_vdev_async_write_active_min_dirty_percent;
524
525 static boolean_t
526 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
527 {
528         /* we never skip user/group accounting objects */
529         if (zb && (int64_t)zb->zb_object < 0)
530                 return (B_FALSE);
531
532         if (scn->scn_suspending)
533                 return (B_TRUE); /* we're already suspending */
534
535         if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
536                 return (B_FALSE); /* we're resuming */
537
538         /* We only know how to resume from level-0 blocks. */
539         if (zb && zb->zb_level != 0)
540                 return (B_FALSE);
541
542         /*
543          * We suspend if:
544          *  - we have scanned for the maximum time: an entire txg
545          *    timeout (default 5 sec)
546          *  or
547          *  - we have scanned for at least the minimum time (default 1 sec
548          *    for scrub, 3 sec for resilver), and either we have sufficient
549          *    dirty data that we are starting to write more quickly
550          *    (default 30%), or someone is explicitly waiting for this txg
551          *    to complete.
552          *  or
553          *  - the spa is shutting down because this pool is being exported
554          *    or the machine is rebooting.
555          */
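        /*
         * Concretely, with the defaults cited above: a scrub pass gives up
         * its txg after 5 seconds unconditionally, or after 1 second
         * (3 seconds for a resilver) once someone is waiting on the txg or
         * dirty data reaches 30% of zfs_dirty_data_max.
         */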
556         int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
557             zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
558         uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
559         int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
560         if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
561             (NSEC2MSEC(elapsed_nanosecs) > mintime &&
562             (txg_sync_waiting(scn->scn_dp) ||
563             dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
564             spa_shutting_down(scn->scn_dp->dp_spa)) {
565                 if (zb) {
566                         dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
567                             (longlong_t)zb->zb_objset,
568                             (longlong_t)zb->zb_object,
569                             (longlong_t)zb->zb_level,
570                             (longlong_t)zb->zb_blkid);
571                         scn->scn_phys.scn_bookmark = *zb;
572                 }
573                 dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
574                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
575                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
576                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
577                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
578                 scn->scn_suspending = B_TRUE;
579                 return (B_TRUE);
580         }
581         return (B_FALSE);
582 }
583
584 typedef struct zil_scan_arg {
585         dsl_pool_t      *zsa_dp;
586         zil_header_t    *zsa_zh;
587 } zil_scan_arg_t;
588
589 /* ARGSUSED */
590 static int
591 dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
592 {
593         zil_scan_arg_t *zsa = arg;
594         dsl_pool_t *dp = zsa->zsa_dp;
595         dsl_scan_t *scn = dp->dp_scan;
596         zil_header_t *zh = zsa->zsa_zh;
597         zbookmark_phys_t zb;
598
599         if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
600                 return (0);
601
602         /*
603          * One block ("stubby") can be allocated a long time ago; we
604          * want to visit that one because it has been allocated
605          * (on-disk) even if it hasn't been claimed (even though for
606          * scrub there's nothing to do to it).
607          */
608         if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
609                 return (0);
610
611         SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
612             ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
613
614         VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
615         return (0);
616 }
617
618 /* ARGSUSED */
619 static int
620 dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
621 {
622         if (lrc->lrc_txtype == TX_WRITE) {
623                 zil_scan_arg_t *zsa = arg;
624                 dsl_pool_t *dp = zsa->zsa_dp;
625                 dsl_scan_t *scn = dp->dp_scan;
626                 zil_header_t *zh = zsa->zsa_zh;
627                 lr_write_t *lr = (lr_write_t *)lrc;
628                 blkptr_t *bp = &lr->lr_blkptr;
629                 zbookmark_phys_t zb;
630
631                 if (BP_IS_HOLE(bp) ||
632                     bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
633                         return (0);
634
635                 /*
636                  * birth can be < claim_txg if this record's txg is
637                  * already txg sync'ed (but this log block contains
638                  * other records that are not synced)
639                  */
640                 if (claim_txg == 0 || bp->blk_birth < claim_txg)
641                         return (0);
642
643                 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
644                     lr->lr_foid, ZB_ZIL_LEVEL,
645                     lr->lr_offset / BP_GET_LSIZE(bp));
646
647                 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
648         }
649         return (0);
650 }
651
652 static void
653 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
654 {
655         uint64_t claim_txg = zh->zh_claim_txg;
656         zil_scan_arg_t zsa = { dp, zh };
657         zilog_t *zilog;
658
659         /*
660          * We only want to visit blocks that have been claimed but not yet
661          * replayed (or, in read-only mode, blocks that *would* be claimed).
662          */
663         if (claim_txg == 0 && spa_writeable(dp->dp_spa))
664                 return;
665
666         zilog = zil_alloc(dp->dp_meta_objset, zh);
667
668         (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
669             claim_txg);
670
671         zil_free(zilog);
672 }
673
674 /* ARGSUSED */
675 static void
676 dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
677     uint64_t objset, uint64_t object, uint64_t blkid)
678 {
679         zbookmark_phys_t czb;
680         arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
681
682         if (zfs_no_scrub_prefetch)
683                 return;
684
685         if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
686             (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
687                 return;
688
689         SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
690
691         (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
692             NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
693             ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
694 }
695
696 static boolean_t
697 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
698     const zbookmark_phys_t *zb)
699 {
700         /*
701          * We never skip over user/group accounting objects (obj<0)
702          */
703         if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
704             (int64_t)zb->zb_object >= 0) {
705                 /*
706                  * If we already visited this bp & everything below (in
707                  * a prior txg sync), don't bother doing it again.
708                  */
709                 if (zbookmark_subtree_completed(dnp, zb,
710                     &scn->scn_phys.scn_bookmark))
711                         return (B_TRUE);
712
713                 /*
714                  * If we found the block we're trying to resume from, or
715                  * we went past it to a different object, zero it out to
716                  * indicate that it's OK to start checking for suspending
717                  * again.
718                  */
719                 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
720                     zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
721                         dprintf("resuming at %llx/%llx/%llx/%llx\n",
722                             (longlong_t)zb->zb_objset,
723                             (longlong_t)zb->zb_object,
724                             (longlong_t)zb->zb_level,
725                             (longlong_t)zb->zb_blkid);
726                         bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
727                 }
728         }
729         return (B_FALSE);
730 }
731
732 /*
733  * Return nonzero on i/o error.
734  * Children of indirect, dnode, and objset blocks are visited here.
735  */
736 static int
737 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
738     dnode_phys_t *dnp, const blkptr_t *bp,
739     const zbookmark_phys_t *zb, dmu_tx_t *tx)
740 {
741         dsl_pool_t *dp = scn->scn_dp;
742         int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
743         int err;
744
745         if (BP_GET_LEVEL(bp) > 0) {
746                 arc_flags_t flags = ARC_FLAG_WAIT;
747                 int i;
748                 blkptr_t *cbp;
749                 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
750                 arc_buf_t *buf;
751
752                 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
753                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
754                 if (err) {
755                         scn->scn_phys.scn_errors++;
756                         return (err);
757                 }
758                 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
759                         dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
760                             zb->zb_object, zb->zb_blkid * epb + i);
761                 }
762                 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
763                         zbookmark_phys_t czb;
764
765                         SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
766                             zb->zb_level - 1,
767                             zb->zb_blkid * epb + i);
768                         dsl_scan_visitbp(cbp, &czb, dnp,
769                             ds, scn, ostype, tx);
770                 }
771                 arc_buf_destroy(buf, &buf);
772         } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
773                 arc_flags_t flags = ARC_FLAG_WAIT;
774                 dnode_phys_t *cdnp;
775                 int i, j;
776                 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
777                 arc_buf_t *buf;
778
779                 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
780                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
781                 if (err) {
782                         scn->scn_phys.scn_errors++;
783                         return (err);
784                 }
785                 for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
786                         for (j = 0; j < cdnp->dn_nblkptr; j++) {
787                                 blkptr_t *cbp = &cdnp->dn_blkptr[j];
788                                 dsl_scan_prefetch(scn, buf, cbp,
789                                     zb->zb_objset, zb->zb_blkid * epb + i, j);
790                         }
791                 }
792                 for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
793                         dsl_scan_visitdnode(scn, ds, ostype,
794                             cdnp, zb->zb_blkid * epb + i, tx);
795                 }
796
797                 arc_buf_destroy(buf, &buf);
798         } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
799                 arc_flags_t flags = ARC_FLAG_WAIT;
800                 objset_phys_t *osp;
801                 arc_buf_t *buf;
802
803                 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
804                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
805                 if (err) {
806                         scn->scn_phys.scn_errors++;
807                         return (err);
808                 }
809
810                 osp = buf->b_data;
811
812                 dsl_scan_visitdnode(scn, ds, osp->os_type,
813                     &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
814
815                 if (OBJSET_BUF_HAS_USERUSED(buf)) {
816                         /*
817                          * We also always visit user/group accounting
818                          * objects, and never skip them, even if we are
819                          * suspending.  This is necessary so that the space
820                          * deltas from this txg get integrated.
821                          */
822                         dsl_scan_visitdnode(scn, ds, osp->os_type,
823                             &osp->os_groupused_dnode,
824                             DMU_GROUPUSED_OBJECT, tx);
825                         dsl_scan_visitdnode(scn, ds, osp->os_type,
826                             &osp->os_userused_dnode,
827                             DMU_USERUSED_OBJECT, tx);
828                 }
829                 arc_buf_destroy(buf, &buf);
830         }
831
832         return (0);
833 }
834
835 static void
836 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
837     dmu_objset_type_t ostype, dnode_phys_t *dnp,
838     uint64_t object, dmu_tx_t *tx)
839 {
840         int j;
841
842         for (j = 0; j < dnp->dn_nblkptr; j++) {
843                 zbookmark_phys_t czb;
844
845                 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
846                     dnp->dn_nlevels - 1, j);
847                 dsl_scan_visitbp(&dnp->dn_blkptr[j],
848                     &czb, dnp, ds, scn, ostype, tx);
849         }
850
851         if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
852                 zbookmark_phys_t czb;
853                 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
854                     0, DMU_SPILL_BLKID);
855                 dsl_scan_visitbp(&dnp->dn_spill,
856                     &czb, dnp, ds, scn, ostype, tx);
857         }
858 }
859
860 /*
861  * The arguments are in this order because mdb can only print the
862  * first 5; we want them to be useful.
863  */
864 static void
865 dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
866     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
867     dmu_objset_type_t ostype, dmu_tx_t *tx)
868 {
869         dsl_pool_t *dp = scn->scn_dp;
870         arc_buf_t *buf = NULL;
871         blkptr_t bp_toread = *bp;
872
873         /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
874
875         if (dsl_scan_check_suspend(scn, zb))
876                 return;
877
878         if (dsl_scan_check_resume(scn, dnp, zb))
879                 return;
880
881         if (BP_IS_HOLE(bp))
882                 return;
883
884         scn->scn_visited_this_txg++;
885
886         dprintf_bp(bp,
887             "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
888             ds, ds ? ds->ds_object : 0,
889             zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
890             bp);
891
892         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
893                 return;
894
895         if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
896                 return;
897
898         /*
899          * If dsl_scan_ddt() has already visited this block, it will have
900          * already done any translations or scrubbing, so don't call the
901          * callback again.
902          */
903         if (ddt_class_contains(dp->dp_spa,
904             scn->scn_phys.scn_ddt_class_max, bp)) {
905                 ASSERT(buf == NULL);
906                 return;
907         }
908
909         /*
910          * If this block is from the future (after cur_max_txg), then we
911          * are doing this on behalf of a deleted snapshot, and we will
912          * revisit the future block on the next pass of this dataset.
913          * Don't scan it now unless we need to because something
914          * under it was modified.
915          */
916         if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
917                 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
918         }
919 }
920
921 static void
922 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
923     dmu_tx_t *tx)
924 {
925         zbookmark_phys_t zb;
926
927         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
928             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
929         dsl_scan_visitbp(bp, &zb, NULL,
930             ds, scn, DMU_OST_NONE, tx);
931
932         dprintf_ds(ds, "finished scan%s", "");
933 }
934
935 void
936 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
937 {
938         dsl_pool_t *dp = ds->ds_dir->dd_pool;
939         dsl_scan_t *scn = dp->dp_scan;
940         uint64_t mintxg;
941
942         if (scn->scn_phys.scn_state != DSS_SCANNING)
943                 return;
944
945         if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
946                 if (ds->ds_is_snapshot) {
947                         /*
948                          * Note:
949                          *  - scn_cur_{min,max}_txg stays the same.
950                          *  - Setting the flag is not really necessary if
951                          *    scn_cur_max_txg == scn_max_txg, because there
952                          *    is nothing after this snapshot that we care
953                          *    about.  However, we set it anyway and then
954                          *    ignore it when we retraverse it in
955                          *    dsl_scan_visitds().
956                          */
957                         scn->scn_phys.scn_bookmark.zb_objset =
958                             dsl_dataset_phys(ds)->ds_next_snap_obj;
959                         zfs_dbgmsg("destroying ds %llu; currently traversing; "
960                             "reset zb_objset to %llu",
961                             (u_longlong_t)ds->ds_object,
962                             (u_longlong_t)dsl_dataset_phys(ds)->
963                             ds_next_snap_obj);
964                         scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
965                 } else {
966                         SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
967                             ZB_DESTROYED_OBJSET, 0, 0, 0);
968                         zfs_dbgmsg("destroying ds %llu; currently traversing; "
969                             "reset bookmark to -1,0,0,0",
970                             (u_longlong_t)ds->ds_object);
971                 }
972         } else if (zap_lookup_int_key(dp->dp_meta_objset,
973             scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
974                 ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
975                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
976                     scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
977                 if (ds->ds_is_snapshot) {
978                         /*
979                          * We keep the same mintxg; it could be >
980                          * ds_creation_txg if the previous snapshot was
981                          * deleted too.
982                          */
983                         VERIFY(zap_add_int_key(dp->dp_meta_objset,
984                             scn->scn_phys.scn_queue_obj,
985                             dsl_dataset_phys(ds)->ds_next_snap_obj,
986                             mintxg, tx) == 0);
987                         zfs_dbgmsg("destroying ds %llu; in queue; "
988                             "replacing with %llu",
989                             (u_longlong_t)ds->ds_object,
990                             (u_longlong_t)dsl_dataset_phys(ds)->
991                             ds_next_snap_obj);
992                 } else {
993                         zfs_dbgmsg("destroying ds %llu; in queue; removing",
994                             (u_longlong_t)ds->ds_object);
995                 }
996         }
997
998         /*
999          * dsl_scan_sync() should be called after this, and should sync
1000          * out our changed state, but just to be safe, do it here.
1001          */
1002         dsl_scan_sync_state(scn, tx);
1003 }
1004
1005 void
1006 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
1007 {
1008         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1009         dsl_scan_t *scn = dp->dp_scan;
1010         uint64_t mintxg;
1011
1012         if (scn->scn_phys.scn_state != DSS_SCANNING)
1013                 return;
1014
1015         ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
1016
1017         if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
1018                 scn->scn_phys.scn_bookmark.zb_objset =
1019                     dsl_dataset_phys(ds)->ds_prev_snap_obj;
1020                 zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
1021                     "reset zb_objset to %llu",
1022                     (u_longlong_t)ds->ds_object,
1023                     (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
1024         } else if (zap_lookup_int_key(dp->dp_meta_objset,
1025             scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
1026                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1027                     scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
1028                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1029                     scn->scn_phys.scn_queue_obj,
1030                     dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
1031                 zfs_dbgmsg("snapshotting ds %llu; in queue; "
1032                     "replacing with %llu",
1033                     (u_longlong_t)ds->ds_object,
1034                     (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
1035         }
1036         dsl_scan_sync_state(scn, tx);
1037 }
1038
1039 void
1040 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
1041 {
1042         dsl_pool_t *dp = ds1->ds_dir->dd_pool;
1043         dsl_scan_t *scn = dp->dp_scan;
1044         uint64_t mintxg;
1045
1046         if (scn->scn_phys.scn_state != DSS_SCANNING)
1047                 return;
1048
1049         if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
1050                 scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
1051                 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
1052                     "reset zb_objset to %llu",
1053                     (u_longlong_t)ds1->ds_object,
1054                     (u_longlong_t)ds2->ds_object);
1055         } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
1056                 scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
1057                 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
1058                     "reset zb_objset to %llu",
1059                     (u_longlong_t)ds2->ds_object,
1060                     (u_longlong_t)ds1->ds_object);
1061         }
1062
1063         if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1064             ds1->ds_object, &mintxg) == 0) {
1065                 int err;
1066
1067                 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
1068                 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
1069                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1070                     scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
1071                 err = zap_add_int_key(dp->dp_meta_objset,
1072                     scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
1073                 VERIFY(err == 0 || err == EEXIST);
1074                 if (err == EEXIST) {
1075                         /* Both were there to begin with */
1076                         VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
1077                             scn->scn_phys.scn_queue_obj,
1078                             ds1->ds_object, mintxg, tx));
1079                 }
1080                 zfs_dbgmsg("clone_swap ds %llu; in queue; "
1081                     "replacing with %llu",
1082                     (u_longlong_t)ds1->ds_object,
1083                     (u_longlong_t)ds2->ds_object);
1084         } else if (zap_lookup_int_key(dp->dp_meta_objset,
1085             scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
1086                 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
1087                 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
1088                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1089                     scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
1090                 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
1091                     scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
1092                 zfs_dbgmsg("clone_swap ds %llu; in queue; "
1093                     "replacing with %llu",
1094                     (u_longlong_t)ds2->ds_object,
1095                     (u_longlong_t)ds1->ds_object);
1096         }
1097
1098         dsl_scan_sync_state(scn, tx);
1099 }
1100
1101 struct enqueue_clones_arg {
1102         dmu_tx_t *tx;
1103         uint64_t originobj;
1104 };
1105
1106 /* ARGSUSED */
1107 static int
1108 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
1109 {
1110         struct enqueue_clones_arg *eca = arg;
1111         dsl_dataset_t *ds;
1112         int err;
1113         dsl_scan_t *scn = dp->dp_scan;
1114
1115         if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
1116                 return (0);
1117
1118         err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
1119         if (err)
1120                 return (err);
1121
1122         while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
1123                 dsl_dataset_t *prev;
1124                 err = dsl_dataset_hold_obj(dp,
1125                     dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1126
1127                 dsl_dataset_rele(ds, FTAG);
1128                 if (err)
1129                         return (err);
1130                 ds = prev;
1131         }
1132         VERIFY(zap_add_int_key(dp->dp_meta_objset,
1133             scn->scn_phys.scn_queue_obj, ds->ds_object,
1134             dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
1135         dsl_dataset_rele(ds, FTAG);
1136         return (0);
1137 }
1138
1139 static void
1140 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
1141 {
1142         dsl_pool_t *dp = scn->scn_dp;
1143         dsl_dataset_t *ds;
1144         objset_t *os;
1145
1146         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1147
1148         if (scn->scn_phys.scn_cur_min_txg >=
1149             scn->scn_phys.scn_max_txg) {
1150                 /*
1151                  * This can happen if this snapshot was created after the
1152                  * scan started, and we already completed a previous snapshot
1153                  * that was created after the scan started.  This snapshot
1154                  * only references blocks with:
1155                  *
1156                  *      birth < our ds_creation_txg
1157                  *      cur_min_txg is no less than ds_creation_txg.
1158                  *      We have already visited these blocks.
1159                  * or
1160                  *      birth > scn_max_txg
1161                  *      The scan requested not to visit these blocks.
1162                  *
1163                  * Subsequent snapshots (and clones) can reference our
1164                  * blocks, or blocks with even higher birth times.
1165                  * Therefore we do not need to visit them either,
1166                  * so we do not add them to the work queue.
1167                  *
1168                  * Note that checking for cur_min_txg >= cur_max_txg
1169                  * is not sufficient, because in that case we may need to
1170                  * visit subsequent snapshots.  This happens when min_txg > 0,
1171                  * which raises cur_min_txg.  In this case we will visit
1172                  * this dataset but skip all of its blocks, because the
1173                  * rootbp's birth time is < cur_min_txg.  Then we will
1174                  * add the next snapshots/clones to the work queue.
1175                  */
1176                 char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1177                 dsl_dataset_name(ds, dsname);
1178                 zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
1179                     "cur_min_txg (%llu) >= max_txg (%llu)",
1180                     dsobj, dsname,
1181                     scn->scn_phys.scn_cur_min_txg,
1182                     scn->scn_phys.scn_max_txg);
1183                 kmem_free(dsname, MAXNAMELEN);
1184
1185                 goto out;
1186         }
1187
1188         if (dmu_objset_from_ds(ds, &os))
1189                 goto out;
1190
1191         /*
1192          * Only the ZIL in the head (non-snapshot) is valid.  Even though
1193          * snapshots can have ZIL block pointers (which may be the same
1194          * BP as in the head), they must be ignored.  So we traverse the
1195  * ZIL here, rather than in dsl_scan_recurse(), because the regular
1196          * snapshot block-sharing rules don't apply to it.
1197          */
1198         if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
1199                 dsl_scan_zil(dp, &os->os_zil_header);
1200
1201         /*
1202          * Iterate over the bps in this ds.
1203          */
1204         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1205         rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
1206         dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
1207         rrw_exit(&ds->ds_bp_rwlock, FTAG);
1208
1209         char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
1210         dsl_dataset_name(ds, dsname);
1211         zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
1212             "suspending=%u",
1213             (longlong_t)dsobj, dsname,
1214             (longlong_t)scn->scn_phys.scn_cur_min_txg,
1215             (longlong_t)scn->scn_phys.scn_cur_max_txg,
1216             (int)scn->scn_suspending);
1217         kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
1218
1219         if (scn->scn_suspending)
1220                 goto out;
1221
1222         /*
1223          * We've finished this pass over this dataset.
1224          */
1225
1226         /*
1227          * If we did not completely visit this dataset, do another pass.
1228          */
1229         if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
1230                 zfs_dbgmsg("incomplete pass; visiting again");
1231                 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
1232                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1233                     scn->scn_phys.scn_queue_obj, ds->ds_object,
1234                     scn->scn_phys.scn_cur_max_txg, tx) == 0);
1235                 goto out;
1236         }
1237
1238         /*
1239  * Add descendant datasets to work queue.
1240          */
1241         if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
1242                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1243                     scn->scn_phys.scn_queue_obj,
1244                     dsl_dataset_phys(ds)->ds_next_snap_obj,
1245                     dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
1246         }
1247         if (dsl_dataset_phys(ds)->ds_num_children > 1) {
1248                 boolean_t usenext = B_FALSE;
1249                 if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1250                         uint64_t count;
1251                         /*
1252                          * A bug in a previous version of the code could
1253                          * cause upgrade_clones_cb() to not set
1254                          * ds_next_snap_obj when it should, leading to a
1255                          * missing entry.  Therefore we can only use the
1256                          * next_clones_obj when its count is correct.
1257                          */
1258                         int err = zap_count(dp->dp_meta_objset,
1259                             dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
1260                         if (err == 0 &&
1261                             count == dsl_dataset_phys(ds)->ds_num_children - 1)
1262                                 usenext = B_TRUE;
1263                 }
1264
1265                 if (usenext) {
1266                         VERIFY0(zap_join_key(dp->dp_meta_objset,
1267                             dsl_dataset_phys(ds)->ds_next_clones_obj,
1268                             scn->scn_phys.scn_queue_obj,
1269                             dsl_dataset_phys(ds)->ds_creation_txg, tx));
1270                 } else {
1271                         struct enqueue_clones_arg eca;
1272                         eca.tx = tx;
1273                         eca.originobj = ds->ds_object;
1274
1275                         VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1276                             enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
1277                 }
1278         }
1279
1280 out:
1281         dsl_dataset_rele(ds, FTAG);
1282 }
1283
1284 /* ARGSUSED */
1285 static int
1286 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
1287 {
1288         dmu_tx_t *tx = arg;
1289         dsl_dataset_t *ds;
1290         int err;
1291         dsl_scan_t *scn = dp->dp_scan;
1292
1293         err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
1294         if (err)
1295                 return (err);
1296
1297         while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1298                 dsl_dataset_t *prev;
1299                 err = dsl_dataset_hold_obj(dp,
1300                     dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1301                 if (err) {
1302                         dsl_dataset_rele(ds, FTAG);
1303                         return (err);
1304                 }
1305
1306                 /*
1307                  * If this is a clone, we don't need to worry about it for now.
1308                  */
1309                 if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
1310                         dsl_dataset_rele(ds, FTAG);
1311                         dsl_dataset_rele(prev, FTAG);
1312                         return (0);
1313                 }
1314                 dsl_dataset_rele(ds, FTAG);
1315                 ds = prev;
1316         }
1317
1318         VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1319             ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
1320         dsl_dataset_rele(ds, FTAG);
1321         return (0);
1322 }
1323
1324 /*
1325  * Scrub/dedup interaction.
1326  *
1327  * If there are N references to a deduped block, we don't want to scrub it
1328  * N times -- ideally, we should scrub it exactly once.
1329  *
1330  * We leverage the fact that the dde's replication class (enum ddt_class)
1331  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
1332  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
1333  *
1334  * To prevent excess scrubbing, the scrub begins by walking the DDT
1335  * to find all blocks with refcnt > 1, and scrubs each of these once.
1336  * Since there are two replication classes which contain blocks with
1337  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
1338  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
1339  *
1340  * There would be nothing more to say if a block's refcnt couldn't change
1341  * during a scrub, but of course it can so we must account for changes
1342  * in a block's replication class.
1343  *
1344  * Here's an example of what can occur:
1345  *
1346  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
1347  * when visited during the top-down scrub phase, it will be scrubbed twice.
1348  * This negates our scrub optimization, but is otherwise harmless.
1349  *
1350  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
1351  * on each visit during the top-down scrub phase, it will never be scrubbed.
1352  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
1353  * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
1354  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
1355  * while a scrub is in progress, it scrubs the block right then.
1356  */
1357 static void
1358 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
1359 {
1360         ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
1361         ddt_entry_t dde = { 0 };
1362         int error;
1363         uint64_t n = 0;
1364
1365         while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
1366                 ddt_t *ddt;
1367
1368                 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
1369                         break;
1370                 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
1371                     (longlong_t)ddb->ddb_class,
1372                     (longlong_t)ddb->ddb_type,
1373                     (longlong_t)ddb->ddb_checksum,
1374                     (longlong_t)ddb->ddb_cursor);
1375
1376                 /* There should be no pending changes to the dedup table */
1377                 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
1378                 ASSERT(avl_first(&ddt->ddt_tree) == NULL);
1379
1380                 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
1381                 n++;
1382
1383                 if (dsl_scan_check_suspend(scn, NULL))
1384                         break;
1385         }
1386
1387         zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
1388             "suspending=%u", (longlong_t)n,
1389             (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
1390
1391         ASSERT(error == 0 || error == ENOENT);
1392         ASSERT(error != ENOENT ||
1393             ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
1394 }
1395
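/*
 * Scan a single DDT entry: for each physical variant whose birth txg falls
 * inside the scan window, reconstruct the block pointer and pass it to the
 * active scan function.  Besides the DDT walk above, this is also the hook
 * ddt_sync_entry() uses to notify the scrub when a block's refcnt rises
 * above 1 (see the scrub/dedup comment above).
 */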
1396 /* ARGSUSED */
1397 void
1398 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
1399     ddt_entry_t *dde, dmu_tx_t *tx)
1400 {
1401         const ddt_key_t *ddk = &dde->dde_key;
1402         ddt_phys_t *ddp = dde->dde_phys;
1403         blkptr_t bp;
1404         zbookmark_phys_t zb = { 0 };
1405
1406         if (scn->scn_phys.scn_state != DSS_SCANNING)
1407                 return;
1408
1409         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1410                 if (ddp->ddp_phys_birth == 0 ||
1411                     ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
1412                         continue;
1413                 ddt_bp_create(checksum, ddk, ddp, &bp);
1414
1415                 scn->scn_visited_this_txg++;
1416                 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
1417         }
1418 }
1419
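/*
 * Do one txg's worth of scan work: finish the DDT phase if it is still in
 * progress, then either start from the MOS and the origin snapshot or
 * resume the dataset we were suspended in, and finally drain datasets from
 * the on-disk scan queue until we suspend or the queue is empty.
 */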
1420 static void
1421 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
1422 {
1423         dsl_pool_t *dp = scn->scn_dp;
1424         zap_cursor_t zc;
1425         zap_attribute_t za;
1426
1427         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1428             scn->scn_phys.scn_ddt_class_max) {
1429                 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1430                 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1431                 dsl_scan_ddt(scn, tx);
1432                 if (scn->scn_suspending)
1433                         return;
1434         }
1435
1436         if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
1437                 /* First do the MOS & ORIGIN */
1438
1439                 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1440                 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1441                 dsl_scan_visit_rootbp(scn, NULL,
1442                     &dp->dp_meta_rootbp, tx);
1443                 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
1444                 if (scn->scn_suspending)
1445                         return;
1446
1447                 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
1448                         VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1449                             enqueue_cb, tx, DS_FIND_CHILDREN));
1450                 } else {
1451                         dsl_scan_visitds(scn,
1452                             dp->dp_origin_snap->ds_object, tx);
1453                 }
1454                 ASSERT(!scn->scn_suspending);
1455         } else if (scn->scn_phys.scn_bookmark.zb_objset !=
1456             ZB_DESTROYED_OBJSET) {
1457                 /*
1458                  * If we were suspended, continue from here.  Note if the
1459                  * ds we were suspended on was deleted, the zb_objset may
1460                  * be -1, so we will skip this and find a new objset
1461                  * below.
1462                  */
1463                 dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
1464                 if (scn->scn_suspending)
1465                         return;
1466         }
1467
1468         /*
1469          * In case we were suspended right at the end of the ds, zero the
1470          * bookmark so we don't think that we're still trying to resume.
1471          */
1472         bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
1473
1474         /* keep pulling things out of the zap-object-as-queue */
1475         while (zap_cursor_init(&zc, dp->dp_meta_objset,
1476             scn->scn_phys.scn_queue_obj),
1477             zap_cursor_retrieve(&zc, &za) == 0) {
1478                 dsl_dataset_t *ds;
1479                 uint64_t dsobj;
1480
1481                 dsobj = zfs_strtonum(za.za_name, NULL);
1482                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1483                     scn->scn_phys.scn_queue_obj, dsobj, tx));
1484
1485                 /* Set up min/max txg */
1486                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1487                 if (za.za_first_integer != 0) {
1488                         scn->scn_phys.scn_cur_min_txg =
1489                             MAX(scn->scn_phys.scn_min_txg,
1490                             za.za_first_integer);
1491                 } else {
1492                         scn->scn_phys.scn_cur_min_txg =
1493                             MAX(scn->scn_phys.scn_min_txg,
1494                             dsl_dataset_phys(ds)->ds_prev_snap_txg);
1495                 }
1496                 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1497                 dsl_dataset_rele(ds, FTAG);
1498
1499                 dsl_scan_visitds(scn, dsobj, tx);
1500                 zap_cursor_fini(&zc);
1501                 if (scn->scn_suspending)
1502                         return;
1503         }
1504         zap_cursor_fini(&zc);
1505 }
1506
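/*
 * Decide whether the background free work should yield for this txg:
 * after zfs_free_max_blocks blocks have been freed, once the sync has run
 * longer than zfs_txg_timeout seconds, once zfs_free_min_time_ms has
 * elapsed with a txg sync waiting, or when the pool is shutting down.
 * Never suspend while zfs_recover is set.
 */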
1507 static boolean_t
1508 dsl_scan_free_should_suspend(dsl_scan_t *scn)
1509 {
1510         uint64_t elapsed_nanosecs;
1511
1512         if (zfs_recover)
1513                 return (B_FALSE);
1514
1515         if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
1516                 return (B_TRUE);
1517
1518         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1519         return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1520             (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
1521             txg_sync_waiting(scn->scn_dp)) ||
1522             spa_shutting_down(scn->scn_dp->dp_spa));
1523 }
1524
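/*
 * bpobj/bptree iteration callback used by the async destroy code: free one
 * block.  Returns ERESTART when it is time to suspend; otherwise it issues
 * an asynchronous free, removes the block's space from the dp_free_dir
 * accounting, and counts it as visited.
 */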
1525 static int
1526 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1527 {
1528         dsl_scan_t *scn = arg;
1529
1530         if (!scn->scn_is_bptree ||
1531             (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
1532                 if (dsl_scan_free_should_suspend(scn))
1533                         return (SET_ERROR(ERESTART));
1534         }
1535
1536         zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1537             dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
1538         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1539             -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1540             -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1541         scn->scn_visited_this_txg++;
1542         return (0);
1543 }
1544
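/*
 * Report whether any scan-related work remains: an in-progress (and not
 * paused) scrub or resilver, an async destroy that has not stalled, or
 * space still waiting to be freed in the free bpobj.
 */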
1545 boolean_t
1546 dsl_scan_active(dsl_scan_t *scn)
1547 {
1548         spa_t *spa = scn->scn_dp->dp_spa;
1549         uint64_t used = 0, comp, uncomp;
1550
1551         if (spa->spa_load_state != SPA_LOAD_NONE)
1552                 return (B_FALSE);
1553         if (spa_shutting_down(spa))
1554                 return (B_FALSE);
1555         if ((scn->scn_phys.scn_state == DSS_SCANNING &&
1556             !dsl_scan_is_paused_scrub(scn)) ||
1557             (scn->scn_async_destroying && !scn->scn_async_stalled))
1558                 return (B_TRUE);
1559
1560         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1561                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1562                     &used, &comp, &uncomp);
1563         }
1564         return (used != 0);
1565 }
1566
1567 /* Called whenever a txg syncs. */
1568 void
1569 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1570 {
1571         dsl_scan_t *scn = dp->dp_scan;
1572         spa_t *spa = dp->dp_spa;
1573         int err = 0;
1574
1575         /*
1576          * Check for scn_restart_txg before checking spa_load_state, so
1577          * that we can restart an old-style scan while the pool is being
1578          * imported (see dsl_scan_init).
1579          */
1580         if (dsl_scan_restarting(scn, tx)) {
1581                 pool_scan_func_t func = POOL_SCAN_SCRUB;
1582                 dsl_scan_done(scn, B_FALSE, tx);
1583                 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
1584                         func = POOL_SCAN_RESILVER;
1585                 zfs_dbgmsg("restarting scan func=%u txg=%llu",
1586                     func, tx->tx_txg);
1587                 dsl_scan_setup_sync(&func, tx);
1588         }
1589
1590         /*
1591          * Only process scans in sync pass 1.
1592          */
1593         if (spa_sync_pass(dp->dp_spa) > 1)
1594                 return;
1595
1596         /*
1597          * If the spa is shutting down, then stop scanning. This will
1598          * ensure that the scan does not dirty any new data during the
1599          * shutdown phase.
1600          */
1601         if (spa_shutting_down(spa))
1602                 return;
1603
1604         /*
1605          * If the scan is inactive due only to a stalled async destroy, retry it.
1606          */
1607         if (!scn->scn_async_stalled && !dsl_scan_active(scn))
1608                 return;
1609
1610         scn->scn_visited_this_txg = 0;
1611         scn->scn_suspending = B_FALSE;
1612         scn->scn_sync_start_time = gethrtime();
1613         spa->spa_scrub_active = B_TRUE;
1614
1615         /*
1616          * First process the async destroys.  If we suspend, don't do
1617          * any scrubbing or resilvering.  This ensures that there are no
1618          * async destroys while we are scanning, so the scan code doesn't
1619          * have to worry about traversing it.  It is also faster to free the
1620          * blocks than to scrub them.
1621          */
1622         if (zfs_free_bpobj_enabled &&
1623             spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1624                 scn->scn_is_bptree = B_FALSE;
1625                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1626                     NULL, ZIO_FLAG_MUSTSUCCEED);
1627                 err = bpobj_iterate(&dp->dp_free_bpobj,
1628                     dsl_scan_free_block_cb, scn, tx);
1629                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1630
1631                 if (err != 0 && err != ERESTART)
1632                         zfs_panic_recover("error %u from bpobj_iterate()", err);
1633         }
1634
1635         if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
1636                 ASSERT(scn->scn_async_destroying);
1637                 scn->scn_is_bptree = B_TRUE;
1638                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1639                     NULL, ZIO_FLAG_MUSTSUCCEED);
1640                 err = bptree_iterate(dp->dp_meta_objset,
1641                     dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
1642                 VERIFY0(zio_wait(scn->scn_zio_root));
1643
1644                 if (err == EIO || err == ECKSUM) {
1645                         err = 0;
1646                 } else if (err != 0 && err != ERESTART) {
1647                         zfs_panic_recover("error %u from "
1648                             "traverse_dataset_destroyed()", err);
1649                 }
1650
1651                 if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
1652                         /* finished; deactivate async destroy feature */
1653                         spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
1654                         ASSERT(!spa_feature_is_active(spa,
1655                             SPA_FEATURE_ASYNC_DESTROY));
1656                         VERIFY0(zap_remove(dp->dp_meta_objset,
1657                             DMU_POOL_DIRECTORY_OBJECT,
1658                             DMU_POOL_BPTREE_OBJ, tx));
1659                         VERIFY0(bptree_free(dp->dp_meta_objset,
1660                             dp->dp_bptree_obj, tx));
1661                         dp->dp_bptree_obj = 0;
1662                         scn->scn_async_destroying = B_FALSE;
1663                         scn->scn_async_stalled = B_FALSE;
1664                 } else {
1665                         /*
1666                          * If we didn't make progress, mark the async
1667                          * destroy as stalled, so that we will not initiate
1668                          * a spa_sync() on its behalf.  Note that we only
1669                          * check this if we are not finished, because if the
1670                          * bptree had no blocks for us to visit, we can
1671                          * finish without "making progress".
1672                          */
1673                         scn->scn_async_stalled =
1674                             (scn->scn_visited_this_txg == 0);
1675                 }
1676         }
1677         if (scn->scn_visited_this_txg) {
1678                 zfs_dbgmsg("freed %llu blocks in %llums from "
1679                     "free_bpobj/bptree txg %llu; err=%d",
1680                     (longlong_t)scn->scn_visited_this_txg,
1681                     (longlong_t)
1682                     NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
1683                     (longlong_t)tx->tx_txg, err);
1684                 scn->scn_visited_this_txg = 0;
1685
1686                 /*
1687                  * Write out changes to the DDT that may be required as a
1688                  * result of the blocks freed.  This ensures that the DDT
1689                  * is clean when a scrub/resilver runs.
1690                  */
1691                 ddt_sync(spa, tx->tx_txg);
1692         }
1693         if (err != 0)
1694                 return;
1695         if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
1696             zfs_free_leak_on_eio &&
1697             (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
1698             dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
1699             dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
1700                 /*
1701                  * We have finished background destroying, but there is still
1702                  * some space left in the dp_free_dir. Transfer this leaked
1703                  * space to the dp_leak_dir.
1704                  */
1705                 if (dp->dp_leak_dir == NULL) {
1706                         rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
1707                         (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
1708                             LEAK_DIR_NAME, tx);
1709                         VERIFY0(dsl_pool_open_special_dir(dp,
1710                             LEAK_DIR_NAME, &dp->dp_leak_dir));
1711                         rrw_exit(&dp->dp_config_rwlock, FTAG);
1712                 }
1713                 dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
1714                     dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
1715                     dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
1716                     dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
1717                 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1718                     -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
1719                     -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
1720                     -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
1721         }
1722         if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
1723                 /* finished; verify that space accounting went to zero */
1724                 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
1725                 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
1726                 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
1727         }
1728
1729         if (scn->scn_phys.scn_state != DSS_SCANNING)
1730                 return;
1731
1732         if (scn->scn_done_txg == tx->tx_txg) {
1733                 ASSERT(!scn->scn_suspending);
1734                 /* finished with scan. */
1735                 zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
1736                 dsl_scan_done(scn, B_TRUE, tx);
1737                 ASSERT3U(spa->spa_scrub_inflight, ==, 0);
1738                 dsl_scan_sync_state(scn, tx);
1739                 return;
1740         }
1741
1742         if (dsl_scan_is_paused_scrub(scn))
1743                 return;
1744
1745         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1746             scn->scn_phys.scn_ddt_class_max) {
1747                 zfs_dbgmsg("doing scan sync txg %llu; "
1748                     "ddt bm=%llu/%llu/%llu/%llx",
1749                     (longlong_t)tx->tx_txg,
1750                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
1751                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
1752                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
1753                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
1754                 ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
1755                 ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
1756                 ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
1757                 ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
1758         } else {
1759                 zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
1760                     (longlong_t)tx->tx_txg,
1761                     (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
1762                     (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
1763                     (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
1764                     (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
1765         }
1766
1767         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1768             NULL, ZIO_FLAG_CANFAIL);
1769         dsl_pool_config_enter(dp, FTAG);
1770         dsl_scan_visit(scn, tx);
1771         dsl_pool_config_exit(dp, FTAG);
1772         (void) zio_wait(scn->scn_zio_root);
1773         scn->scn_zio_root = NULL;
1774
1775         zfs_dbgmsg("visited %llu blocks in %llums",
1776             (longlong_t)scn->scn_visited_this_txg,
1777             (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
1778
1779         if (!scn->scn_suspending) {
1780                 scn->scn_done_txg = tx->tx_txg + 1;
1781                 zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
1782                     tx->tx_txg, scn->scn_done_txg);
1783         }
1784
1785         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
1786                 mutex_enter(&spa->spa_scrub_lock);
1787                 while (spa->spa_scrub_inflight > 0) {
1788                         cv_wait(&spa->spa_scrub_io_cv,
1789                             &spa->spa_scrub_lock);
1790                 }
1791                 mutex_exit(&spa->spa_scrub_lock);
1792         }
1793
1794         dsl_scan_sync_state(scn, tx);
1795 }
1796
1797 /*
1798  * This will start a new scan, or restart an existing one.
1799  */
1800 void
1801 dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
1802 {
1803         if (txg == 0) {
1804                 dmu_tx_t *tx;
1805                 tx = dmu_tx_create_dd(dp->dp_mos_dir);
1806                 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
1807
1808                 txg = dmu_tx_get_txg(tx);
1809                 dp->dp_scan->scn_restart_txg = txg;
1810                 dmu_tx_commit(tx);
1811         } else {
1812                 dp->dp_scan->scn_restart_txg = txg;
1813         }
1814         zfs_dbgmsg("restarting resilver txg=%llu", txg);
1815 }
1816
1817 boolean_t
1818 dsl_scan_resilvering(dsl_pool_t *dp)
1819 {
1820         return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
1821             dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
1822 }
1823
1824 /*
1825  * scrub consumers
1826  */
1827
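/*
 * Accumulate per-<level, type> statistics for a visited block: counts,
 * asize/lsize/psize, gang headers, and ditto copies that landed on the
 * same top-level vdev.  Each block is tallied in four cells: its own
 * <level, type> and the corresponding all-level and all-type totals.
 */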
1828 static void
1829 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1830 {
1831         int i;
1832
1833         /*
1834          * If we resume after a reboot, zab will be NULL; don't record
1835          * incomplete stats in that case.
1836          */
1837         if (zab == NULL)
1838                 return;
1839
1840         for (i = 0; i < 4; i++) {
1841                 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1842                 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1843                 if (t & DMU_OT_NEWTYPE)
1844                         t = DMU_OT_OTHER;
1845                 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1846                 int equal;
1847
1848                 zb->zb_count++;
1849                 zb->zb_asize += BP_GET_ASIZE(bp);
1850                 zb->zb_lsize += BP_GET_LSIZE(bp);
1851                 zb->zb_psize += BP_GET_PSIZE(bp);
1852                 zb->zb_gangs += BP_COUNT_GANG(bp);
1853
1854                 switch (BP_GET_NDVAS(bp)) {
1855                 case 2:
1856                         if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1857                             DVA_GET_VDEV(&bp->blk_dva[1]))
1858                                 zb->zb_ditto_2_of_2_samevdev++;
1859                         break;
1860                 case 3:
1861                         equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1862                             DVA_GET_VDEV(&bp->blk_dva[1])) +
1863                             (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1864                             DVA_GET_VDEV(&bp->blk_dva[2])) +
1865                             (DVA_GET_VDEV(&bp->blk_dva[1]) ==
1866                             DVA_GET_VDEV(&bp->blk_dva[2]));
1867                         if (equal == 1)
1868                                 zb->zb_ditto_2_of_3_samevdev++;
1869                         else if (equal == 3)
1870                                 zb->zb_ditto_3_of_3_samevdev++;
1871                         break;
1872                 }
1873         }
1874 }
1875
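/*
 * Completion callback for scrub/resilver reads: free the data buffer,
 * drop the in-flight count (waking any throttled issuers), and count an
 * error unless it was a checksum failure on a speculative (intent log)
 * read.
 */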
1876 static void
1877 dsl_scan_scrub_done(zio_t *zio)
1878 {
1879         spa_t *spa = zio->io_spa;
1880
1881         abd_free(zio->io_abd);
1882
1883         mutex_enter(&spa->spa_scrub_lock);
1884         spa->spa_scrub_inflight--;
1885         cv_broadcast(&spa->spa_scrub_io_cv);
1886
1887         if (zio->io_error && (zio->io_error != ECKSUM ||
1888             !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
1889                 spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
1890         }
1891         mutex_exit(&spa->spa_scrub_lock);
1892 }
1893
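/*
 * Per-block callback for scrub and resilver.  Skips blocks outside the
 * scan txg window and embedded blocks, accounts for the bytes examined,
 * decides whether an I/O is needed (always for scrub; for resilver only
 * when the DTL says a copy may be missing), throttles against
 * zfs_top_maxinflight and recent pool activity, and issues the read.
 */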
1894 static int
1895 dsl_scan_scrub_cb(dsl_pool_t *dp,
1896     const blkptr_t *bp, const zbookmark_phys_t *zb)
1897 {
1898         dsl_scan_t *scn = dp->dp_scan;
1899         size_t size = BP_GET_PSIZE(bp);
1900         spa_t *spa = dp->dp_spa;
1901         uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
1902         boolean_t needs_io;
1903         int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
1904         unsigned int scan_delay = 0;
1905
1906         if (phys_birth <= scn->scn_phys.scn_min_txg ||
1907             phys_birth >= scn->scn_phys.scn_max_txg)
1908                 return (0);
1909
1910         count_block(dp->dp_blkstats, bp);
1911
1912         if (BP_IS_EMBEDDED(bp))
1913                 return (0);
1914
1915         ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
1916         if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
1917                 zio_flags |= ZIO_FLAG_SCRUB;
1918                 needs_io = B_TRUE;
1919                 scan_delay = zfs_scrub_delay;
1920         } else {
1921                 ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
1922                 zio_flags |= ZIO_FLAG_RESILVER;
1923                 needs_io = B_FALSE;
1924                 scan_delay = zfs_resilver_delay;
1925         }
1926
1927         /* If it's an intent log block, failure is expected. */
1928         if (zb->zb_level == ZB_ZIL_LEVEL)
1929                 zio_flags |= ZIO_FLAG_SPECULATIVE;
1930
1931         for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
1932                 vdev_t *vd = vdev_lookup_top(spa,
1933                     DVA_GET_VDEV(&bp->blk_dva[d]));
1934
1935                 /*
1936                  * Keep track of how much data we've examined so that
1937                  * zpool(1M) status can make useful progress reports.
1938                  */
1939                 scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
1940                 spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
1941
1942                 /* if it's a resilver, this may not be in the target range */
1943                 if (!needs_io) {
1944                         if (DVA_GET_GANG(&bp->blk_dva[d])) {
1945                                 /*
1946                                  * Gang members may be spread across multiple
1947                                  * vdevs, so the best estimate we have is the
1948                                  * scrub range, which has already been checked.
1949                                  * XXX -- it would be better to change our
1950                                  * allocation policy to ensure that all
1951                                  * gang members reside on the same vdev.
1952                                  */
1953                                 needs_io = B_TRUE;
1954                         } else {
1955                                 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
1956                                     phys_birth, 1);
1957                         }
1958                 }
1959         }
1960
1961         if (needs_io && !zfs_no_scrub_io) {
1962                 vdev_t *rvd = spa->spa_root_vdev;
1963                 uint64_t maxinflight = rvd->vdev_children *
1964                     MAX(zfs_top_maxinflight, 1);
1965
1966                 mutex_enter(&spa->spa_scrub_lock);
1967                 while (spa->spa_scrub_inflight >= maxinflight)
1968                         cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1969                 spa->spa_scrub_inflight++;
1970                 mutex_exit(&spa->spa_scrub_lock);
1971
1972                 /*
1973                  * If we're seeing recent (zfs_scan_idle) "important" I/Os
1974                  * then throttle our workload to limit the impact of a scan.
1975                  */
1976                 if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
1977                         delay(MAX((int)scan_delay, 0));
1978
1979                 zio_nowait(zio_read(NULL, spa, bp,
1980                     abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
1981                     NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
1982         }
1983
1984         /* do not relocate this block */
1985         return (0);
1986 }
1987
1988 /*
1989  * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
1990  * Can also be called to resume a paused scrub.
1991  */
1992 int
1993 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
1994 {
1995         spa_t *spa = dp->dp_spa;
1996         dsl_scan_t *scn = dp->dp_scan;
1997
1998         /*
1999          * Purge all vdev caches and probe all devices.  We do this here
2000          * rather than in sync context because this requires a writer lock
2001          * on the spa_config lock, which we can't do from sync context.  The
2002          * spa_scrub_reopen flag indicates that vdev_open() should not
2003          * attempt to start another scrub.
2004          */
2005         spa_vdev_state_enter(spa, SCL_NONE);
2006         spa->spa_scrub_reopen = B_TRUE;
2007         vdev_reopen(spa->spa_root_vdev);
2008         spa->spa_scrub_reopen = B_FALSE;
2009         (void) spa_vdev_state_exit(spa, NULL, 0);
2010
2011         if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
2012                 /* got scrub start cmd, resume paused scrub */
2013                 int err = dsl_scrub_set_pause_resume(scn->scn_dp,
2014                     POOL_SCRUB_NORMAL);
2015                 if (err == 0)
2016                         return (ECANCELED);
2017
2018                 return (SET_ERROR(err));
2019         }
2020
2021         return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
2022             dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
2023 }
2024
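/*
 * Report whether a scan restart has been requested (via scn_restart_txg)
 * for the txg currently being synced or an earlier one.
 */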
2025 static boolean_t
2026 dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
2027 {
2028         return (scn->scn_restart_txg != 0 &&
2029             scn->scn_restart_txg <= tx->tx_txg);
2030 }