/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_recv.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
#include <sys/zfeature.h>

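/*
 * Called from dnode_sync() when the dnode's indirection level grew in this
 * txg (dn_next_nlevels[txgoff] is set).  A new top-level indirect block is
 * created, the block pointers embedded in the dnode (dn_blkptr[]) are copied
 * into it, and any cached child dbufs at the old top level are reparented to
 * the new indirect buffer before the embedded pointers are zeroed.
 */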
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
{
        dmu_buf_impl_t *db;
        int txgoff = tx->tx_txg & TXG_MASK;
        int nblkptr = dn->dn_phys->dn_nblkptr;
        int old_toplvl = dn->dn_phys->dn_nlevels - 1;
        int new_level = dn->dn_next_nlevels[txgoff];
        int i;

        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

        /* this dnode can't be paged out because it's dirty */
        ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
        ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);

        db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
        ASSERT(db != NULL);

        dn->dn_phys->dn_nlevels = new_level;
        dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
            (u_longlong_t)dn->dn_object, dn->dn_phys->dn_nlevels);

        /*
         * Lock ordering requires that we hold the children's db_mutexes (by
         * calling dbuf_find()) before holding the parent's db_rwlock.  The lock
         * order is imposed by dbuf_read's steps of "grab the lock to protect
         * db_parent, get db_parent, hold db_parent's db_rwlock".
         */
        dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
        ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
        for (i = 0; i < nblkptr; i++) {
                children[i] = dbuf_find(dn->dn_objset, dn->dn_object,
                    old_toplvl, i, NULL);
        }

        /* transfer dnode's block pointers to new indirect block */
        (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
        if (dn->dn_dbuf != NULL)
                rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
        rw_enter(&db->db_rwlock, RW_WRITER);
        ASSERT(db->db.db_data);
        ASSERT(arc_released(db->db_buf));
        ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
        memcpy(db->db.db_data, dn->dn_phys->dn_blkptr,
            sizeof (blkptr_t) * nblkptr);
        arc_buf_freeze(db->db_buf);

        /* set dbuf's parent pointers to new indirect buf */
        for (i = 0; i < nblkptr; i++) {
                dmu_buf_impl_t *child = children[i];

                if (child == NULL)
                        continue;
#ifdef  ZFS_DEBUG
                DB_DNODE_ENTER(child);
                ASSERT3P(DB_DNODE(child), ==, dn);
                DB_DNODE_EXIT(child);
#endif  /* DEBUG */
                if (child->db_parent && child->db_parent != dn->dn_dbuf) {
                        ASSERT(child->db_parent->db_level == db->db_level);
                        ASSERT(child->db_blkptr !=
                            &dn->dn_phys->dn_blkptr[child->db_blkid]);
                        mutex_exit(&child->db_mtx);
                        continue;
                }
                ASSERT(child->db_parent == NULL ||
                    child->db_parent == dn->dn_dbuf);

                child->db_parent = db;
                dbuf_add_ref(db, child);
                if (db->db.db_data)
                        child->db_blkptr = (blkptr_t *)db->db.db_data + i;
                else
                        child->db_blkptr = NULL;
                dprintf_dbuf_bp(child, child->db_blkptr,
                    "changed db_blkptr to new indirect %s", "");

                mutex_exit(&child->db_mtx);
        }

        memset(dn->dn_phys->dn_blkptr, 0, sizeof (blkptr_t) * nblkptr);

        rw_exit(&db->db_rwlock);
        if (dn->dn_dbuf != NULL)
                rw_exit(&dn->dn_dbuf->db_rwlock);

        dbuf_rele(db, FTAG);

        rw_exit(&dn->dn_struct_rwlock);
}

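/*
 * Free the given array of block pointers and subtract the freed bytes from
 * the dnode's space accounting via dnode_diduse_space().  When the
 * hole_birth feature is active, the logical size, type, and level of each
 * freed block are recorded in the resulting hole and its birth txg is set
 * to the current txg, so zfs send can tell when the hole was punched.
 */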
133 static void
134 free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
135 {
136         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
137         uint64_t bytesfreed = 0;
138
139         dprintf("ds=%p obj=%llx num=%d\n", ds, (u_longlong_t)dn->dn_object,
140             num);
141
142         for (int i = 0; i < num; i++, bp++) {
143                 if (BP_IS_HOLE(bp))
144                         continue;
145
146                 bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
147                 ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
148
149                 /*
150                  * Save some useful information on the holes being
151                  * punched, including logical size, type, and indirection
152                  * level. Retaining birth time enables detection of when
153                  * holes are punched for reducing the number of free
154                  * records transmitted during a zfs send.
155                  */
156
157                 uint64_t lsize = BP_GET_LSIZE(bp);
158                 dmu_object_type_t type = BP_GET_TYPE(bp);
159                 uint64_t lvl = BP_GET_LEVEL(bp);
160
161                 memset(bp, 0, sizeof (blkptr_t));
162
163                 if (spa_feature_is_active(dn->dn_objset->os_spa,
164                     SPA_FEATURE_HOLE_BIRTH)) {
165                         BP_SET_LSIZE(bp, lsize);
166                         BP_SET_TYPE(bp, type);
167                         BP_SET_LEVEL(bp, lvl);
168                         BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
169                 }
170         }
171         dnode_diduse_space(dn, -bytesfreed);
172 }
173
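/*
 * Debug-only check that the level-0 children covered by the range being
 * freed really contain nothing but zeros, both in any dirty record for this
 * txg and in the cached buffer itself (unless it is dirty in a future txg).
 */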
#ifdef ZFS_DEBUG
static void
free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
        uint64_t off, num, i, j;
        unsigned int epbs;
        int err;
        uint64_t txg = tx->tx_txg;
        dnode_t *dn;

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
        off = start - (db->db_blkid << epbs);
        num = end - start + 1;

        ASSERT3U(dn->dn_phys->dn_indblkshift, >=, SPA_BLKPTRSHIFT);
        ASSERT3U(end + 1, >=, start);
        ASSERT3U(start, >=, (db->db_blkid << epbs));
        ASSERT3U(db->db_level, >, 0);
        ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
        ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
        ASSERT(db->db_blkptr != NULL);

        for (i = off; i < off+num; i++) {
                uint64_t *buf;
                dmu_buf_impl_t *child;
                dbuf_dirty_record_t *dr;

                ASSERT(db->db_level == 1);

                rw_enter(&dn->dn_struct_rwlock, RW_READER);
                err = dbuf_hold_impl(dn, db->db_level - 1,
                    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
                rw_exit(&dn->dn_struct_rwlock);
                if (err == ENOENT)
                        continue;
                ASSERT(err == 0);
                ASSERT(child->db_level == 0);
                dr = dbuf_find_dirty_eq(child, txg);

                /* data_old better be zeroed */
                if (dr) {
                        buf = dr->dt.dl.dr_data->b_data;
                        for (j = 0; j < child->db.db_size >> 3; j++) {
                                if (buf[j] != 0) {
                                        panic("freed data not zero: "
                                            "child=%p i=%llu off=%llu "
                                            "num=%llu\n",
                                            (void *)child, (u_longlong_t)i,
                                            (u_longlong_t)off,
                                            (u_longlong_t)num);
                                }
                        }
                }

                /*
                 * db_data better be zeroed unless it's dirty in a
                 * future txg.
                 */
                mutex_enter(&child->db_mtx);
                buf = child->db.db_data;
                if (buf != NULL && child->db_state != DB_FILL &&
                    list_is_empty(&child->db_dirty_records)) {
                        for (j = 0; j < child->db.db_size >> 3; j++) {
                                if (buf[j] != 0) {
                                        panic("freed data not zero: "
                                            "child=%p i=%llu off=%llu "
                                            "num=%llu\n",
                                            (void *)child, (u_longlong_t)i,
                                            (u_longlong_t)off,
                                            (u_longlong_t)num);
                                }
                        }
                }
                mutex_exit(&child->db_mtx);

                dbuf_rele(child, FTAG);
        }
        DB_DNODE_EXIT(db);
}
#endif

/*
 * We don't usually free the indirect blocks here.  If in one txg we have a
 * free_range and a write to the same indirect block, it's important that we
 * preserve the hole's birth times. Therefore, we don't free any indirect
 * blocks in free_children().  If an indirect block happens to turn into all
 * holes, it will be freed by dbuf_write_children_ready, which happens at a
 * point in the syncing process where we know for certain the contents of the
 * indirect block.
 *
 * However, if we're freeing a dnode, its space accounting must go to zero
 * before we actually try to free the dnode, or we will trip an assertion. In
 * addition, we know the case described above cannot occur, because the dnode is
 * being freed.  Therefore, we free the indirect blocks immediately in that
 * case.
 */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
        dnode_t *dn;
        blkptr_t *bp;
        dmu_buf_impl_t *subdb;
        uint64_t start, end, dbstart, dbend;
        unsigned int epbs, shift, i;

        /*
         * There is a small possibility that this block will not be cached:
         *   1 - if level > 1 and there are no children with level <= 1
         *   2 - if this block was evicted since we read it from
         *       dmu_tx_hold_free().
         */
        if (db->db_state != DB_CACHED)
                (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

        /*
         * If we modify this indirect block, and we are not freeing the
         * dnode (!free_indirects), then this indirect block needs to get
         * written to disk by dbuf_write().  If it is dirty, we know it will
         * be written (otherwise, we would have incorrect on-disk state
         * because the space would be freed but still referenced by the BP
         * in this indirect block).  Therefore we VERIFY that it is
         * dirty.
         *
         * Our VERIFY covers some cases that do not actually have to be
         * dirty, but the open-context code happens to dirty.  E.g. if the
         * blocks we are freeing are all holes, because in that case, we
         * are only freeing part of this indirect block, so it is an
         * ancestor of the first or last block to be freed.  The first and
         * last L1 indirect blocks are always dirtied by dnode_free_range().
         */
        db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
        VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
        dmu_buf_unlock_parent(db, dblt, FTAG);

        dbuf_release_bp(db);
        bp = db->db.db_data;

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
        ASSERT3U(epbs, <, 31);
        shift = (db->db_level - 1) * epbs;
        dbstart = db->db_blkid << epbs;
        start = blkid >> shift;
        if (dbstart < start) {
                bp += start - dbstart;
        } else {
                start = dbstart;
        }
        dbend = ((db->db_blkid + 1) << epbs) - 1;
        end = (blkid + nblks - 1) >> shift;
        if (dbend <= end)
                end = dbend;

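        /*
         * start/end now index this block's children (level db_level - 1),
         * clamped to the subset of the caller's [blkid, blkid + nblks)
         * range that this indirect block actually covers; bp points at the
         * block pointer for child "start".
         */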
        ASSERT3U(start, <=, end);

        if (db->db_level == 1) {
                FREE_VERIFY(db, start, end, tx);
                rw_enter(&db->db_rwlock, RW_WRITER);
                free_blocks(dn, bp, end - start + 1, tx);
                rw_exit(&db->db_rwlock);
        } else {
                for (uint64_t id = start; id <= end; id++, bp++) {
                        if (BP_IS_HOLE(bp))
                                continue;
                        rw_enter(&dn->dn_struct_rwlock, RW_READER);
                        VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
                            id, TRUE, FALSE, FTAG, &subdb));
                        rw_exit(&dn->dn_struct_rwlock);
                        ASSERT3P(bp, ==, subdb->db_blkptr);

                        free_children(subdb, blkid, nblks, free_indirects, tx);
                        dbuf_rele(subdb, FTAG);
                }
        }

        if (free_indirects) {
                rw_enter(&db->db_rwlock, RW_WRITER);
                for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
                        ASSERT(BP_IS_HOLE(bp));
                memset(db->db.db_data, 0, db->db.db_size);
                free_blocks(dn, db->db_blkptr, 1, tx);
                rw_exit(&db->db_rwlock);
        }

        DB_DNODE_EXIT(db);
        arc_buf_freeze(db->db_buf);
}

/*
 * Traverse the indicated range of the provided file
 * and "free" all the blocks contained there.
 */
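/*
 * blkid and nblks are in units of the dnode's data (level-0) blocks.
 * free_indirects is set only when the entire dnode is being freed, in which
 * case the indirect blocks themselves are freed here as well (see the
 * comment above free_children()).
 */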
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
        blkptr_t *bp = dn->dn_phys->dn_blkptr;
        int dnlevel = dn->dn_phys->dn_nlevels;
        boolean_t trunc = B_FALSE;

        if (blkid > dn->dn_phys->dn_maxblkid)
                return;

        ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
        if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
                nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
                trunc = B_TRUE;
        }

        /* There are no indirect blocks in the object */
        if (dnlevel == 1) {
                if (blkid >= dn->dn_phys->dn_nblkptr) {
                        /* this range was never made persistent */
                        return;
                }
                ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
                free_blocks(dn, bp + blkid, nblks, tx);
        } else {
                int shift = (dnlevel - 1) *
                    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
                int start = blkid >> shift;
                int end = (blkid + nblks - 1) >> shift;
                dmu_buf_impl_t *db;

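                /*
                 * start and end index dn_blkptr[] in units of top-level
                 * blocks.  For example, with the common 128K indirect block
                 * size (dn_indblkshift of 17) each indirect block holds
                 * 2^10 block pointers, so with dnlevel == 3 each entry of
                 * dn_blkptr[] spans 2^20 data blocks and shift == 20.
                 */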
                ASSERT(start < dn->dn_phys->dn_nblkptr);
                bp += start;
                for (int i = start; i <= end; i++, bp++) {
                        if (BP_IS_HOLE(bp))
                                continue;
                        rw_enter(&dn->dn_struct_rwlock, RW_READER);
                        VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
                            TRUE, FALSE, FTAG, &db));
                        rw_exit(&dn->dn_struct_rwlock);
                        free_children(db, blkid, nblks, free_indirects, tx);
                        dbuf_rele(db, FTAG);
                }
        }

        /*
         * Do not truncate the maxblkid if we are performing a raw
         * receive. The raw receive sets the maxblkid manually and
         * must not be overridden. Usually, the last DRR_FREE record
         * will be at the maxblkid, because the source system sets
         * the maxblkid when truncating. However, if the last block
         * was freed by overwriting with zeros and being compressed
         * away to a hole, the source system will generate a DRR_FREE
         * record while leaving the maxblkid after the end of that
         * record. In this case we need to leave the maxblkid as
         * indicated in the DRR_OBJECT record, so that it matches the
         * source system, ensuring that the cryptographic hashes will
         * match.
         */
        if (trunc && !dn->dn_objset->os_raw_receive) {
                uint64_t off __maybe_unused;
                dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;

                off = (dn->dn_phys->dn_maxblkid + 1) *
                    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT(off < dn->dn_phys->dn_maxblkid ||
                    dn->dn_phys->dn_maxblkid == 0 ||
                    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
        }
}

typedef struct dnode_sync_free_range_arg {
        dnode_t *dsfra_dnode;
        dmu_tx_t *dsfra_tx;
        boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;

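/*
 * Callback for range_tree_walk() over dn_free_ranges[txgoff].  The walk is
 * started with dn_mtx held, so drop it around the real work and reacquire
 * it before returning to the walker.
 */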
static void
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
{
        dnode_sync_free_range_arg_t *dsfra = arg;
        dnode_t *dn = dsfra->dsfra_dnode;

        mutex_exit(&dn->dn_mtx);
        dnode_sync_free_range_impl(dn, blkid, nblks,
            dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
        mutex_enter(&dn->dn_mtx);
}

/*
 * Try to kick all the dnode's dbufs out of the cache...
 */
void
dnode_evict_dbufs(dnode_t *dn)
{
        dmu_buf_impl_t *db_marker;
        dmu_buf_impl_t *db, *db_next;

        db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);

        mutex_enter(&dn->dn_dbufs_mtx);
        for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {

#ifdef  ZFS_DEBUG
                DB_DNODE_ENTER(db);
                ASSERT3P(DB_DNODE(db), ==, dn);
                DB_DNODE_EXIT(db);
#endif  /* DEBUG */

                mutex_enter(&db->db_mtx);
                if (db->db_state != DB_EVICTING &&
                    zfs_refcount_is_zero(&db->db_holds)) {
                        db_marker->db_level = db->db_level;
                        db_marker->db_blkid = db->db_blkid;
                        /*
                         * Insert a MARKER node with the same level and blkid.
                         * And to resolve any ties in dbuf_compare() use the
                         * pointer of the dbuf that we are evicting. Pass the
                         * address in db_parent.
                         */
                        db_marker->db_state = DB_MARKER;
                        db_marker->db_parent = (void *)((uintptr_t)db - 1);
                        avl_insert_here(&dn->dn_dbufs, db_marker, db,
                            AVL_BEFORE);

                        /*
                         * We need to use the "marker" dbuf rather than
                         * simply getting the next dbuf, because
                         * dbuf_destroy() may actually remove multiple dbufs.
                         * It can call itself recursively on the parent dbuf,
                         * which may also be removed from dn_dbufs.  The code
                         * flow would look like:
                         *
                         * dbuf_destroy():
                         *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
                         *      if (!cacheable || pending_evict)
                         *        dbuf_destroy()
                         */
                        dbuf_destroy(db);

                        db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
                        avl_remove(&dn->dn_dbufs, db_marker);
                } else {
                        db->db_pending_evict = TRUE;
                        mutex_exit(&db->db_mtx);
                        db_next = AVL_NEXT(&dn->dn_dbufs, db);
                }
        }
        mutex_exit(&dn->dn_dbufs_mtx);

        kmem_free(db_marker, sizeof (dmu_buf_impl_t));

        dnode_evict_bonus(dn);
}

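/*
 * Evict the bonus buffer if nothing holds it; otherwise flag it so it is
 * destroyed when its last hold is released.
 */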
void
dnode_evict_bonus(dnode_t *dn)
{
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        if (dn->dn_bonus != NULL) {
                if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
                        mutex_enter(&dn->dn_bonus->db_mtx);
                        dbuf_destroy(dn->dn_bonus);
                        dn->dn_bonus = NULL;
                } else {
                        dn->dn_bonus->db_pending_evict = TRUE;
                }
        }
        rw_exit(&dn->dn_struct_rwlock);
}

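/*
 * Discard every dirty record on the given list without writing it out,
 * recursing into the children of indirect dbufs and dropping the hold that
 * was taken when each dbuf was dirtied.  Used when the dnode is being freed
 * and its pending changes no longer need to reach disk.
 */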
static void
dnode_undirty_dbufs(list_t *list)
{
        dbuf_dirty_record_t *dr;

        while ((dr = list_head(list))) {
                dmu_buf_impl_t *db = dr->dr_dbuf;
                uint64_t txg = dr->dr_txg;

                if (db->db_level != 0)
                        dnode_undirty_dbufs(&dr->dt.di.dr_children);

                mutex_enter(&db->db_mtx);
                /* XXX - use dbuf_undirty()? */
                list_remove(list, dr);
                ASSERT(list_head(&db->db_dirty_records) == dr);
                list_remove_head(&db->db_dirty_records);
                ASSERT(list_is_empty(&db->db_dirty_records));
                db->db_dirtycnt -= 1;
                if (db->db_level == 0) {
                        ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                            dr->dt.dl.dr_data == db->db_buf);
                        dbuf_unoverride(dr);
                } else {
                        mutex_destroy(&dr->dt.di.dr_mtx);
                        list_destroy(&dr->dt.di.dr_children);
                }
                kmem_free(dr, sizeof (dbuf_dirty_record_t));
                dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
        }
}

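/*
 * Final stage of freeing a dnode: discard any remaining dirty state, evict
 * its dbufs, zero the on-disk dnode, reset the in-core fields, and release
 * the hold that was keeping the dnode around for this txg.
 */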
static void
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
        int txgoff = tx->tx_txg & TXG_MASK;

        ASSERT(dmu_tx_is_syncing(tx));

        /*
         * Our contents should have been freed in dnode_sync() by the
         * free range record inserted by the caller of dnode_free().
         */
        ASSERT0(DN_USED_BYTES(dn->dn_phys));
        ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));

        dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
        dnode_evict_dbufs(dn);

        /*
         * XXX - It would be nice to assert this, but we may still
         * have residual holds from async evictions from the arc...
         *
         * zfs_obj_to_path() also depends on this being
         * commented out.
         *
         * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
         */

        /* Undirty next bits */
        dn->dn_next_nlevels[txgoff] = 0;
        dn->dn_next_indblkshift[txgoff] = 0;
        dn->dn_next_blksz[txgoff] = 0;
        dn->dn_next_maxblkid[txgoff] = 0;

        /* ASSERT(blkptrs are zero); */
        ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
        ASSERT(dn->dn_type != DMU_OT_NONE);

        ASSERT(dn->dn_free_txg > 0);
        if (dn->dn_allocated_txg != dn->dn_free_txg)
                dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
        memset(dn->dn_phys, 0, sizeof (dnode_phys_t) * dn->dn_num_slots);
        dnode_free_interior_slots(dn);

        mutex_enter(&dn->dn_mtx);
        dn->dn_type = DMU_OT_NONE;
        dn->dn_maxblkid = 0;
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
        dn->dn_have_spill = B_FALSE;
        dn->dn_num_slots = 1;
        mutex_exit(&dn->dn_mtx);

        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);

        dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
        /*
         * Now that we've released our hold, the dnode may
         * be evicted, so we mustn't access it.
         */
}

/*
 * Write out the dnode's dirty buffers.
 * Does not wait for zio completions.
 */
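/*
 * This applies the per-txg "next" values captured in open context
 * (dn_next_nlevels, dn_next_blksz, dn_next_bonuslen, and friends) to the
 * on-disk dnode, removes the spill block if requested, processes any freed
 * ranges, grows the indirection level if needed, and finally pushes the
 * dirty records for this txg to dbuf_sync_list().  If the dnode itself is
 * being freed in this txg, the work is handed off to dnode_sync_free()
 * instead.
 */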
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
        objset_t *os = dn->dn_objset;
        dnode_phys_t *dnp = dn->dn_phys;
        int txgoff = tx->tx_txg & TXG_MASK;
        list_t *list = &dn->dn_dirty_records[txgoff];
        static const dnode_phys_t zerodn __maybe_unused = { 0 };
        boolean_t kill_spill = B_FALSE;

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
        ASSERT(dnp->dn_type != DMU_OT_NONE ||
            memcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
        DNODE_VERIFY(dn);

        ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

        /*
         * Do user accounting if it is enabled and this is not
         * an encrypted receive.
         */
        if (dmu_objset_userused_enabled(os) &&
            !DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
            (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
                mutex_enter(&dn->dn_mtx);
                dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
                dn->dn_oldflags = dn->dn_phys->dn_flags;
                dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
                if (dmu_objset_userobjused_enabled(dn->dn_objset))
                        dn->dn_phys->dn_flags |=
                            DNODE_FLAG_USEROBJUSED_ACCOUNTED;
                mutex_exit(&dn->dn_mtx);
                dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
        } else if (!(os->os_encrypted && dmu_objset_is_receiving(os))) {
                /*
                 * Once we account for it, we should always account for it,
                 * except for the case of a raw receive. We will not be able
                 * to account for it until the receiving dataset has been
                 * mounted.
                 */
                ASSERT(!(dn->dn_phys->dn_flags &
                    DNODE_FLAG_USERUSED_ACCOUNTED));
                ASSERT(!(dn->dn_phys->dn_flags &
                    DNODE_FLAG_USEROBJUSED_ACCOUNTED));
        }

        mutex_enter(&dn->dn_mtx);
        if (dn->dn_allocated_txg == tx->tx_txg) {
                /* The dnode is newly allocated or reallocated */
                if (dnp->dn_type == DMU_OT_NONE) {
                        /* this is a first alloc, not a realloc */
                        dnp->dn_nlevels = 1;
                        dnp->dn_nblkptr = dn->dn_nblkptr;
                }

                dnp->dn_type = dn->dn_type;
                dnp->dn_bonustype = dn->dn_bonustype;
                dnp->dn_bonuslen = dn->dn_bonuslen;
        }

        dnp->dn_extra_slots = dn->dn_num_slots - 1;

        ASSERT(dnp->dn_nlevels > 1 ||
            BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
            BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
            BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
            dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
        ASSERT(dnp->dn_nlevels < 2 ||
            BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
            BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);

        if (dn->dn_next_type[txgoff] != 0) {
                dnp->dn_type = dn->dn_type;
                dn->dn_next_type[txgoff] = 0;
        }

        if (dn->dn_next_blksz[txgoff] != 0) {
                ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
                    SPA_MINBLOCKSIZE) == 0);
                ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
                    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
                    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
                    dnp->dn_datablkszsec ||
                    !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
                dnp->dn_datablkszsec =
                    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
                dn->dn_next_blksz[txgoff] = 0;
        }

        if (dn->dn_next_bonuslen[txgoff] != 0) {
                if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
                        dnp->dn_bonuslen = 0;
                else
                        dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
                ASSERT(dnp->dn_bonuslen <=
                    DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
                dn->dn_next_bonuslen[txgoff] = 0;
        }

        if (dn->dn_next_bonustype[txgoff] != 0) {
                ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
                dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
                dn->dn_next_bonustype[txgoff] = 0;
        }

        boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
            dn->dn_free_txg <= tx->tx_txg;

        /*
         * Remove the spill block if we have been explicitly asked to
         * remove it, or if the object is being removed.
         */
        if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
                if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
                        kill_spill = B_TRUE;
                dn->dn_rm_spillblk[txgoff] = 0;
        }

        if (dn->dn_next_indblkshift[txgoff] != 0) {
                ASSERT(dnp->dn_nlevels == 1);
                dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
                dn->dn_next_indblkshift[txgoff] = 0;
        }

        /*
         * Just take the live (open-context) values for checksum and compress.
         * Strictly speaking it's a future leak, but nothing bad happens if we
         * start using the new checksum or compress algorithm a little early.
         */
        dnp->dn_checksum = dn->dn_checksum;
        dnp->dn_compress = dn->dn_compress;

        mutex_exit(&dn->dn_mtx);

        if (kill_spill) {
                free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
                mutex_enter(&dn->dn_mtx);
                dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
                mutex_exit(&dn->dn_mtx);
        }

        /* process all the "freed" ranges in the file */
        if (dn->dn_free_ranges[txgoff] != NULL) {
                dnode_sync_free_range_arg_t dsfra;
                dsfra.dsfra_dnode = dn;
                dsfra.dsfra_tx = tx;
                dsfra.dsfra_free_indirects = freeing_dnode;
                mutex_enter(&dn->dn_mtx);
                if (freeing_dnode) {
                        ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
                            0, dn->dn_maxblkid + 1));
                }
                /*
                 * Because dnode_sync_free_range() must drop dn_mtx during its
                 * processing, using it as a callback to range_tree_vacate() is
                 * not safe.  No other operations (besides destroy) are allowed
                 * once range_tree_vacate() has begun, and dropping dn_mtx
                 * would leave a window open for another thread to observe that
                 * invalid (and unsafe) state.
                 */
                range_tree_walk(dn->dn_free_ranges[txgoff],
                    dnode_sync_free_range, &dsfra);
                range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
                range_tree_destroy(dn->dn_free_ranges[txgoff]);
                dn->dn_free_ranges[txgoff] = NULL;
                mutex_exit(&dn->dn_mtx);
        }

        if (freeing_dnode) {
                dn->dn_objset->os_freed_dnodes++;
                dnode_sync_free(dn, tx);
                return;
        }

        if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
                dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                mutex_enter(&ds->ds_lock);
                ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] =
                    (void *)B_TRUE;
                mutex_exit(&ds->ds_lock);
        }

        if (dn->dn_next_nlevels[txgoff]) {
                dnode_increase_indirection(dn, tx);
                dn->dn_next_nlevels[txgoff] = 0;
        }

        /*
         * This must be done after dnode_sync_free_range()
         * and dnode_increase_indirection(). See dnode_new_blkid()
         * for an explanation of the high bit being set.
         */
        if (dn->dn_next_maxblkid[txgoff]) {
                mutex_enter(&dn->dn_mtx);
                dnp->dn_maxblkid =
                    dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
                dn->dn_next_maxblkid[txgoff] = 0;
                mutex_exit(&dn->dn_mtx);
        }

        if (dn->dn_next_nblkptr[txgoff]) {
                /* this should only happen on a realloc */
                ASSERT(dn->dn_allocated_txg == tx->tx_txg);
                if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
                        /* zero the new blkptrs we are gaining */
                        memset(dnp->dn_blkptr + dnp->dn_nblkptr, 0,
                            sizeof (blkptr_t) *
                            (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
                } else {
                        int i;
                        ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
                        /* the blkptrs we are losing better be unallocated */
                        for (i = 0; i < dnp->dn_nblkptr; i++) {
                                if (i >= dn->dn_next_nblkptr[txgoff])
                                        ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
                        }
#endif
                }
                mutex_enter(&dn->dn_mtx);
                dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
                dn->dn_next_nblkptr[txgoff] = 0;
                mutex_exit(&dn->dn_mtx);
        }

        dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);

        if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
                ASSERT3P(list_head(list), ==, NULL);
                dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
        }

        ASSERT3U(dnp->dn_bonuslen, <=, DN_MAX_BONUS_LEN(dnp));

        /*
         * Although we have dropped our reference to the dnode, it
         * can't be evicted until it is written, and we haven't yet
         * initiated the IO for the dnode's dbuf.  Additionally, the caller
         * has already added a reference to the dnode because it's on the
         * os_synced_dnodes list.
         */
}