/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_zfetch.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
    int compress, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;

int zfs_mdcomp_disable = 0;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
    &zfs_mdcomp_disable, 0, "Disable metadata compression");
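
/*
 * Because the knob above is declared CTLFLAG_RDTUN, it is read-only at
 * runtime and can only be set as a boot-time loader tunable.  For
 * example (illustrative, not part of this file), in /boot/loader.conf:
 *
 *	vfs.zfs.mdcomp_disable=1
 *
 * The current value can then be inspected with:
 *
 *	sysctl vfs.zfs.mdcomp_disable
 */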

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
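
	/*
	 * Editorial note: the byte-wide steps above consume only the
	 * low-order bits of osv, obj and blkid; the fold below mixes
	 * the remaining high-order bits in as well, so that large
	 * object numbers and block ids still vary the bucket choice.
	 */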
	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}
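
/*
 * Usage sketch (editorial; dbuf_create() below is the real caller):
 * insert optimistically, and if another thread won the race, discard
 * the new dbuf in favor of the existing one that is returned:
 *
 *	if ((odb = dbuf_hash_insert(db)) != NULL) {
 *		kmem_cache_free(dbuf_cache, db);
 *		db = odb;
 *	}
 */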

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);
	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
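
	/*
	 * For example (editorial, assuming 8-byte pointers and 4K
	 * pages): with 8GB of physical memory the loop above stops at
	 * hsize = 2^21, since 2^21 * 4096 == 8GB.  The bucket array
	 * then occupies 2^21 * 8 bytes = 16MB, which is the 2MB/GB
	 * figure quoted above.
	 */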
retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			dbuf_dirty_record_t *dr = db->db_data_pending;
			/*
			 * it should only be modified in syncing
			 * context, so make sure we only have
			 * one copy of the data.
			 */
			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (db->db.db_object == DMU_META_DNODE_OBJECT)
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
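
/*
 * For example (editorial): with 128K data blocks dn_datablkshift is 17,
 * so offset 0x60000 (384K) maps to block 3 (0x60000 >> 17 == 3).  An
 * object whose single block is not a power-of-two size has
 * dn_datablkshift == 0, and every in-range offset maps to block 0.
 */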

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
	dbuf_rele(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	blkptr_t *bp;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		if (db->db.db_size < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
		    db->db.db_size);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
		bp = NULL;
	else
		bp = db->db_blkptr;

	if (bp == NULL)
		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
	else
		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");

	if (bp == NULL || BP_IS_HOLE(bp)) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(bp == NULL || BP_IS_HOLE(bp));
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
	ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
	    db->db_level > 0 ? byteswap_uint64_array :
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of buffers
 * that have been modified in a previous transaction group before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a buffer
 * for the first time in a txg, and when we are freeing a range in a
 * dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do not put a
 * hold on the buffer, we just traverse the active dbuf list for the
 * dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 * reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 * just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

static void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	/* free this block */
	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
		/* XXX can get silent EIO here */
		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

void
dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < blkid ||
		    db->db_blkid >= blkid+nblks)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_new_block(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/* Don't count meta-objects */
	if (ds == NULL)
		return (FALSE);

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	/* If we have been dirtied since the last snapshot, it's not new */
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	if (birth_txg)
		return (!dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (TRUE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_objset->os_dsl_dataset == NULL ||
	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));

	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while (*drp && (*drp)->dr_txg > tx->tx_txg)
		drp = &(*drp)->dr_next;
	if (*drp && (*drp)->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(*drp);
			if (db->db.db_object != DMU_META_DNODE_OBJECT)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (*drp);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os.  However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context.  Hence we must make this
	 * assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_blkid == DB_BONUS_BLKID) {
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db.db_data;
		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
			/*
			 * Release the data buffer from the cache so that we
			 * can modify it without impacting possible other users
			 * of this cached data block.  Note that indirect
			 * blocks and private objects are not released until the
			 * syncing state (since they are only modified then).
			 */
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db_buf;
		}
		ASSERT(data_old != NULL);
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 */
		if (!dbuf_new_block(db) && db->db_blkptr) {
			/*
			 * This is only a guess -- if the dbuf is dirty
			 * in a previous txg, we don't know how much
			 * space it will use on disk yet.  We should
			 * really have the struct_rwlock to access
			 * db_blkptr, but since this is just a guess,
			 * it's OK if we get an odd answer.
			 */
			dnode_willuse_space(dn,
			    -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
		}
		dnode_willuse_space(dn, db->db.db_size, tx);
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		return (dr);
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL ||
		    db->db_parent == db->db_dnode->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	return (dr);
}

static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	mutex_enter(&db->db_mtx);

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	db->db_last_dirty = dr->dr_next;

	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_level+1 == dn->dn_nlevels) {
		ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}

	if (db->db_level == 0) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		/* XXX - mutex and list destroy? */
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}
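
/*
 * Usage sketch (editorial): a fill is bracketed by dmu_buf_will_fill()
 * and dmu_buf_fill_done(), with the caller supplying the entire block
 * contents in between:
 *
 *	dmu_buf_will_fill(&db->db, tx);
 *	bcopy(src, db->db.db_data, db->db.db_size);
 *	dmu_buf_fill_done(&db->db, tx);
 */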

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DB_BONUS_BLKID)
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		dnode_rele(dn, db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode = dn;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DB_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = dn->dn_bonuslen;
		db->db.db_offset = DB_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		return (db);
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DB_BONUS_BLKID) {
		dnode_t *dn = db->db_dnode;

		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (list_link_active(&db->db_link)) {
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			mutex_exit(&dn->dn_dbufs_mtx);

			dnode_rele(dn, db);
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_dnode = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		if (refcount_count(&db->db_holds) > 0) {
			/*
			 * This dbuf is active.  We assume that it is
			 * already CACHED, or else about to be either
			 * read or filled.
			 */
			mutex_exit(&db->db_mtx);
			return;
		}
		mutex_exit(&db->db_mtx);
		db = NULL;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
			zb.zb_object = dn->dn_object;
			zb.zb_level = 0;
			zb.zb_blkid = blkid;

			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
			    dmu_ot[dn->dn_type].ot_byteswap,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
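
/*
 * Usage sketch (editorial):
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	err = dbuf_hold_impl(dn, 0, blkid, FALSE, FTAG, &db);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (err == 0) {
 *		...use db->db.db_data...
 *		dbuf_rele(db, FTAG);
 *	}
 */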
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_create_bonus(dnode_t *dn)
{
	dmu_buf_impl_t *db = dn->dn_bonus;

	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
	return (db);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DB_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);
			dnode_rele(db->db_dnode, db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT3U(db->db_state, ==, DB_UNCACHED);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
			mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}
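
/*
 * Usage sketch (editorial): the update is compare-and-swap style; it
 * only happens if the current user pointer still equals old_user_ptr,
 * and the return value is old_user_ptr on success or the current user
 * pointer on failure, so a caller can check for success with:
 *
 *	if (dmu_buf_update_user(db, old_ptr, new_ptr,
 *	    &new_ptr->data, evict_cb) == old_ptr) {
 *		(update succeeded)
 *	}
 *
 * (old_ptr, new_ptr and evict_cb are hypothetical caller names.)
 */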

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mismatch).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	ASSERT(db->db_buf != NULL);

	dbuf_check_blkptr(dn, db);

	db->db_data_pending = dr;

	arc_release(db->db_buf, db);
	mutex_exit(&db->db_mtx);

	/*
	 * XXX -- we should design a compression algorithm
	 * that specializes in arrays of bps.
	 */
	dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
	    zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;
	int checksum, compress;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DB_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;
		/*
		 * Use dn_phys->dn_bonuslen since db.db_size is the length
		 * of the bonus buffer in the open transaction rather than
		 * the syncing transaction.
		 */
		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data)
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT((*drp)->dr_next == NULL);
		*drp = NULL;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	dbuf_check_blkptr(dn, db);

	/*
	 * If this dbuf has already been written out via an immediate write,
	 * just complete the write by copying over the new block pointer and
	 * updating the accounting via the write-completion functions.
	 */
	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		zio_t zio_fake;

		zio_fake.io_private = &db;
		zio_fake.io_error = 0;
		zio_fake.io_bp = db->db_blkptr;
		zio_fake.io_bp_orig = *db->db_blkptr;
		zio_fake.io_txg = txg;

		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		db->db_data_pending = dr;
		dr->dr_zio = &zio_fake;
		mutex_exit(&db->db_mtx);

		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
			dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

		dbuf_write_ready(&zio_fake, db->db_buf, db);
		dbuf_write_done(&zio_fake, db->db_buf, db);

		return;
	}

	blksz = arc_buf_size(*datap);

	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DNODE blocks).
		 */
		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
			bcopy(db->db.db_data, (*datap)->b_data, blksz);
		}
	} else {
		/*
		 * Private object buffers are released here rather
		 * than in dbuf_dirty() since they are only modified
		 * in the syncing context and we don't want the
		 * overhead of making multiple copies of the data.
		 */
		arc_release(db->db_buf, db);
	}

	ASSERT(*datap != NULL);
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	/*
	 * Allow dnode settings to override objset settings,
	 * except for metadata checksums.
	 */
	if (dmu_ot[dn->dn_type].ot_metadata) {
		checksum = os->os_md_checksum;
		compress = zio_compress_select(dn->dn_compress,
		    os->os_md_compress);
	} else {
		checksum = zio_checksum_select(dn->dn_checksum,
		    os->os_checksum);
		compress = zio_compress_select(dn->dn_compress,
		    os->os_compress);
	}
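
	/*
	 * Note (editorial): zio_checksum_select() and
	 * zio_compress_select() implement property inheritance: a dnode
	 * whose dn_checksum or dn_compress is still the "inherit" value
	 * resolves to the objset-wide default passed as the second
	 * argument above.
	 */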

	dbuf_write(dr, *datap, checksum, compress, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT)
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
	else
		zio_nowait(dr->dr_zio);
}

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
    int compress, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_t *zio;
	int zio_flags;

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	zio_flags = ZIO_FLAG_MUSTSUCCEED;
	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
		zio_flags |= ZIO_FLAG_METADATA;
	if (BP_IS_OLDER(db->db_blkptr, txg))
		dsl_dataset_block_kill(
		    os->os_dsl_dataset, db->db_blkptr, zio, tx);

	dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
	    dmu_get_replication_level(os, &zb, dn->dn_type), txg,
	    db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
}

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t fill = 0;
	int old_size, new_size, i;

	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, bp_orig);
	new_size = bp_get_dasize(os->os_spa, zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	if (BP_IS_HOLE(zio->io_bp)) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
		return;
	}

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	db->db_blkptr->blk_fill = fill;
	BP_SET_TYPE(db->db_blkptr, dn->dn_type);
	BP_SET_LEVEL(db->db_blkptr, db->db_level);

	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
		dsl_dataset_block_born(ds, zio->io_bp, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);

	mutex_enter(&db->db_mtx);

	drp = &db->db_last_dirty;
	while (*drp != db->db_data_pending)
		drp = &(*drp)->dr_next;
	ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
	ASSERT((*drp)->dr_txg == txg);
	ASSERT((*drp)->dr_next == NULL);
	dr = *drp;
	*drp = NULL;

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);

		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
	} else {
		dnode_t *dn = db->db_dnode;

		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	mutex_exit(&db->db_mtx);

	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");

	dbuf_rele(db, (void *)(uintptr_t)txg);
}