/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
    int compress, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;

int zfs_mdcomp_disable = 0;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
    &zfs_mdcomp_disable, 0, "Disable metadata compression");
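
/*
 * Since the sysctl above is CTLFLAG_RDTUN (read-only at runtime, picked up
 * as a boot-time tunable), metadata compression can only be disabled from
 * the loader.  For example (an illustrative FreeBSD usage sketch, not part
 * of this file), add to /boot/loader.conf:
 *
 *	vfs.zfs.mdcomp_disable="1"
 */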

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}
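
/*
 * A note on the final XOR above: the byte-at-a-time CRC steps only consume
 * the low two bytes of obj and blkid (and one byte of osv), while callers
 * use just the low-order bits of the result as a table index
 * (hv & hash_table_mask), so the remaining high-order bits are folded back
 * into the hash.
 */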

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
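	/*
	 * For example (illustrative arithmetic): with 4GB of physical
	 * memory and 8-byte pointers, the loop stops at hsize = 2^20, so
	 * the table holds 1M buckets and occupies 8MB -- the 2MB/GB
	 * figure noted above.
	 */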
retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			dbuf_dirty_record_t *dr = db->db_data_pending;
			/*
			 * it should only be modified in syncing
			 * context, so make sure we only have
			 * one copy of the data.
			 */
			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (db->db.db_object == DMU_META_DNODE_OBJECT)
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
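
/*
 * For example (illustrative): with 128K data blocks, dn_datablkshift is
 * 17, so offset 0x60001 falls in block 3 (0x60001 >> 17).  An object
 * small enough to fit in a single, possibly odd-sized block has
 * dn_datablkshift == 0, and every offset maps to block 0.
 */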

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	blkptr_t *bp;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		if (db->db.db_size < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
		    db->db.db_size);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
		bp = NULL;
	else
		bp = db->db_blkptr;

	if (bp == NULL)
		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
	else
		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");

	if (bp == NULL || BP_IS_HOLE(bp)) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(bp == NULL || BP_IS_HOLE(bp));
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
	ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
	    db->db_level > 0 ? byteswap_uint64_array :
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
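 *
 * For example (an illustrative scenario): a buffer dirtied in txg 8,
 * which is now syncing, that is modified again in the open txg 10 must
 * leave the txg 8 dirty record with its own copy of the old contents,
 * so the in-flight write for txg 8 does not pick up txg 10's changes.
 */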
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 * reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 * just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

static void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	/* free this block */
	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
		/* XXX can get silent EIO here */
		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

void
dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < blkid ||
		    db->db_blkid >= blkid+nblks)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_new_block(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/* Don't count meta-objects */
	if (ds == NULL)
		return (FALSE);

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	/* If we have been dirtied since the last snapshot, it's not new */
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	if (birth_txg)
		return (!dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (TRUE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_objset->os_dsl_dataset == NULL ||
	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));

	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while (*drp && (*drp)->dr_txg > tx->tx_txg)
		drp = &(*drp)->dr_next;
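	/*
	 * For example (illustrative): if the list holds records for txgs
	 * 12 and 11, dirtying in txg 12 stops the walk at the head record,
	 * which is simply returned below; dirtying in txg 13 falls through
	 * and links a new record at the head.
	 */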
	if (*drp && (*drp)->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(*drp);
			if (db->db.db_object != DMU_META_DNODE_OBJECT)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (*drp);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os.  However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context.  Hence we must make this
	 * assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_blkid == DB_BONUS_BLKID) {
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db.db_data;
		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
			/*
			 * Release the data buffer from the cache so that we
			 * can modify it without impacting possible other users
			 * of this cached data block.  Note that indirect
			 * blocks and private objects are not released until the
			 * syncing state (since they are only modified then).
			 */
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db_buf;
		}
		ASSERT(data_old != NULL);
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 */
		if (!dbuf_new_block(db) && db->db_blkptr) {
			/*
			 * This is only a guess -- if the dbuf is dirty
			 * in a previous txg, we don't know how much
			 * space it will use on disk yet.  We should
			 * really have the struct_rwlock to access
			 * db_blkptr, but since this is just a guess,
			 * it's OK if we get an odd answer.
			 */
			dnode_willuse_space(dn,
			    -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
		}
		dnode_willuse_space(dn, db->db.db_size, tx);
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
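	/*
	 * A dbuf can be dirty in at most three transaction groups at once
	 * (the open, quiescing, and syncing txgs), which bounds db_dirtycnt.
	 */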
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		return (dr);
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL ||
		    db->db_parent == db->db_dnode->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	return (dr);
}

static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	mutex_enter(&db->db_mtx);

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	db->db_last_dirty = dr->dr_next;

	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_level+1 == dn->dn_nlevels) {
		ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}

	if (db->db_level == 0) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		list_destroy(&dr->dt.di.dr_children);
		mutex_destroy(&dr->dt.di.dr_mtx);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}
1258 * "Clear" the contents of this dbuf. This will mark the dbuf
1259 * EVICTING and clear *most* of its references. Unfortunetely,
1260 * when we are not holding the dn_dbufs_mtx, we can't clear the
1261 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1262 * in this case. For callers from the DMU we will usually see:
1263 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1264 * For the arc callback, we will usually see:
1265 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1266 * Sometimes, though, we will get a mix of these two:
1267 * DMU: dbuf_clear()->arc_buf_evict()
1268 * ARC: dbuf_do_evict()->dbuf_destroy()
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DB_BONUS_BLKID)
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		dnode_rele(dn, db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
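		/*
		 * For example (illustrative): with 16K indirect blocks and
		 * 128-byte block pointers, epbs = 14 - 7 = 7, so each
		 * indirect holds 128 bps; blkid 1000 lives under parent
		 * blkid 1000 >> 7 = 7, at index 1000 & 127 = 104.
		 */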
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode = dn;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DB_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = dn->dn_bonuslen;
		db->db.db_offset = DB_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		return (db);
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}
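	/*
	 * For example (illustrative): a level-0 dbuf of an object with
	 * 128K data blocks has blocksize 0x20000, so blkid 3 covers byte
	 * offsets 0x60000 through 0x7ffff of the object.
	 */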

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DB_BONUS_BLKID) {
		dnode_t *dn = db->db_dnode;

		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (list_link_active(&db->db_link)) {
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			mutex_exit(&dn->dn_dbufs_mtx);

			dnode_rele(dn, db);
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_dnode = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		if (refcount_count(&db->db_holds) > 0) {
			/*
			 * This dbuf is active.  We assume that it is
			 * already CACHED, or else about to be either
			 * read or filled.
			 */
			mutex_exit(&db->db_mtx);
			return;
		}
		mutex_exit(&db->db_mtx);
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;
			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
			zb.zb_object = dn->dn_object;
			zb.zb_level = 0;
			zb.zb_blkid = blkid;

			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
			    dmu_ot[dn->dn_type].ot_byteswap,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_create_bonus(dnode_t *dn)
{
	dmu_buf_impl_t *db = dn->dn_bonus;

	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
	return (db);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DB_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);
			dnode_rele(db->db_dnode, db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT3U(db->db_state, ==, DB_UNCACHED);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
			mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	ASSERT(db->db_buf != NULL);

	dbuf_check_blkptr(dn, db);

	db->db_data_pending = dr;

	arc_release(db->db_buf, db);
	mutex_exit(&db->db_mtx);

	/*
	 * XXX -- we should design a compression algorithm
	 * that specializes in arrays of bps.
	 */
	dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
	    zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;
	int checksum, compress;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DB_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;
		/*
		 * Use dn_phys->dn_bonuslen since db.db_size is the length
		 * of the bonus buffer in the open transaction rather than
		 * the syncing transaction.
		 */
		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data)
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT((*drp)->dr_next == NULL);
		*drp = NULL;
		if (dr->dr_dbuf->db_level != 0) {
			list_destroy(&dr->dt.di.dr_children);
			mutex_destroy(&dr->dt.di.dr_mtx);
		}
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	dbuf_check_blkptr(dn, db);

	/*
	 * If this dbuf has already been written out via an immediate write,
	 * just complete the write by copying over the new block pointer and
	 * updating the accounting via the write-completion functions.
	 */
	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		zio_t zio_fake;

		zio_fake.io_private = &db;
		zio_fake.io_error = 0;
		zio_fake.io_bp = db->db_blkptr;
		zio_fake.io_bp_orig = *db->db_blkptr;
		zio_fake.io_txg = txg;

		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		db->db_data_pending = dr;
		dr->dr_zio = &zio_fake;
		mutex_exit(&db->db_mtx);

		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
			dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

		dbuf_write_ready(&zio_fake, db->db_buf, db);
		dbuf_write_done(&zio_fake, db->db_buf, db);

		return;
	}

	blksz = arc_buf_size(*datap);

	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DMU_META_DNODE blocks).
		 */
		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
			bcopy(db->db.db_data, (*datap)->b_data, blksz);
		}
	} else {
		/*
		 * Private object buffers are released here rather
		 * than in dbuf_dirty() since they are only modified
		 * in the syncing context and we don't want the
		 * overhead of making multiple copies of the data.
		 */
		arc_release(db->db_buf, db);
	}

	ASSERT(*datap != NULL);
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	/*
	 * Allow dnode settings to override objset settings,
	 * except for metadata checksums.
	 */
	if (dmu_ot[dn->dn_type].ot_metadata) {
		checksum = os->os_md_checksum;
		compress = zio_compress_select(dn->dn_compress,
		    os->os_md_compress);
	} else {
		checksum = zio_checksum_select(dn->dn_checksum,
		    os->os_checksum);
		compress = zio_compress_select(dn->dn_compress,
		    os->os_compress);
	}

	dbuf_write(dr, *datap, checksum, compress, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT)
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
	else
		zio_nowait(dr->dr_zio);
}

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
    int compress, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_t *zio;
	int zio_flags;

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	zio_flags = ZIO_FLAG_MUSTSUCCEED;
	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
		zio_flags |= ZIO_FLAG_METADATA;
	if (BP_IS_OLDER(db->db_blkptr, txg))
		dsl_dataset_block_kill(
		    os->os_dsl_dataset, db->db_blkptr, zio, tx);

	dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
	    dmu_get_replication_level(os, &zb, dn->dn_type), txg,
	    db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
}

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t fill = 0;
	int old_size, new_size, i;

	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, bp_orig);
	new_size = bp_get_dasize(os->os_spa, zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	if (BP_IS_HOLE(zio->io_bp)) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
		return;
	}

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	db->db_blkptr->blk_fill = fill;
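	/*
	 * The fill count just stored summarizes what lives beneath this
	 * block: an indirect sums its children's blk_fill (the number of
	 * data blocks below it), a dnode-object block counts allocated
	 * dnodes, and any other data block simply counts as 1, as
	 * computed above.
	 */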
	BP_SET_TYPE(db->db_blkptr, dn->dn_type);
	BP_SET_LEVEL(db->db_blkptr, db->db_level);

	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
		dsl_dataset_block_born(ds, zio->io_bp, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);

	mutex_enter(&db->db_mtx);

	drp = &db->db_last_dirty;
	while (*drp != db->db_data_pending)
		drp = &(*drp)->dr_next;
	ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
	ASSERT((*drp)->dr_txg == txg);
	ASSERT((*drp)->dr_next == NULL);
	dr = *drp;
	*drp = NULL;

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);

		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
	} else {
		dnode_t *dn = db->db_dnode;

		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		list_destroy(&dr->dt.di.dr_children);
		mutex_destroy(&dr->dt.di.dr_mtx);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	mutex_exit(&db->db_mtx);

	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");

	dbuf_rele(db, (void *)(uintptr_t)txg);
}