 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]

 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.

#include <sys/zfs_context.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_zfetch.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;

 * Global data structures and functions for the dbuf cache.
static kmem_cache_t *dbuf_cache;

dbuf_cons(void *vdb, void *unused, int kmflag)
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);

dbuf_dest(void *vdb, void *unused)
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);

 * dbuf hash table routines
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
	uintptr_t osv = (uintptr_t)os;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
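
/*
 * Illustrative note (not from the original source): dbuf_hash() folds
 * the low-order bytes of the objset pointer, object number, level, and
 * blkid through the CRC64 table, then xors in the higher-order bits
 * that the byte folds did not consume. A caller would use it as, e.g.:
 *
 *	uint64_t hv = dbuf_hash(os, 5, 0, 7);
 *	uint64_t idx = hv & dbuf_hash_table.hash_table_mask;
 */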
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid) \
	((dbuf)->db.db_object == (obj) && \
	(dbuf)->db_objset == (os) && \
	(dbuf)->db_level == (level) && \
	(dbuf)->db_blkid == (blkid))

dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
			mutex_exit(&db->db_mtx);
	mutex_exit(DBUF_HASH_MUTEX(h, idx));

 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
			mutex_exit(&dbf->db_mtx);
	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

 * Remove an entry from the hash table. This operation will
 * fail if there are any existing holds on the db.
dbuf_hash_remove(dmu_buf_impl_t *db)
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
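
/*
 * Illustrative note (not from the original source): the walk above uses
 * a pointer-to-pointer so the bucket head and interior nodes need no
 * special casing. A minimal sketch of the same unlink idiom on a singly
 * linked list, assuming a node type with a `next` field:
 *
 *	node_t **pp = &head;
 *	while (*pp != victim)
 *		pp = &(*pp)->next;
 *	*pp = victim->next;
 */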
static arc_evict_func_t dbuf_do_evict;

dbuf_evict_user(dmu_buf_impl_t *db)
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;

dbuf_evict(dmu_buf_impl_t *db)
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;

	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size. The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
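	/*
	 * Worked example (illustrative): with 16 GB of physical memory,
	 * the loop above stops once hsize * 4096 >= 16 GB, i.e. at
	 * hsize = 4M buckets, so the bucket array consumes
	 * 4M * sizeof (void *) = 32 MB with 8-byte pointers -- the
	 * advertised 2 MB per GB.
	 */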
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	dbuf_hash_table_t *h = &dbuf_hash_table;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);

dbuf_verify(dmu_buf_impl_t *db)
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))

	ASSERT(db->db_objset != NULL);
	ASSERT(db->db_parent == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT3U(db->db.db_object, ==, dn->dn_object);
	ASSERT3P(db->db_objset, ==, dn->dn_objset);
	ASSERT3U(db->db_level, <, dn->dn_nlevels);
	ASSERT(db->db_blkid == DB_BONUS_BLKID ||
	    list_head(&dn->dn_dbufs));

	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);

	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dbuf_dirty_record_t *dr = db->db_data_pending;
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);

	/* verify db->db_blkptr */
	if (db->db_parent == dn->dn_dbuf) {
		/* db is pointed to by the dnode */
		/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
		if (db->db.db_object == DMU_META_DNODE_OBJECT)
			ASSERT(db->db_parent == NULL);
			ASSERT(db->db_parent != NULL);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		/* db is pointed to by an indirect block */
		int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
		ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
		ASSERT3U(db->db_parent->db.db_object, ==,
		 * dnode_grow_indblksz() can make this fail if we don't
		 * have the struct_rwlock. XXX indblksz no longer
		 * grows. safe to do this now?
		if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
			ASSERT3P(db->db_blkptr, ==,
			    ((blkptr_t *)db->db_parent->db.db_data +
			    db->db_blkid % epb));

	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;

			for (i = 0; i < db->db.db_size >> 3; i++) {

dbuf_update_data(dmu_buf_impl_t *db)
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;

dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
	if (!arc_released(buf))
		arc_set_callback(buf, dbuf_do_evict, db);
	dbuf_update_data(db);
	db->db.db_data = NULL;
	db->db_state = DB_UNCACHED;

dbuf_whichblock(dnode_t *dn, uint64_t offset)
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	ASSERT3U(offset, <, dn->dn_datablksz);
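
/*
 * Worked example (illustrative): with a 128K block size,
 * dn_datablkshift is 17, so offset 0x140000 (1.25 MB) maps to block
 * 0x140000 >> 17 = 10. An object small enough to live in a single
 * (possibly odd-sized) block has dn_datablkshift == 0, and every
 * valid offset maps to block 0.
 */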
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	 * All reads are synchronous, so we must have a hold on the dbuf
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
	dnode_t *dn = db->db_dnode;
	uint32_t aflags = ARC_NOWAIT;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);

	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
		pbuf = db->db_parent->db_buf;
		pbuf = db->db_objset->os_phys_buf;

	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;

dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
	int havepzio = (zio != NULL);

	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	ASSERT(!refcount_is_zero(&db->db_holds));

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
		    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		zio = zio_root(db->db_dnode->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);
		/* dbuf_read_impl has dropped db_mtx for us */
		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
		    db->db.db_size, flags & DB_RF_CACHED);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
		mutex_exit(&db->db_mtx);
		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
		    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
		mutex_exit(&db->db_mtx);

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
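
/*
 * Illustrative caller sketch (not from the original source): a typical
 * synchronous consumer holds the dbuf, reads it, and checks the result.
 * DB_RF_CANFAIL returns I/O errors to the caller, while
 * DB_RF_MUST_SUCCEED treats them as fatal:
 *
 *	int err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 *	if (err == 0)
 *		... db->db.db_data is now valid (DB_CACHED) ...
 */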
dbuf_noread(dmu_buf_impl_t *db)
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		db->db_state = DB_FILL;
		ASSERT3U(db->db_state, ==, DB_CACHED);
	mutex_exit(&db->db_mtx);

 * This is our just-in-time copy function. It makes a copy of
 * buffers that have been modified in a previous transaction
 * group before we modify them in the current active group.
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))

	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 * reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 * just null out the current db_data pointer.
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
		dbuf_set_data(db, NULL);
dbuf_unoverride(dbuf_dirty_record_t *dr)
	dmu_buf_impl_t *db = dr->dr_dbuf;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)

	/* free this block */
	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
		/* XXX can get silent EIO here */
		(void) dsl_free(NULL,
		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state. Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release(). Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	arc_release(dr->dt.dl.dr_data, db);
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks. Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				mutex_exit(&db->db_mtx);

		if (db->db_level != 0)
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
		if (refcount_count(&db->db_holds) == 0) {

		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				dbuf_fix_old_data(db, txg);

		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);

		mutex_exit(&db->db_mtx);
	mutex_exit(&dn->dn_dbufs_mtx);
dbuf_block_freeable(dmu_buf_impl_t *db)
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/* If we don't exist or are in a snapshot, we can't be freed */
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, birth_txg));

dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;
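	/*
	 * Illustrative note: TXG_MASK is TXG_SIZE - 1, so with the
	 * usual TXG_SIZE of 4 the per-dnode dirty-record lists rotate
	 * through four slots; e.g. txg 17 lands in slot 17 & 3 == 1,
	 * and that slot is not reused until txg 21.
	 */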
	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	 * Shouldn't dirty a regular buffer in syncing context. Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_objset->os_dsl_dataset == NULL ||
	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));

	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty. They are allowed to re-dirty
	 * in syncing context.
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	 * XXX make this true for indirects too? The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL);

	mutex_enter(&dn->dn_mtx);
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	mutex_exit(&dn->dn_mtx);

	 * If this buffer is already dirty, we're done.
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
	if (dr && dr->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			if (db->db.db_object != DMU_META_DNODE_OBJECT)
				arc_buf_thaw(db->db_buf);
		mutex_exit(&db->db_mtx);

	 * Only valid if not already dirty.
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os. However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context. Hence we must make this
	 * assertion only if we're not already dirty.
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DB_BONUS_BLKID) {
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx. This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dasize() while
		 * also holding the db_mtx.
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);

	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_blkid == DB_BONUS_BLKID) {
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db.db_data;
		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
			 * Release the data buffer from the cache so that we
			 * can modify it without impacting possible other users
			 * of this cached data block. Note that indirect
			 * blocks and private objects are not released until the
			 * syncing state (since they are only modified then).
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db_buf;
		ASSERT(data_old != NULL);
		dr->dt.dl.dr_data = data_old;
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	dr->dr_txg = tx->tx_txg;

	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty. We win, as though the dbuf_noread() had
	 * happened after the free.
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;

	 * This buffer is now part of this txg
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dasize(os->os_spa, bp) : db->db.db_size;
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet. We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		dnode_willuse_space(dn, -willfree, tx);

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
		mutex_exit(&db->db_mtx);
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL ||
		    db->db_parent == db->db_dnode->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);

	dnode_setdirty(dn, tx);

dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
	dnode_t *dn = db->db_dnode;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	mutex_enter(&db->db_mtx);
	 * If this buffer is not dirty, we're done.
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
	ASSERT(dr->dr_txg == txg);

	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update. Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);

	if (db->db_level == 0) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);

	mutex_exit(&db->db_mtx);

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
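
/*
 * Illustrative caller sketch (not from the original source): the usual
 * open-context write path holds the dbuf, marks it dirty inside an
 * assigned transaction, modifies it, and releases it:
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	dbuf_will_dirty(db, tx);
 *	... modify db->db.db_data ...
 *	dbuf_rele(db, FTAG);
 */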
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	(void) dbuf_dirty(db, tx);

#pragma weak dmu_buf_fill_done = dbuf_fill_done
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
	mutex_enter(&db->db_mtx);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

 * "Clear" the contents of this dbuf. This will mark the dbuf
 * EVICTING and clear *most* of its references. Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
 * in this case. For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
dbuf_clear(dmu_buf_impl_t *db)
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DB_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		db->db_dnode = NULL;
		dbuf_gone = arc_buf_evict(db->db_buf);

	mutex_exit(&db->db_mtx);

	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	if (parent && parent != dndb)
		dbuf_rele(parent, db);

dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
	ASSERT(blkid != DB_BONUS_BLKID);

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
			dbuf_rele(*parentp, NULL);
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
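
/*
 * Worked example (illustrative): with 16K indirect blocks,
 * dn_indblkshift is 14 and SPA_BLKPTRSHIFT is 7 (128-byte block
 * pointers), so epbs = 7 and each indirect block maps 128 children.
 * The level-1 parent of level-0 block 1000 is blkid 1000 >> 7 = 7,
 * and the child occupies slot 1000 & 127 = 104 within it.
 */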
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_parent = parent;
	db->db_blkptr = blkptr;
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DB_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DB_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;

	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);

	dprintf_dbuf(db, "db=%p\n", db);

dbuf_do_evict(void *private)
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		mutex_exit(&db->db_mtx);

dbuf_destroy(dmu_buf_impl_t *db)
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DB_BONUS_BLKID) {
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
			dnode_t *dn = db->db_dnode;

			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			mutex_exit(&dn->dn_dbufs_mtx);

			db->db_dnode = NULL;
		dbuf_hash_remove(db);
	db->db_parent = NULL;

	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

dbuf_prefetch(dnode_t *dn, uint64_t blkid)
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		if (refcount_count(&db->db_holds) > 0) {
			 * This dbuf is active. We assume that it is
			 * already CACHED, or else about to be either
			mutex_exit(&db->db_mtx);
		mutex_exit(&db->db_mtx);

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
			zb.zb_object = dn->dn_object;
			zb.zb_blkid = blkid;
				pbuf = dn->dn_objset->os_phys_buf;

			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		dbuf_rele(db, NULL);

 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);
		blkptr_t *bp = NULL;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (err == 0 && bp && BP_IS_HOLE(bp))
			dbuf_rele(parent, NULL);
		if (err && err != ENOENT)
		db = dbuf_create(dn, level, blkid, parent, bp);

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_rele(parent, NULL);
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
		dbuf_rele(parent, NULL);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);

dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);

dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);

dbuf_create_bonus(dnode_t *dn)
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);

#pragma weak dmu_buf_add_ref = dbuf_add_ref
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
	int64_t holds = refcount_add(&db->db_holds, tag);

#pragma weak dmu_buf_rele = dbuf_rele
dbuf_rele(dmu_buf_impl_t *db, void *tag)
	mutex_enter(&db->db_mtx);

	holds = refcount_remove(&db->db_holds, tag);

	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

		if (db->db_blkid == DB_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);
			dnode_rele(db->db_dnode, db);
		} else if (db->db_buf == NULL) {
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			ASSERT3U(db->db_state, ==, DB_UNCACHED);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			 * This dbuf has anonymous data associated with it.
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
			if (!DBUF_IS_CACHEABLE(db))
				mutex_exit(&db->db_mtx);
		mutex_exit(&db->db_mtx);

#pragma weak dmu_buf_refcount = dbuf_refcount
dbuf_refcount(dmu_buf_impl_t *db)
	return (refcount_count(&db->db_holds));

dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));

dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));

dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;
		dbuf_update_data(db);
		old_user_ptr = db->db_user_ptr;

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
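
/*
 * Illustrative caller sketch (not from the original source; my_state_t
 * and my_evict_func are hypothetical): a consumer attaches per-dbuf
 * state and an eviction callback. dmu_buf_update_user() only installs
 * the new pointer if db_user_ptr still equals old_user_ptr, so a racing
 * setter is detected by a non-NULL return from dmu_buf_set_user():
 *
 *	my_state_t *ms = kmem_alloc(sizeof (*ms), KM_SLEEP);
 *	if (dmu_buf_set_user(&db->db, ms, &ms->ms_data,
 *	    my_evict_func) != NULL)
 *		kmem_free(ms, sizeof (*ms));	(somebody beat us to it)
 */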
dmu_buf_get_user(dmu_buf_t *db_fake)
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);

dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)

	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		 * This buffer was allocated at a time when there were
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mismatch).
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));

dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	ASSERT(db->db_buf != NULL);

	dbuf_check_blkptr(dn, db);

	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);

dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
		ASSERT3U(db->db_state, ==, DB_CACHED);

	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	if (db->db_blkid == DB_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		if (dr->dr_dbuf->db_level != 0) {
			list_destroy(&dr->dt.di.dr_children);
			mutex_destroy(&dr->dt.di.dr_mtx);
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);

	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	dbuf_check_blkptr(dn, db);

	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);

	 * If this dbuf has already been written out via an immediate write,
	 * just complete the write by copying over the new block pointer and
	 * updating the accounting via the write-completion functions.
	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		zio_fake.io_private = &db;
		zio_fake.io_error = 0;
		zio_fake.io_bp = db->db_blkptr;
		zio_fake.io_bp_orig = *db->db_blkptr;
		zio_fake.io_txg = txg;
		zio_fake.io_flags = 0;

		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		db->db_data_pending = dr;
		dr->dr_zio = &zio_fake;
		mutex_exit(&db->db_mtx);

		ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
		    BP_IDENTITY(&zio_fake.io_bp_orig)) ||
		    BP_IS_HOLE(zio_fake.io_bp));

		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

		dbuf_write_ready(&zio_fake, db->db_buf, db);
		dbuf_write_done(&zio_fake, db->db_buf, db);

	if (dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    *datap == db->db_buf) {
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * meta-dnode blocks).
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);

	ASSERT(*datap != NULL);
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT)
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		zio_nowait(dr->dr_zio);

dbuf_sync_list(list_t *list, dmu_tx_t *tx)
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
			dbuf_sync_leaf(dr, tx);

dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	writeprops_t wp = { 0 };

	if (!BP_IS_HOLE(db->db_blkptr) &&
	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
		 * Private object buffers are released here rather
		 * than in dbuf_dirty() since they are only modified
		 * in the syncing context and we don't want the
		 * overhead of making multiple copies of the data.
		arc_release(data, db);
		ASSERT(arc_released(data));
		/* XXX why do we need to thaw here? */

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);

	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	wp.wp_type = dn->dn_type;
	wp.wp_level = db->db_level;
	wp.wp_copies = os->os_copies;
	wp.wp_dncompress = dn->dn_compress;
	wp.wp_oscompress = os->os_compress;
	wp.wp_dnchecksum = dn->dn_checksum;
	wp.wp_oschecksum = os->os_checksum;

	if (BP_IS_OLDER(db->db_blkptr, txg))
		(void) dsl_dataset_block_kill(
		    os->os_dsl_dataset, db->db_blkptr, zio, tx);

	dr->dr_zio = arc_write(zio, os->os_spa, &wp,
	    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
	    data, dbuf_write_ready, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	int old_size, new_size, i;

	ASSERT(db->db_blkptr == bp);

	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, bp_orig);
	new_size = bp_get_dasize(os->os_spa, bp);

	dnode_diduse_space(dn, new_size - old_size);

	if (BP_IS_HOLE(bp)) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
		ASSERT3U(bp->blk_fill, ==, 0);

	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
				if (dnp->dn_type != DMU_OT_NONE)
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
			fill += ibp->blk_fill;

	bp->blk_fill = fill;
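	/*
	 * Illustrative note: blk_fill counts the allocated entries
	 * beneath this block -- for a dnode block, the number of
	 * in-use dnodes; for an indirect block, the sum of the
	 * children's blk_fill (holes contribute nothing). E.g. an
	 * indirect block whose two non-hole children carry fills of
	 * 3 and 5 publishes blk_fill = 8.
	 */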
	mutex_exit(&db->db_mtx);

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
		dsl_dataset_block_born(ds, bp, tx);

dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
	dmu_buf_impl_t *db = vdb;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);

	mutex_enter(&db->db_mtx);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_next == NULL);

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);

		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
			ASSERT(arc_released(db->db_buf));
		dnode_t *dn = db->db_dnode;

		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	mutex_exit(&db->db_mtx);

	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");

	dbuf_rele(db, (void *)(uintptr_t)txg);