4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/zfs_context.h>
28 #include <sys/dmu_impl.h>
30 #include <sys/dmu_objset.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_dir.h>
33 #include <sys/dmu_tx.h>
36 #include <sys/dmu_zfetch.h>
38 static void dbuf_destroy(dmu_buf_impl_t *db);
39 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
40 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
41 static arc_done_func_t dbuf_write_ready;
42 static arc_done_func_t dbuf_write_done;
45 * Global data structures and functions for the dbuf cache.
47 static kmem_cache_t *dbuf_cache;
51 dbuf_cons(void *vdb, void *unused, int kmflag)
53 dmu_buf_impl_t *db = vdb;
54 bzero(db, sizeof (dmu_buf_impl_t));
56 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
57 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
58 refcount_create(&db->db_holds);
64 dbuf_dest(void *vdb, void *unused)
66 dmu_buf_impl_t *db = vdb;
67 mutex_destroy(&db->db_mtx);
68 cv_destroy(&db->db_changed);
69 refcount_destroy(&db->db_holds);
73 * dbuf hash table routines
75 static dbuf_hash_table_t dbuf_hash_table;
77 static uint64_t dbuf_hash_count;
80 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
82 uintptr_t osv = (uintptr_t)os;
85 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
86 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
93 crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);
98 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
100 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
101 ((dbuf)->db.db_object == (obj) && \
102 (dbuf)->db_objset == (os) && \
103 (dbuf)->db_level == (level) && \
104 (dbuf)->db_blkid == (blkid))
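/*
 * Illustrative lookup sketch (not part of the original flow): the two
 * macros above combine to pick a bucket and walk its chain:
 *
 *	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 *	uint64_t idx = hv & dbuf_hash_table.hash_table_mask;
 *	dmu_buf_impl_t *db;
 *
 *	for (db = dbuf_hash_table.hash_table[idx]; db != NULL;
 *	    db = db->db_hash_next)
 *		if (DBUF_EQUAL(db, os, obj, level, blkid))
 *			break;
 *
 * dbuf_find() below is the real version, adding the DBUF_HASH_MUTEX
 * and the DB_EVICTING state check.
 */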
107 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
109 dbuf_hash_table_t *h = &dbuf_hash_table;
110 objset_impl_t *os = dn->dn_objset;
111 uint64_t obj = dn->dn_object;
112 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
113 uint64_t idx = hv & h->hash_table_mask;
116 mutex_enter(DBUF_HASH_MUTEX(h, idx));
117 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
118 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
119 mutex_enter(&db->db_mtx);
120 if (db->db_state != DB_EVICTING) {
121 mutex_exit(DBUF_HASH_MUTEX(h, idx));
124 mutex_exit(&db->db_mtx);
127 mutex_exit(DBUF_HASH_MUTEX(h, idx));
132 * Insert an entry into the hash table.  If there is already an element
133 * equal to it in the hash table, then the existing element
134 * will be returned and the new element will not be inserted.
135 * Otherwise returns NULL.
137 static dmu_buf_impl_t *
138 dbuf_hash_insert(dmu_buf_impl_t *db)
140 dbuf_hash_table_t *h = &dbuf_hash_table;
141 objset_impl_t *os = db->db_objset;
142 uint64_t obj = db->db.db_object;
143 int level = db->db_level;
144 uint64_t blkid = db->db_blkid;
145 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
146 uint64_t idx = hv & h->hash_table_mask;
149 mutex_enter(DBUF_HASH_MUTEX(h, idx));
150 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
151 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
152 mutex_enter(&dbf->db_mtx);
153 if (dbf->db_state != DB_EVICTING) {
154 mutex_exit(DBUF_HASH_MUTEX(h, idx));
157 mutex_exit(&dbf->db_mtx);
161 mutex_enter(&db->db_mtx);
162 db->db_hash_next = h->hash_table[idx];
163 h->hash_table[idx] = db;
164 mutex_exit(DBUF_HASH_MUTEX(h, idx));
165 atomic_add_64(&dbuf_hash_count, 1);
171 * Remove an entry from the hash table. This operation will
172 * fail if there are any existing holds on the db.
175 dbuf_hash_remove(dmu_buf_impl_t *db)
177 dbuf_hash_table_t *h = &dbuf_hash_table;
178 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
179 db->db_level, db->db_blkid);
180 uint64_t idx = hv & h->hash_table_mask;
181 dmu_buf_impl_t *dbf, **dbp;
184 * We mustn't hold db_mtx to maintain lock ordering:
185 * DBUF_HASH_MUTEX > db_mtx.
187 ASSERT(refcount_is_zero(&db->db_holds));
188 ASSERT(db->db_state == DB_EVICTING);
189 ASSERT(!MUTEX_HELD(&db->db_mtx));
191 mutex_enter(DBUF_HASH_MUTEX(h, idx));
192 dbp = &h->hash_table[idx];
193 while ((dbf = *dbp) != db) {
194 dbp = &dbf->db_hash_next;
197 *dbp = db->db_hash_next;
198 db->db_hash_next = NULL;
199 mutex_exit(DBUF_HASH_MUTEX(h, idx));
200 atomic_add_64(&dbuf_hash_count, -1);
203 static arc_evict_func_t dbuf_do_evict;
206 dbuf_evict_user(dmu_buf_impl_t *db)
208 ASSERT(MUTEX_HELD(&db->db_mtx));
210 if (db->db_level != 0 || db->db_evict_func == NULL)
213 if (db->db_user_data_ptr_ptr)
214 *db->db_user_data_ptr_ptr = db->db.db_data;
215 db->db_evict_func(&db->db, db->db_user_ptr);
216 db->db_user_ptr = NULL;
217 db->db_user_data_ptr_ptr = NULL;
218 db->db_evict_func = NULL;
222 dbuf_evict(dmu_buf_impl_t *db)
224 ASSERT(MUTEX_HELD(&db->db_mtx));
225 ASSERT(db->db_buf == NULL);
226 ASSERT(db->db_data_pending == NULL);
235 uint64_t hsize = 1ULL << 16;
236 dbuf_hash_table_t *h = &dbuf_hash_table;
240 * The hash table is big enough to fill all of physical memory
241 * with an average 4K block size. The table will take up
242 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
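*
* For example (an illustrative calculation): with 8GB of physical
* memory, physmem * PAGESIZE == 1ULL << 33, so the loop below doubles
* hsize from 1ULL << 16 until hsize * 4096 >= 1ULL << 33, leaving
* hsize == 1ULL << 21 buckets; with 8-byte pointers that is a 16MB
* table, matching the 2MB-per-GB estimate above.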
244 while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
248 h->hash_table_mask = hsize - 1;
249 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
250 if (h->hash_table == NULL) {
251 /* XXX - we should really return an error instead of assert */
252 ASSERT(hsize > (1ULL << 10));
257 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
258 sizeof (dmu_buf_impl_t),
259 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
261 for (i = 0; i < DBUF_MUTEXES; i++)
262 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
268 dbuf_hash_table_t *h = &dbuf_hash_table;
271 for (i = 0; i < DBUF_MUTEXES; i++)
272 mutex_destroy(&h->hash_mutexes[i]);
273 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
274 kmem_cache_destroy(dbuf_cache);
283 dbuf_verify(dmu_buf_impl_t *db)
285 dnode_t *dn = db->db_dnode;
287 ASSERT(MUTEX_HELD(&db->db_mtx));
289 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
292 ASSERT(db->db_objset != NULL);
294 ASSERT(db->db_parent == NULL);
295 ASSERT(db->db_blkptr == NULL);
297 ASSERT3U(db->db.db_object, ==, dn->dn_object);
298 ASSERT3P(db->db_objset, ==, dn->dn_objset);
299 ASSERT3U(db->db_level, <, dn->dn_nlevels);
300 ASSERT(db->db_blkid == DB_BONUS_BLKID ||
301 list_head(&dn->dn_dbufs));
303 if (db->db_blkid == DB_BONUS_BLKID) {
305 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
306 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
308 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
311 if (db->db_level == 0) {
312 /* we can be momentarily larger in dnode_set_blksz() */
313 if (db->db_blkid != DB_BONUS_BLKID && dn) {
314 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
316 if (db->db.db_object == DMU_META_DNODE_OBJECT) {
317 dbuf_dirty_record_t *dr = db->db_data_pending;
319 * it should only be modified in syncing
320 * context, so make sure we only have
321 * one copy of the data.
323 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
327 /* verify db->db_blkptr */
329 if (db->db_parent == dn->dn_dbuf) {
330 /* db is pointed to by the dnode */
331 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
332 if (db->db.db_object == DMU_META_DNODE_OBJECT)
333 ASSERT(db->db_parent == NULL);
335 ASSERT(db->db_parent != NULL);
336 ASSERT3P(db->db_blkptr, ==,
337 &dn->dn_phys->dn_blkptr[db->db_blkid]);
339 /* db is pointed to by an indirect block */
340 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
341 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
342 ASSERT3U(db->db_parent->db.db_object, ==,
345 * dnode_grow_indblksz() can make this fail if we don't
346 * have the struct_rwlock. XXX indblksz no longer
347 * grows. safe to do this now?
349 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
350 ASSERT3P(db->db_blkptr, ==,
351 ((blkptr_t *)db->db_parent->db.db_data +
352 db->db_blkid % epb));
356 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
357 db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
358 db->db_state != DB_FILL && !dn->dn_free_txg) {
360 * If the blkptr isn't set but the buffer contains nonzero data,
361 * it had better be dirty, otherwise we'll lose that
362 * data when we evict this buffer.
364 if (db->db_dirtycnt == 0) {
365 uint64_t *buf = db->db.db_data;
368 for (i = 0; i < db->db.db_size >> 3; i++) {
377 dbuf_update_data(dmu_buf_impl_t *db)
379 ASSERT(MUTEX_HELD(&db->db_mtx));
380 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
381 ASSERT(!refcount_is_zero(&db->db_holds));
382 *db->db_user_data_ptr_ptr = db->db.db_data;
387 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
389 ASSERT(MUTEX_HELD(&db->db_mtx));
390 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
393 ASSERT(buf->b_data != NULL);
394 db->db.db_data = buf->b_data;
395 if (!arc_released(buf))
396 arc_set_callback(buf, dbuf_do_evict, db);
397 dbuf_update_data(db);
400 db->db.db_data = NULL;
401 db->db_state = DB_UNCACHED;
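/*
 * Worked example for dbuf_whichblock() below (illustrative only):
 * for a dnode with 128KB data blocks (dn_datablkshift == 17),
 * offset 300000 falls in block 300000 >> 17 == 2.  Objects that fit
 * in a single block have dn_datablkshift == 0; any offset inside
 * that block maps to block 0.
 */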
406 dbuf_whichblock(dnode_t *dn, uint64_t offset)
408 if (dn->dn_datablkshift) {
409 return (offset >> dn->dn_datablkshift);
411 ASSERT3U(offset, <, dn->dn_datablksz);
417 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
419 dmu_buf_impl_t *db = vdb;
421 mutex_enter(&db->db_mtx);
422 ASSERT3U(db->db_state, ==, DB_READ);
424 * All reads are synchronous, so we must have a hold on the dbuf
426 ASSERT(refcount_count(&db->db_holds) > 0);
427 ASSERT(db->db_buf == NULL);
428 ASSERT(db->db.db_data == NULL);
429 if (db->db_level == 0 && db->db_freed_in_flight) {
430 /* we were freed in flight; disregard any error */
431 arc_release(buf, db);
432 bzero(buf->b_data, db->db.db_size);
434 db->db_freed_in_flight = FALSE;
435 dbuf_set_data(db, buf);
436 db->db_state = DB_CACHED;
437 } else if (zio == NULL || zio->io_error == 0) {
438 dbuf_set_data(db, buf);
439 db->db_state = DB_CACHED;
441 ASSERT(db->db_blkid != DB_BONUS_BLKID);
442 ASSERT3P(db->db_buf, ==, NULL);
443 VERIFY(arc_buf_remove_ref(buf, db) == 1);
444 db->db_state = DB_UNCACHED;
446 cv_broadcast(&db->db_changed);
447 mutex_exit(&db->db_mtx);
452 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
454 dnode_t *dn = db->db_dnode;
456 uint32_t aflags = ARC_NOWAIT;
459 ASSERT(!refcount_is_zero(&db->db_holds));
460 /* We need the struct_rwlock to prevent db_blkptr from changing. */
461 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
462 ASSERT(MUTEX_HELD(&db->db_mtx));
463 ASSERT(db->db_state == DB_UNCACHED);
464 ASSERT(db->db_buf == NULL);
466 if (db->db_blkid == DB_BONUS_BLKID) {
467 int bonuslen = dn->dn_bonuslen;
469 ASSERT3U(bonuslen, <=, db->db.db_size);
470 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
471 arc_space_consume(DN_MAX_BONUSLEN);
472 if (bonuslen < DN_MAX_BONUSLEN)
473 bzero(db->db.db_data, DN_MAX_BONUSLEN);
474 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
476 dbuf_update_data(db);
477 db->db_state = DB_CACHED;
478 mutex_exit(&db->db_mtx);
483 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
484 * processes the delete record and clears the bp while we are waiting
485 * for the dn_mtx (resulting in a "no" from block_freed).
487 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
488 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
489 BP_IS_HOLE(db->db_blkptr)))) {
490 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
492 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
493 db->db.db_size, db, type));
494 bzero(db->db.db_data, db->db.db_size);
495 db->db_state = DB_CACHED;
496 *flags |= DB_RF_CACHED;
497 mutex_exit(&db->db_mtx);
501 db->db_state = DB_READ;
502 mutex_exit(&db->db_mtx);
504 if (DBUF_IS_L2CACHEABLE(db))
505 aflags |= ARC_L2CACHE;
507 zb.zb_objset = db->db_objset->os_dsl_dataset ?
508 db->db_objset->os_dsl_dataset->ds_object : 0;
509 zb.zb_object = db->db.db_object;
510 zb.zb_level = db->db_level;
511 zb.zb_blkid = db->db_blkid;
513 dbuf_add_ref(db, NULL);
514 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
517 pbuf = db->db_parent->db_buf;
519 pbuf = db->db_objset->os_phys_buf;
521 (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
522 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
523 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
525 if (aflags & ARC_CACHED)
526 *flags |= DB_RF_CACHED;
530 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
533 int havepzio = (zio != NULL);
537 * We don't have to hold the mutex to check db_state because it
538 * can't be freed while we have a hold on the buffer.
540 ASSERT(!refcount_is_zero(&db->db_holds));
542 if ((flags & DB_RF_HAVESTRUCT) == 0)
543 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
545 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
546 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
547 DBUF_IS_CACHEABLE(db);
549 mutex_enter(&db->db_mtx);
550 if (db->db_state == DB_CACHED) {
551 mutex_exit(&db->db_mtx);
553 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
554 db->db.db_size, TRUE);
555 if ((flags & DB_RF_HAVESTRUCT) == 0)
556 rw_exit(&db->db_dnode->dn_struct_rwlock);
557 } else if (db->db_state == DB_UNCACHED) {
559 zio = zio_root(db->db_dnode->dn_objset->os_spa,
560 NULL, NULL, ZIO_FLAG_CANFAIL);
562 dbuf_read_impl(db, zio, &flags);
564 /* dbuf_read_impl has dropped db_mtx for us */
567 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
568 db->db.db_size, flags & DB_RF_CACHED);
570 if ((flags & DB_RF_HAVESTRUCT) == 0)
571 rw_exit(&db->db_dnode->dn_struct_rwlock);
576 mutex_exit(&db->db_mtx);
578 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
579 db->db.db_size, TRUE);
580 if ((flags & DB_RF_HAVESTRUCT) == 0)
581 rw_exit(&db->db_dnode->dn_struct_rwlock);
583 mutex_enter(&db->db_mtx);
584 if ((flags & DB_RF_NEVERWAIT) == 0) {
585 while (db->db_state == DB_READ ||
586 db->db_state == DB_FILL) {
587 ASSERT(db->db_state == DB_READ ||
588 (flags & DB_RF_HAVESTRUCT) == 0);
589 cv_wait(&db->db_changed, &db->db_mtx);
591 if (db->db_state == DB_UNCACHED)
594 mutex_exit(&db->db_mtx);
597 ASSERT(err || havepzio || db->db_state == DB_CACHED);
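/*
 * Typical caller pattern (an illustrative sketch modeled on
 * dmu_buf_hold(); exact callers vary):
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	db = dbuf_hold(dn, blkid, FTAG);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 *		...
 *		dbuf_rele(db, FTAG);
 *	}
 *
 * Since DB_RF_HAVESTRUCT is not passed, dbuf_read() takes the
 * dn_struct_rwlock itself, as above.
 */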
602 dbuf_noread(dmu_buf_impl_t *db)
604 ASSERT(!refcount_is_zero(&db->db_holds));
605 ASSERT(db->db_blkid != DB_BONUS_BLKID);
606 mutex_enter(&db->db_mtx);
607 while (db->db_state == DB_READ || db->db_state == DB_FILL)
608 cv_wait(&db->db_changed, &db->db_mtx);
609 if (db->db_state == DB_UNCACHED) {
610 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
612 ASSERT(db->db_buf == NULL);
613 ASSERT(db->db.db_data == NULL);
614 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
615 db->db.db_size, db, type));
616 db->db_state = DB_FILL;
618 ASSERT3U(db->db_state, ==, DB_CACHED);
620 mutex_exit(&db->db_mtx);
624 * This is our just-in-time copy function.  It makes a copy of
625 * buffers that have been modified in a previous transaction
626 * group before we modify them in the current active group.
628 * This function is used in two places: when we are dirtying a
629 * buffer for the first time in a txg, and when we are freeing
630 * a range in a dnode that includes this buffer.
632 * Note that when we are called from dbuf_free_range() we do
633 * not put a hold on the buffer, we just traverse the active
634 * dbuf list for the dnode.
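*
* Illustrative timeline: a buffer dirtied in txg 10 is still syncing
* when a writer re-dirties it in txg 11.  The txg-10 dirty record gets
* its own copy of the data here, so modifications made through
* db_data in txg 11 cannot leak into the txg-10 write already in
* flight.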
637 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
639 dbuf_dirty_record_t *dr = db->db_last_dirty;
641 ASSERT(MUTEX_HELD(&db->db_mtx));
642 ASSERT(db->db.db_data != NULL);
643 ASSERT(db->db_level == 0);
644 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
647 (dr->dt.dl.dr_data !=
648 ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
652 * If the last dirty record for this dbuf has not yet synced
653 * and it's referencing the dbuf data, either:
654 * reset the reference to point to a new copy,
655 * or (if there are no active holders)
656 * just null out the current db_data pointer.
658 ASSERT(dr->dr_txg >= txg - 2);
659 if (db->db_blkid == DB_BONUS_BLKID) {
660 /* Note that the data bufs here are zio_bufs */
661 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
662 arc_space_consume(DN_MAX_BONUSLEN);
663 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
664 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
665 int size = db->db.db_size;
666 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
667 dr->dt.dl.dr_data = arc_buf_alloc(
668 db->db_dnode->dn_objset->os_spa, size, db, type);
669 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
671 dbuf_set_data(db, NULL);
676 dbuf_unoverride(dbuf_dirty_record_t *dr)
678 dmu_buf_impl_t *db = dr->dr_dbuf;
679 uint64_t txg = dr->dr_txg;
681 ASSERT(MUTEX_HELD(&db->db_mtx));
682 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
683 ASSERT(db->db_level == 0);
685 if (db->db_blkid == DB_BONUS_BLKID ||
686 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
689 /* free this block */
690 if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
691 /* XXX can get silent EIO here */
692 (void) dsl_free(NULL,
693 spa_get_dsl(db->db_dnode->dn_objset->os_spa),
694 txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
696 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
698 * Release the already-written buffer, so we leave it in
699 * a consistent dirty state. Note that all callers are
700 * modifying the buffer, so they will immediately do
701 * another (redundant) arc_release(). Therefore, leave
702 * the buf thawed to save the effort of freezing &
703 * immediately re-thawing it.
705 arc_release(dr->dt.dl.dr_data, db);
709 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
710 * data blocks in the free range, so that any future readers will find
711 * empty blocks.  Also, if we happen across any level-1 dbufs in the
712 * range that have not already been marked dirty, mark them dirty so
713 * they stay in memory.
716 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
718 dmu_buf_impl_t *db, *db_next;
719 uint64_t txg = tx->tx_txg;
720 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
721 uint64_t first_l1 = start >> epbs;
722 uint64_t last_l1 = end >> epbs;
724 if (end > dn->dn_maxblkid) {
725 end = dn->dn_maxblkid;
726 last_l1 = end >> epbs;
728 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
729 mutex_enter(&dn->dn_dbufs_mtx);
730 for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
731 db_next = list_next(&dn->dn_dbufs, db);
732 ASSERT(db->db_blkid != DB_BONUS_BLKID);
734 if (db->db_level == 1 &&
735 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
736 mutex_enter(&db->db_mtx);
737 if (db->db_last_dirty &&
738 db->db_last_dirty->dr_txg < txg) {
739 dbuf_add_ref(db, FTAG);
740 mutex_exit(&db->db_mtx);
741 dbuf_will_dirty(db, tx);
744 mutex_exit(&db->db_mtx);
748 if (db->db_level != 0)
750 dprintf_dbuf(db, "found buf %s\n", "");
751 if (db->db_blkid < start || db->db_blkid > end)
754 /* found a level 0 buffer in the range */
755 if (dbuf_undirty(db, tx))
758 mutex_enter(&db->db_mtx);
759 if (db->db_state == DB_UNCACHED ||
760 db->db_state == DB_EVICTING) {
761 ASSERT(db->db.db_data == NULL);
762 mutex_exit(&db->db_mtx);
765 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
766 /* will be handled in dbuf_read_done or dbuf_rele */
767 db->db_freed_in_flight = TRUE;
768 mutex_exit(&db->db_mtx);
771 if (refcount_count(&db->db_holds) == 0) {
776 /* The dbuf is referenced */
778 if (db->db_last_dirty != NULL) {
779 dbuf_dirty_record_t *dr = db->db_last_dirty;
781 if (dr->dr_txg == txg) {
783 * This buffer is "in use"; re-adjust the file
784 * size to reflect that this buffer may
785 * contain new data when we sync.
787 if (db->db_blkid > dn->dn_maxblkid)
788 dn->dn_maxblkid = db->db_blkid;
792 * This dbuf is not dirty in the open context.
793 * Either uncache it (if it's not referenced in
794 * the open context) or reset its contents to
797 dbuf_fix_old_data(db, txg);
800 /* clear the contents if it's cached */
801 if (db->db_state == DB_CACHED) {
802 ASSERT(db->db.db_data != NULL);
803 arc_release(db->db_buf, db);
804 bzero(db->db.db_data, db->db.db_size);
805 arc_buf_freeze(db->db_buf);
808 mutex_exit(&db->db_mtx);
810 mutex_exit(&dn->dn_dbufs_mtx);
814 dbuf_block_freeable(dmu_buf_impl_t *db)
816 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
817 uint64_t birth_txg = 0;
820 * We don't need any locking to protect db_blkptr:
821 * If it's syncing, then db_last_dirty will be set
822 * so we'll ignore db_blkptr.
824 ASSERT(MUTEX_HELD(&db->db_mtx));
825 if (db->db_last_dirty)
826 birth_txg = db->db_last_dirty->dr_txg;
827 else if (db->db_blkptr)
828 birth_txg = db->db_blkptr->blk_birth;
830 /* If we don't exist or are in a snapshot, we can't be freed */
832 return (ds == NULL ||
833 dsl_dataset_block_freeable(ds, birth_txg));
839 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
841 arc_buf_t *buf, *obuf;
842 int osize = db->db.db_size;
843 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
845 ASSERT(db->db_blkid != DB_BONUS_BLKID);
847 /* XXX does *this* func really need the lock? */
848 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
851 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
852 * is OK, because there can be no other references to the db
853 * when we are changing its size, so no concurrent DB_FILL can
857 * XXX we should be doing a dbuf_read, checking the return
858 * value and returning that up to our callers
860 dbuf_will_dirty(db, tx);
862 /* create the data buffer for the new block */
863 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
865 /* copy old block data to the new block */
867 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
868 /* zero the remainder */
870 bzero((uint8_t *)buf->b_data + osize, size - osize);
872 mutex_enter(&db->db_mtx);
873 dbuf_set_data(db, buf);
874 VERIFY(arc_buf_remove_ref(obuf, db) == 1);
875 db->db.db_size = size;
877 if (db->db_level == 0) {
878 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
879 db->db_last_dirty->dt.dl.dr_data = buf;
881 mutex_exit(&db->db_mtx);
883 dnode_willuse_space(db->db_dnode, size-osize, tx);
886 dbuf_dirty_record_t *
887 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
889 dnode_t *dn = db->db_dnode;
890 objset_impl_t *os = dn->dn_objset;
891 dbuf_dirty_record_t **drp, *dr;
892 int drop_struct_lock = FALSE;
893 boolean_t do_free_accounting = B_FALSE;
894 int txgoff = tx->tx_txg & TXG_MASK;
896 ASSERT(tx->tx_txg != 0);
897 ASSERT(!refcount_is_zero(&db->db_holds));
898 DMU_TX_DIRTY_BUF(tx, db);
901 * Shouldn't dirty a regular buffer in syncing context. Private
902 * objects may be dirtied in syncing context, but only if they
903 * were already pre-dirtied in open context.
904 * XXX We may want to prohibit dirtying in syncing context even
905 * if they did pre-dirty.
907 ASSERT(!dmu_tx_is_syncing(tx) ||
908 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
909 dn->dn_object == DMU_META_DNODE_OBJECT ||
910 dn->dn_objset->os_dsl_dataset == NULL ||
911 dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
914 * We make this assert for private objects as well, but after we
915 * check if we're already dirty. They are allowed to re-dirty
916 * in syncing context.
918 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
919 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
920 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
922 mutex_enter(&db->db_mtx);
924 * XXX make this true for indirects too? The problem is that
925 * transactions created with dmu_tx_create_assigned() from
926 * syncing context don't bother holding ahead.
928 ASSERT(db->db_level != 0 ||
929 db->db_state == DB_CACHED || db->db_state == DB_FILL);
931 mutex_enter(&dn->dn_mtx);
933 * Don't set dirtyctx to SYNC if we're just modifying this as we
934 * initialize the objset.
936 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
937 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
939 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
940 ASSERT(dn->dn_dirtyctx_firstset == NULL);
941 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
943 mutex_exit(&dn->dn_mtx);
946 * If this buffer is already dirty, we're done.
948 drp = &db->db_last_dirty;
949 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
950 db->db.db_object == DMU_META_DNODE_OBJECT);
951 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
953 if (dr && dr->dr_txg == tx->tx_txg) {
954 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
956 * If this buffer has already been written out,
957 * we now need to reset its state.
960 if (db->db.db_object != DMU_META_DNODE_OBJECT)
961 arc_buf_thaw(db->db_buf);
963 mutex_exit(&db->db_mtx);
968 * Only valid if not already dirty.
970 ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
971 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
973 ASSERT3U(dn->dn_nlevels, >, db->db_level);
974 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
975 dn->dn_phys->dn_nlevels > db->db_level ||
976 dn->dn_next_nlevels[txgoff] > db->db_level ||
977 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
978 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
981 * We should only be dirtying in syncing context if it's the
982 * mos, a spa os, or we're initializing the os. However, we are
983 * allowed to dirty in syncing context provided we already
984 * dirtied it in open context. Hence we must make this
985 * assertion only if we're not already dirty.
987 ASSERT(!dmu_tx_is_syncing(tx) ||
988 os->os_dsl_dataset == NULL ||
989 !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
990 !BP_IS_HOLE(os->os_rootbp));
991 ASSERT(db->db.db_size != 0);
993 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
995 if (db->db_blkid != DB_BONUS_BLKID) {
997 * Update the accounting.
998 * Note: we delay "free accounting" until after we drop
999 * the db_mtx. This keeps us from grabbing other locks
1000 * (and possibly deadlocking) in bp_get_dasize() while
1001 * also holding the db_mtx.
1003 dnode_willuse_space(dn, db->db.db_size, tx);
1004 do_free_accounting = dbuf_block_freeable(db);
1008 * If this buffer is dirty in an old transaction group we need
1009 * to make a copy of it so that the changes we make in this
1010 * transaction group won't leak out when we sync the older txg.
1012 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1013 if (db->db_level == 0) {
1014 void *data_old = db->db_buf;
1016 if (db->db_blkid == DB_BONUS_BLKID) {
1017 dbuf_fix_old_data(db, tx->tx_txg);
1018 data_old = db->db.db_data;
1019 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1021 * Release the data buffer from the cache so that we
1022 * can modify it without impacting possible other users
1023 * of this cached data block. Note that indirect
1024 * blocks and private objects are not released until the
1025 * syncing state (since they are only modified then).
1027 arc_release(db->db_buf, db);
1028 dbuf_fix_old_data(db, tx->tx_txg);
1029 data_old = db->db_buf;
1031 ASSERT(data_old != NULL);
1032 dr->dt.dl.dr_data = data_old;
1034 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1035 list_create(&dr->dt.di.dr_children,
1036 sizeof (dbuf_dirty_record_t),
1037 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1040 dr->dr_txg = tx->tx_txg;
1045 * We could have been freed_in_flight between the dbuf_noread
1046 * and dbuf_dirty. We win, as though the dbuf_noread() had
1047 * happened after the free.
1049 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
1050 mutex_enter(&dn->dn_mtx);
1051 dnode_clear_range(dn, db->db_blkid, 1, tx);
1052 mutex_exit(&dn->dn_mtx);
1053 db->db_freed_in_flight = FALSE;
1057 * This buffer is now part of this txg
1059 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1060 db->db_dirtycnt += 1;
1061 ASSERT3U(db->db_dirtycnt, <=, 3);
1063 mutex_exit(&db->db_mtx);
1065 if (db->db_blkid == DB_BONUS_BLKID) {
1066 mutex_enter(&dn->dn_mtx);
1067 ASSERT(!list_link_active(&dr->dr_dirty_node));
1068 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1069 mutex_exit(&dn->dn_mtx);
1070 dnode_setdirty(dn, tx);
1072 } else if (do_free_accounting) {
1073 blkptr_t *bp = db->db_blkptr;
1074 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1075 bp_get_dasize(os->os_spa, bp) : db->db.db_size;
1077 * This is only a guess -- if the dbuf is dirty
1078 * in a previous txg, we don't know how much
1079 * space it will use on disk yet. We should
1080 * really have the struct_rwlock to access
1081 * db_blkptr, but since this is just a guess,
1082 * it's OK if we get an odd answer.
1084 dnode_willuse_space(dn, -willfree, tx);
1087 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1088 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1089 drop_struct_lock = TRUE;
1092 if (db->db_level == 0) {
1093 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1094 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1097 if (db->db_level+1 < dn->dn_nlevels) {
1098 dmu_buf_impl_t *parent = db->db_parent;
1099 dbuf_dirty_record_t *di;
1100 int parent_held = FALSE;
1102 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1103 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1105 parent = dbuf_hold_level(dn, db->db_level+1,
1106 db->db_blkid >> epbs, FTAG);
1109 if (drop_struct_lock)
1110 rw_exit(&dn->dn_struct_rwlock);
1111 ASSERT3U(db->db_level+1, ==, parent->db_level);
1112 di = dbuf_dirty(parent, tx);
1114 dbuf_rele(parent, FTAG);
1116 mutex_enter(&db->db_mtx);
1117 /* possible race with dbuf_undirty() */
1118 if (db->db_last_dirty == dr ||
1119 dn->dn_object == DMU_META_DNODE_OBJECT) {
1120 mutex_enter(&di->dt.di.dr_mtx);
1121 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1122 ASSERT(!list_link_active(&dr->dr_dirty_node));
1123 list_insert_tail(&di->dt.di.dr_children, dr);
1124 mutex_exit(&di->dt.di.dr_mtx);
1127 mutex_exit(&db->db_mtx);
1129 ASSERT(db->db_level+1 == dn->dn_nlevels);
1130 ASSERT(db->db_blkid < dn->dn_nblkptr);
1131 ASSERT(db->db_parent == NULL ||
1132 db->db_parent == db->db_dnode->dn_dbuf);
1133 mutex_enter(&dn->dn_mtx);
1134 ASSERT(!list_link_active(&dr->dr_dirty_node));
1135 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1136 mutex_exit(&dn->dn_mtx);
1137 if (drop_struct_lock)
1138 rw_exit(&dn->dn_struct_rwlock);
1141 dnode_setdirty(dn, tx);
1146 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1148 dnode_t *dn = db->db_dnode;
1149 uint64_t txg = tx->tx_txg;
1150 dbuf_dirty_record_t *dr, **drp;
1153 ASSERT(db->db_blkid != DB_BONUS_BLKID);
1155 mutex_enter(&db->db_mtx);
1158 * If this buffer is not dirty, we're done.
1160 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1161 if (dr->dr_txg <= txg)
1163 if (dr == NULL || dr->dr_txg < txg) {
1164 mutex_exit(&db->db_mtx);
1167 ASSERT(dr->dr_txg == txg);
1170 * If this buffer is currently held, we cannot undirty
1171 * it, since one of the current holders may be in the
1172 * middle of an update. Note that users of dbuf_undirty()
1173 * should not place a hold on the dbuf before the call.
1175 if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1176 mutex_exit(&db->db_mtx);
1177 /* Make sure we don't toss this buffer at sync phase */
1178 mutex_enter(&dn->dn_mtx);
1179 dnode_clear_range(dn, db->db_blkid, 1, tx);
1180 mutex_exit(&dn->dn_mtx);
1184 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1186 ASSERT(db->db.db_size != 0);
1188 /* XXX would be nice to fix up dn_towrite_space[] */
1192 if (dr->dr_parent) {
1193 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1194 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1195 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1196 } else if (db->db_level+1 == dn->dn_nlevels) {
1197 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1198 mutex_enter(&dn->dn_mtx);
1199 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1200 mutex_exit(&dn->dn_mtx);
1203 if (db->db_level == 0) {
1204 dbuf_unoverride(dr);
1206 ASSERT(db->db_buf != NULL);
1207 ASSERT(dr->dt.dl.dr_data != NULL);
1208 if (dr->dt.dl.dr_data != db->db_buf)
1209 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
1211 ASSERT(db->db_buf != NULL);
1212 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1213 mutex_destroy(&dr->dt.di.dr_mtx);
1214 list_destroy(&dr->dt.di.dr_children);
1216 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1218 ASSERT(db->db_dirtycnt > 0);
1219 db->db_dirtycnt -= 1;
1221 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1222 arc_buf_t *buf = db->db_buf;
1224 ASSERT(arc_released(buf));
1225 dbuf_set_data(db, NULL);
1226 VERIFY(arc_buf_remove_ref(buf, db) == 1);
1231 mutex_exit(&db->db_mtx);
1235 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1237 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1239 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1241 ASSERT(tx->tx_txg != 0);
1242 ASSERT(!refcount_is_zero(&db->db_holds));
1244 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
1245 rf |= DB_RF_HAVESTRUCT;
1246 (void) dbuf_read(db, NULL, rf);
1247 (void) dbuf_dirty(db, tx);
1251 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1253 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1255 ASSERT(db->db_blkid != DB_BONUS_BLKID);
1256 ASSERT(tx->tx_txg != 0);
1257 ASSERT(db->db_level == 0);
1258 ASSERT(!refcount_is_zero(&db->db_holds));
1260 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1261 dmu_tx_private_ok(tx));
1264 (void) dbuf_dirty(db, tx);
1267 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1270 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1272 mutex_enter(&db->db_mtx);
1275 if (db->db_state == DB_FILL) {
1276 if (db->db_level == 0 && db->db_freed_in_flight) {
1277 ASSERT(db->db_blkid != DB_BONUS_BLKID);
1278 /* we were freed while filling */
1279 /* XXX dbuf_undirty? */
1280 bzero(db->db.db_data, db->db.db_size);
1281 db->db_freed_in_flight = FALSE;
1283 db->db_state = DB_CACHED;
1284 cv_broadcast(&db->db_changed);
1286 mutex_exit(&db->db_mtx);
1290 * "Clear" the contents of this dbuf. This will mark the dbuf
1291 * EVICTING and clear *most* of its references.  Unfortunately,
1292 * when we are not holding the dn_dbufs_mtx, we can't clear the
1293 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1294 * in this case. For callers from the DMU we will usually see:
1295 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1296 * For the arc callback, we will usually see:
1297 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1298 * Sometimes, though, we will get a mix of these two:
1299 * DMU: dbuf_clear()->arc_buf_evict()
1300 * ARC: dbuf_do_evict()->dbuf_destroy()
1303 dbuf_clear(dmu_buf_impl_t *db)
1305 dnode_t *dn = db->db_dnode;
1306 dmu_buf_impl_t *parent = db->db_parent;
1307 dmu_buf_impl_t *dndb = dn->dn_dbuf;
1308 int dbuf_gone = FALSE;
1310 ASSERT(MUTEX_HELD(&db->db_mtx));
1311 ASSERT(refcount_is_zero(&db->db_holds));
1313 dbuf_evict_user(db);
1315 if (db->db_state == DB_CACHED) {
1316 ASSERT(db->db.db_data != NULL);
1317 if (db->db_blkid == DB_BONUS_BLKID) {
1318 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1319 arc_space_return(DN_MAX_BONUSLEN);
1321 db->db.db_data = NULL;
1322 db->db_state = DB_UNCACHED;
1325 ASSERT3U(db->db_state, ==, DB_UNCACHED);
1326 ASSERT(db->db_data_pending == NULL);
1328 db->db_state = DB_EVICTING;
1329 db->db_blkptr = NULL;
1331 if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1332 list_remove(&dn->dn_dbufs, db);
1334 db->db_dnode = NULL;
1338 dbuf_gone = arc_buf_evict(db->db_buf);
1341 mutex_exit(&db->db_mtx);
1344 * If this dbuf is referenced from an indirect dbuf,
1345 * decrement the ref count on the indirect dbuf.
1347 if (parent && parent != dndb)
1348 dbuf_rele(parent, db);
1352 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1353 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1360 ASSERT(blkid != DB_BONUS_BLKID);
1362 if (dn->dn_phys->dn_nlevels == 0)
1365 nlevels = dn->dn_phys->dn_nlevels;
1367 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1369 ASSERT3U(level * epbs, <, 64);
1370 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1371 if (level >= nlevels ||
1372 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1373 /* the buffer has no parent yet */
1375 } else if (level < nlevels-1) {
1376 /* this block is referenced from an indirect block */
1377 int err = dbuf_hold_impl(dn, level+1,
1378 blkid >> epbs, fail_sparse, NULL, parentp);
1381 err = dbuf_read(*parentp, NULL,
1382 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1384 dbuf_rele(*parentp, NULL);
1388 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1389 (blkid & ((1ULL << epbs) - 1));
1392 /* the block is referenced from the dnode */
1393 ASSERT3U(level, ==, nlevels-1);
1394 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1395 blkid < dn->dn_phys->dn_nblkptr);
1397 dbuf_add_ref(dn->dn_dbuf, NULL);
1398 *parentp = dn->dn_dbuf;
1400 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1405 static dmu_buf_impl_t *
1406 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1407 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1409 objset_impl_t *os = dn->dn_objset;
1410 dmu_buf_impl_t *db, *odb;
1412 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1413 ASSERT(dn->dn_type != DMU_OT_NONE);
1415 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1418 db->db.db_object = dn->dn_object;
1419 db->db_level = level;
1420 db->db_blkid = blkid;
1421 db->db_last_dirty = NULL;
1422 db->db_dirtycnt = 0;
1424 db->db_parent = parent;
1425 db->db_blkptr = blkptr;
1427 db->db_user_ptr = NULL;
1428 db->db_user_data_ptr_ptr = NULL;
1429 db->db_evict_func = NULL;
1430 db->db_immediate_evict = 0;
1431 db->db_freed_in_flight = 0;
1433 if (blkid == DB_BONUS_BLKID) {
1434 ASSERT3P(parent, ==, dn->dn_dbuf);
1435 db->db.db_size = DN_MAX_BONUSLEN -
1436 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1437 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1438 db->db.db_offset = DB_BONUS_BLKID;
1439 db->db_state = DB_UNCACHED;
1440 /* the bonus dbuf is not placed in the hash table */
1441 arc_space_consume(sizeof (dmu_buf_impl_t));
1445 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1446 db->db.db_size = blocksize;
1447 db->db.db_offset = db->db_blkid * blocksize;
1451 * Hold the dn_dbufs_mtx while we insert the new dbuf
1452 * into the hash table *and* add it to the dbufs list.
1453 * This prevents a possible deadlock with someone
1454 * trying to look up this dbuf before it's added to the
1457 mutex_enter(&dn->dn_dbufs_mtx);
1458 db->db_state = DB_EVICTING;
1459 if ((odb = dbuf_hash_insert(db)) != NULL) {
1460 /* someone else inserted it first */
1461 kmem_cache_free(dbuf_cache, db);
1462 mutex_exit(&dn->dn_dbufs_mtx);
1465 list_insert_head(&dn->dn_dbufs, db);
1466 db->db_state = DB_UNCACHED;
1467 mutex_exit(&dn->dn_dbufs_mtx);
1468 arc_space_consume(sizeof (dmu_buf_impl_t));
1470 if (parent && parent != dn->dn_dbuf)
1471 dbuf_add_ref(parent, db);
1473 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1474 refcount_count(&dn->dn_holds) > 0);
1475 (void) refcount_add(&dn->dn_holds, db);
1477 dprintf_dbuf(db, "db=%p\n", db);
1483 dbuf_do_evict(void *private)
1485 arc_buf_t *buf = private;
1486 dmu_buf_impl_t *db = buf->b_private;
1488 if (!MUTEX_HELD(&db->db_mtx))
1489 mutex_enter(&db->db_mtx);
1491 ASSERT(refcount_is_zero(&db->db_holds));
1493 if (db->db_state != DB_EVICTING) {
1494 ASSERT(db->db_state == DB_CACHED);
1499 mutex_exit(&db->db_mtx);
1506 dbuf_destroy(dmu_buf_impl_t *db)
1508 ASSERT(refcount_is_zero(&db->db_holds));
1510 if (db->db_blkid != DB_BONUS_BLKID) {
1512 * If this dbuf is still on the dn_dbufs list,
1513 * remove it from that list.
1516 dnode_t *dn = db->db_dnode;
1518 mutex_enter(&dn->dn_dbufs_mtx);
1519 list_remove(&dn->dn_dbufs, db);
1520 mutex_exit(&dn->dn_dbufs_mtx);
1523 db->db_dnode = NULL;
1525 dbuf_hash_remove(db);
1527 db->db_parent = NULL;
1530 ASSERT(!list_link_active(&db->db_link));
1531 ASSERT(db->db.db_data == NULL);
1532 ASSERT(db->db_hash_next == NULL);
1533 ASSERT(db->db_blkptr == NULL);
1534 ASSERT(db->db_data_pending == NULL);
1536 kmem_cache_free(dbuf_cache, db);
1537 arc_space_return(sizeof (dmu_buf_impl_t));
1541 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1543 dmu_buf_impl_t *db = NULL;
1544 blkptr_t *bp = NULL;
1546 ASSERT(blkid != DB_BONUS_BLKID);
1547 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1549 if (dnode_block_freed(dn, blkid))
1552 /* dbuf_find() returns with db_mtx held */
1553 if (db = dbuf_find(dn, 0, blkid)) {
1554 if (refcount_count(&db->db_holds) > 0) {
1556 * This dbuf is active. We assume that it is
1557 * already CACHED, or else about to be either
1560 mutex_exit(&db->db_mtx);
1563 mutex_exit(&db->db_mtx);
1567 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1568 if (bp && !BP_IS_HOLE(bp)) {
1570 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1572 zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
1573 dn->dn_objset->os_dsl_dataset->ds_object : 0;
1574 zb.zb_object = dn->dn_object;
1576 zb.zb_blkid = blkid;
1581 pbuf = dn->dn_objset->os_phys_buf;
1583 (void) arc_read(NULL, dn->dn_objset->os_spa,
1584 bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
1585 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1589 dbuf_rele(db, NULL);
1594 * Returns with db_holds incremented, and db_mtx not held.
1595 * Note: dn_struct_rwlock must be held.
1598 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1599 void *tag, dmu_buf_impl_t **dbp)
1601 dmu_buf_impl_t *db, *parent = NULL;
1603 ASSERT(blkid != DB_BONUS_BLKID);
1604 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1605 ASSERT3U(dn->dn_nlevels, >, level);
1609 /* dbuf_find() returns with db_mtx held */
1610 db = dbuf_find(dn, level, blkid);
1613 blkptr_t *bp = NULL;
1616 ASSERT3P(parent, ==, NULL);
1617 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1619 if (err == 0 && bp && BP_IS_HOLE(bp))
1623 dbuf_rele(parent, NULL);
1627 if (err && err != ENOENT)
1629 db = dbuf_create(dn, level, blkid, parent, bp);
1632 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1633 arc_buf_add_ref(db->db_buf, db);
1634 if (db->db_buf->b_data == NULL) {
1637 dbuf_rele(parent, NULL);
1642 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1645 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1648 * If this buffer is currently syncing out, and we are
1649 * still referencing it from db_data, we need to make a copy
1650 * of it in case we decide we want to dirty it again in this txg.
1652 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
1653 dn->dn_object != DMU_META_DNODE_OBJECT &&
1654 db->db_state == DB_CACHED && db->db_data_pending) {
1655 dbuf_dirty_record_t *dr = db->db_data_pending;
1657 if (dr->dt.dl.dr_data == db->db_buf) {
1658 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1661 arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
1662 db->db.db_size, db, type));
1663 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1668 (void) refcount_add(&db->db_holds, tag);
1669 dbuf_update_data(db);
1671 mutex_exit(&db->db_mtx);
1673 /* NOTE: we can't rele the parent until after we drop the db_mtx */
1675 dbuf_rele(parent, NULL);
1677 ASSERT3P(db->db_dnode, ==, dn);
1678 ASSERT3U(db->db_blkid, ==, blkid);
1679 ASSERT3U(db->db_level, ==, level);
1686 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1689 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1690 return (err ? NULL : db);
1694 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1697 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1698 return (err ? NULL : db);
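/*
 * Hold/release pairing (an illustrative sketch): the tag passed to
 * dbuf_hold() or dbuf_hold_level() must match the tag later passed
 * to dbuf_rele():
 *
 *	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
 *	if (db != NULL) {
 *		...
 *		dbuf_rele(db, FTAG);
 *	}
 *
 * Both wrappers return NULL when dbuf_hold_impl() fails.
 */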
1702 dbuf_create_bonus(dnode_t *dn)
1704 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1706 ASSERT(dn->dn_bonus == NULL);
1707 dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
1710 #pragma weak dmu_buf_add_ref = dbuf_add_ref
1712 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1714 int64_t holds = refcount_add(&db->db_holds, tag);
1718 #pragma weak dmu_buf_rele = dbuf_rele
1720 dbuf_rele(dmu_buf_impl_t *db, void *tag)
1724 mutex_enter(&db->db_mtx);
1727 holds = refcount_remove(&db->db_holds, tag);
1731 * We can't freeze indirects if there is a possibility that they
1732 * may be modified in the current syncing context.
1734 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
1735 arc_buf_freeze(db->db_buf);
1737 if (holds == db->db_dirtycnt &&
1738 db->db_level == 0 && db->db_immediate_evict)
1739 dbuf_evict_user(db);
1742 if (db->db_blkid == DB_BONUS_BLKID) {
1743 mutex_exit(&db->db_mtx);
1744 dnode_rele(db->db_dnode, db);
1745 } else if (db->db_buf == NULL) {
1747 * This is a special case: we never associated this
1748 * dbuf with any data allocated from the ARC.
1750 ASSERT3U(db->db_state, ==, DB_UNCACHED);
1752 } else if (arc_released(db->db_buf)) {
1753 arc_buf_t *buf = db->db_buf;
1755 * This dbuf has anonymous data associated with it.
1757 dbuf_set_data(db, NULL);
1758 VERIFY(arc_buf_remove_ref(buf, db) == 1);
1761 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
1762 if (!DBUF_IS_CACHEABLE(db))
1765 mutex_exit(&db->db_mtx);
1768 mutex_exit(&db->db_mtx);
1772 #pragma weak dmu_buf_refcount = dbuf_refcount
1774 dbuf_refcount(dmu_buf_impl_t *db)
1776 return (refcount_count(&db->db_holds));
1780 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1781 dmu_buf_evict_func_t *evict_func)
1783 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1784 user_data_ptr_ptr, evict_func));
1788 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1789 dmu_buf_evict_func_t *evict_func)
1791 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1793 db->db_immediate_evict = TRUE;
1794 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1795 user_data_ptr_ptr, evict_func));
1799 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
1800 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
1802 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1803 ASSERT(db->db_level == 0);
1805 ASSERT((user_ptr == NULL) == (evict_func == NULL));
1807 mutex_enter(&db->db_mtx);
1809 if (db->db_user_ptr == old_user_ptr) {
1810 db->db_user_ptr = user_ptr;
1811 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
1812 db->db_evict_func = evict_func;
1814 dbuf_update_data(db);
1816 old_user_ptr = db->db_user_ptr;
1819 mutex_exit(&db->db_mtx);
1820 return (old_user_ptr);
1824 dmu_buf_get_user(dmu_buf_t *db_fake)
1826 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1827 ASSERT(!refcount_is_zero(&db->db_holds));
1829 return (db->db_user_ptr);
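/*
 * Example registration (an illustrative sketch; my_state_t, its
 * ms_data field, and my_evict_cb are hypothetical, and dbuf is a
 * held dmu_buf_t *):
 *
 *	static void
 *	my_evict_cb(dmu_buf_t *db, void *arg)
 *	{
 *		my_state_t *ms = arg;
 *		kmem_free(ms, sizeof (my_state_t));
 *	}
 *
 *	ms = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
 *	if (dmu_buf_set_user(dbuf, ms, &ms->ms_data, my_evict_cb) != NULL)
 *		kmem_free(ms, sizeof (my_state_t));
 *
 * A non-NULL return means another thread registered a user pointer
 * first, so the loser frees its own state.  The evict callback is
 * invoked from dbuf_evict_user() when the dbuf is evicted.
 */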
1833 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
1835 /* ASSERT(dmu_tx_is_syncing(tx)) */
1836 ASSERT(MUTEX_HELD(&db->db_mtx));
1838 if (db->db_blkptr != NULL)
1841 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
1843 * This buffer was allocated at a time when there were
1844 * no available blkptrs from the dnode, or it was
1845 * inappropriate to hook it in (i.e., nlevels mismatch).
1847 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
1848 ASSERT(db->db_parent == NULL);
1849 db->db_parent = dn->dn_dbuf;
1850 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
1853 dmu_buf_impl_t *parent = db->db_parent;
1854 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1856 ASSERT(dn->dn_phys->dn_nlevels > 1);
1857 if (parent == NULL) {
1858 mutex_exit(&db->db_mtx);
1859 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1860 (void) dbuf_hold_impl(dn, db->db_level+1,
1861 db->db_blkid >> epbs, FALSE, db, &parent);
1862 rw_exit(&dn->dn_struct_rwlock);
1863 mutex_enter(&db->db_mtx);
1864 db->db_parent = parent;
1866 db->db_blkptr = (blkptr_t *)parent->db.db_data +
1867 (db->db_blkid & ((1ULL << epbs) - 1));
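/*
 * Illustrative arithmetic: with 16KB indirect blocks
 * (dn_indblkshift == 14) and 128-byte block pointers
 * (SPA_BLKPTRSHIFT == 7), epbs == 7, so each indirect block maps
 * 128 children; a child with db_blkid == 300 hangs off parent
 * blkid 300 >> 7 == 2, slot 300 & 127 == 44.
 */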
1873 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
1875 dmu_buf_impl_t *db = dr->dr_dbuf;
1876 dnode_t *dn = db->db_dnode;
1879 ASSERT(dmu_tx_is_syncing(tx));
1881 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1883 mutex_enter(&db->db_mtx);
1885 ASSERT(db->db_level > 0);
1888 if (db->db_buf == NULL) {
1889 mutex_exit(&db->db_mtx);
1890 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
1891 mutex_enter(&db->db_mtx);
1893 ASSERT3U(db->db_state, ==, DB_CACHED);
1894 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
1895 ASSERT(db->db_buf != NULL);
1897 dbuf_check_blkptr(dn, db);
1899 db->db_data_pending = dr;
1901 mutex_exit(&db->db_mtx);
1902 dbuf_write(dr, db->db_buf, tx);
1905 mutex_enter(&dr->dt.di.dr_mtx);
1906 dbuf_sync_list(&dr->dt.di.dr_children, tx);
1907 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1908 mutex_exit(&dr->dt.di.dr_mtx);
1913 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
1915 arc_buf_t **datap = &dr->dt.dl.dr_data;
1916 dmu_buf_impl_t *db = dr->dr_dbuf;
1917 dnode_t *dn = db->db_dnode;
1918 objset_impl_t *os = dn->dn_objset;
1919 uint64_t txg = tx->tx_txg;
1922 ASSERT(dmu_tx_is_syncing(tx));
1924 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1926 mutex_enter(&db->db_mtx);
1928 * To be synced, we must be dirtied. But we
1929 * might have been freed after the dirty.
1931 if (db->db_state == DB_UNCACHED) {
1932 /* This buffer has been freed since it was dirtied */
1933 ASSERT(db->db.db_data == NULL);
1934 } else if (db->db_state == DB_FILL) {
1935 /* This buffer was freed and is now being re-filled */
1936 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
1938 ASSERT3U(db->db_state, ==, DB_CACHED);
1943 * If this is a bonus buffer, simply copy the bonus data into the
1944 * dnode. It will be written out when the dnode is synced (and it
1945 * will be synced, since it must have been dirty for dbuf_sync to
1948 if (db->db_blkid == DB_BONUS_BLKID) {
1949 dbuf_dirty_record_t **drp;
1951 ASSERT(*datap != NULL);
1952 ASSERT3U(db->db_level, ==, 0);
1953 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
1954 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
1955 if (*datap != db->db.db_data) {
1956 zio_buf_free(*datap, DN_MAX_BONUSLEN);
1957 arc_space_return(DN_MAX_BONUSLEN);
1959 db->db_data_pending = NULL;
1960 drp = &db->db_last_dirty;
1962 drp = &(*drp)->dr_next;
1963 ASSERT(dr->dr_next == NULL);
1965 if (dr->dr_dbuf->db_level != 0) {
1966 list_destroy(&dr->dt.di.dr_children);
1967 mutex_destroy(&dr->dt.di.dr_mtx);
1969 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1970 ASSERT(db->db_dirtycnt > 0);
1971 db->db_dirtycnt -= 1;
1972 mutex_exit(&db->db_mtx);
1973 dbuf_rele(db, (void *)(uintptr_t)txg);
1978 * This function may have dropped the db_mtx lock, allowing a dmu_sync
1979 * operation to sneak in. As a result, we need to ensure that we
1980 * don't check the dr_override_state until we have returned from
1981 * dbuf_check_blkptr.
1983 dbuf_check_blkptr(dn, db);
1986 * If this buffer is in the middle of an immediate write,
1987 * wait for the synchronous IO to complete.
1989 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
1990 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1991 cv_wait(&db->db_changed, &db->db_mtx);
1992 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
1996 * If this dbuf has already been written out via an immediate write,
1997 * just complete the write by copying over the new block pointer and
1998 * updating the accounting via the write-completion functions.
2000 if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2003 zio_fake.io_private = &db;
2004 zio_fake.io_error = 0;
2005 zio_fake.io_bp = db->db_blkptr;
2006 zio_fake.io_bp_orig = *db->db_blkptr;
2007 zio_fake.io_txg = txg;
2008 zio_fake.io_flags = 0;
2010 *db->db_blkptr = dr->dt.dl.dr_overridden_by;
2011 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2012 db->db_data_pending = dr;
2013 dr->dr_zio = &zio_fake;
2014 mutex_exit(&db->db_mtx);
2016 ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
2017 BP_IDENTITY(&zio_fake.io_bp_orig)) ||
2018 BP_IS_HOLE(zio_fake.io_bp));
2020 if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
2021 (void) dsl_dataset_block_kill(os->os_dsl_dataset,
2022 &zio_fake.io_bp_orig, dn->dn_zio, tx);
2024 dbuf_write_ready(&zio_fake, db->db_buf, db);
2025 dbuf_write_done(&zio_fake, db->db_buf, db);
2030 blksz = arc_buf_size(*datap);
2032 if (dn->dn_object != DMU_META_DNODE_OBJECT) {
2034 * If this buffer is currently "in use" (i.e., there are
2035 * active holds and db_data still references it), then make
2036 * a copy before we start the write so that any modifications
2037 * from the open txg will not leak into this write.
2039 * NOTE: this copy does not need to be made for objects only
2040 * modified in the syncing context (e.g. dnode blocks).
2042 if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
2043 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2044 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2045 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2049 ASSERT(*datap != NULL);
2050 db->db_data_pending = dr;
2052 mutex_exit(&db->db_mtx);
2054 dbuf_write(dr, *datap, tx);
2056 ASSERT(!list_link_active(&dr->dr_dirty_node));
2057 if (dn->dn_object == DMU_META_DNODE_OBJECT)
2058 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2060 zio_nowait(dr->dr_zio);
2064 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2066 dbuf_dirty_record_t *dr;
2068 while (dr = list_head(list)) {
2069 if (dr->dr_zio != NULL) {
2071 * If we find an already initialized zio then we
2072 * are processing the meta-dnode, and we have finished.
2073 * The dbufs for all dnodes are put back on the list
2074 * during processing, so that we can zio_wait()
2075 * these IOs after initiating all child IOs.
2077 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2078 DMU_META_DNODE_OBJECT);
2081 list_remove(list, dr);
2082 if (dr->dr_dbuf->db_level > 0)
2083 dbuf_sync_indirect(dr, tx);
2085 dbuf_sync_leaf(dr, tx);
2090 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2092 dmu_buf_impl_t *db = dr->dr_dbuf;
2093 dnode_t *dn = db->db_dnode;
2094 objset_impl_t *os = dn->dn_objset;
2095 dmu_buf_impl_t *parent = db->db_parent;
2096 uint64_t txg = tx->tx_txg;
2098 writeprops_t wp = { 0 };
2101 if (!BP_IS_HOLE(db->db_blkptr) &&
2102 (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
2104 * Private object buffers are released here rather
2105 * than in dbuf_dirty() since they are only modified
2106 * in the syncing context and we don't want the
2107 * overhead of making multiple copies of the data.
2109 arc_release(data, db);
2111 ASSERT(arc_released(data));
2112 /* XXX why do we need to thaw here? */
2116 if (parent != dn->dn_dbuf) {
2117 ASSERT(parent && parent->db_data_pending);
2118 ASSERT(db->db_level == parent->db_level-1);
2119 ASSERT(arc_released(parent->db_buf));
2120 zio = parent->db_data_pending->dr_zio;
2122 ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
2123 ASSERT3P(db->db_blkptr, ==,
2124 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2128 ASSERT(db->db_level == 0 || data == db->db_buf);
2129 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2132 zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
2133 zb.zb_object = db->db.db_object;
2134 zb.zb_level = db->db_level;
2135 zb.zb_blkid = db->db_blkid;
2137 wp.wp_type = dn->dn_type;
2138 wp.wp_level = db->db_level;
2139 wp.wp_copies = os->os_copies;
2140 wp.wp_dncompress = dn->dn_compress;
2141 wp.wp_oscompress = os->os_compress;
2142 wp.wp_dnchecksum = dn->dn_checksum;
2143 wp.wp_oschecksum = os->os_checksum;
2145 if (BP_IS_OLDER(db->db_blkptr, txg))
2146 (void) dsl_dataset_block_kill(
2147 os->os_dsl_dataset, db->db_blkptr, zio, tx);
2149 dr->dr_zio = arc_write(zio, os->os_spa, &wp,
2150 DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
2151 data, dbuf_write_ready, dbuf_write_done, db,
2152 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2157 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2159 dmu_buf_impl_t *db = vdb;
2160 dnode_t *dn = db->db_dnode;
2161 objset_impl_t *os = dn->dn_objset;
2162 blkptr_t *bp = zio->io_bp;
2163 blkptr_t *bp_orig = &zio->io_bp_orig;
2165 int old_size, new_size, i;
2167 ASSERT(db->db_blkptr == bp);
2169 dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
2171 old_size = bp_get_dasize(os->os_spa, bp_orig);
2172 new_size = bp_get_dasize(os->os_spa, bp);
2174 dnode_diduse_space(dn, new_size - old_size);
2176 if (BP_IS_HOLE(bp)) {
2177 dsl_dataset_t *ds = os->os_dsl_dataset;
2178 dmu_tx_t *tx = os->os_synctx;
2180 if (bp_orig->blk_birth == tx->tx_txg)
2181 (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
2182 ASSERT3U(bp->blk_fill, ==, 0);
2186 ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
2187 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2189 mutex_enter(&db->db_mtx);
2191 if (db->db_level == 0) {
2192 mutex_enter(&dn->dn_mtx);
2193 if (db->db_blkid > dn->dn_phys->dn_maxblkid)
2194 dn->dn_phys->dn_maxblkid = db->db_blkid;
2195 mutex_exit(&dn->dn_mtx);
2197 if (dn->dn_type == DMU_OT_DNODE) {
2198 dnode_phys_t *dnp = db->db.db_data;
2199 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2201 if (dnp->dn_type != DMU_OT_NONE)
2208 blkptr_t *ibp = db->db.db_data;
2209 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2210 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2211 if (BP_IS_HOLE(ibp))
2213 ASSERT3U(BP_GET_LSIZE(ibp), ==,
2214 db->db_level == 1 ? dn->dn_datablksz :
2215 (1<<dn->dn_phys->dn_indblkshift));
2216 fill += ibp->blk_fill;
2220 bp->blk_fill = fill;
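/*
 * Illustrative fill accounting: a level-1 indirect block whose 128
 * slots hold 3 non-hole block pointers (blk_fill == 1 each) gets
 * bp->blk_fill == 3 from the loop above; a level-0 dnode block
 * instead counts its in-use dnodes.
 */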
2222 mutex_exit(&db->db_mtx);
2224 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
2225 ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
2227 dsl_dataset_t *ds = os->os_dsl_dataset;
2228 dmu_tx_t *tx = os->os_synctx;
2230 if (bp_orig->blk_birth == tx->tx_txg)
2231 (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
2232 dsl_dataset_block_born(ds, bp, tx);
2238 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2240 dmu_buf_impl_t *db = vdb;
2241 uint64_t txg = zio->io_txg;
2242 dbuf_dirty_record_t **drp, *dr;
2244 ASSERT3U(zio->io_error, ==, 0);
2246 mutex_enter(&db->db_mtx);
2248 drp = &db->db_last_dirty;
2249 while ((dr = *drp) != db->db_data_pending)
2251 ASSERT(!list_link_active(&dr->dr_dirty_node));
2252 ASSERT(dr->dr_txg == txg);
2253 ASSERT(dr->dr_next == NULL);
2256 if (db->db_level == 0) {
2257 ASSERT(db->db_blkid != DB_BONUS_BLKID);
2258 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2260 if (dr->dt.dl.dr_data != db->db_buf)
2261 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
2262 else if (!BP_IS_HOLE(db->db_blkptr))
2263 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2265 ASSERT(arc_released(db->db_buf));
2267 dnode_t *dn = db->db_dnode;
2269 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2270 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2271 if (!BP_IS_HOLE(db->db_blkptr)) {
2273 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2274 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2276 ASSERT3U(dn->dn_phys->dn_maxblkid
2277 >> (db->db_level * epbs), >=, db->db_blkid);
2278 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2280 mutex_destroy(&dr->dt.di.dr_mtx);
2281 list_destroy(&dr->dt.di.dr_children);
2283 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2285 cv_broadcast(&db->db_changed);
2286 ASSERT(db->db_dirtycnt > 0);
2287 db->db_dirtycnt -= 1;
2288 db->db_data_pending = NULL;
2289 mutex_exit(&db->db_mtx);
2291 dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
2293 dbuf_rele(db, (void *)(uintptr_t)txg);