stand/libsa/zfs/zfsimpl.c

   1 /*-
   2  * Copyright (c) 2007 Doug Rabson
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 /*
  31  *      Stand-alone ZFS file reader.
  32  */
  33
  34 #include <sys/endian.h>
  35 #include <sys/stat.h>
  36 #include <sys/stdint.h>
  37 #include <sys/list.h>
  38 #include <machine/_inttypes.h>
  39
  40 #include "zfsimpl.h"
  41 #include "zfssubr.c"
  42
  43
/*
 * State kept for a mount: the pool, a copy of an objset header and an
 * object number.
 * NOTE(review): presumably `objset' is the mounted dataset and `rootobj'
 * its root directory object; the users of this structure are outside this
 * section -- confirm there.
 */
struct zfsmount {
        const spa_t     *spa;
        objset_phys_t   objset;
        uint64_t        rootobj;
};
/* File-scope instance; marked __unused because not every build refers to it. */
static struct zfsmount zfsmount __unused;
  50
  51 /*
  52  * The indirect_child_t represents the vdev that we will read from, when we
  53  * need to read all copies of the data (e.g. for scrub or reconstruction).
  54  * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
  55  * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
  56  * ic_vdev is a child of the mirror.
  57  */
typedef struct indirect_child {
        /* Released with free() by vdev_indirect_map_free(). */
        void *ic_data;
        /* Filled in by vdev_indirect_gather_splits(). */
        vdev_t *ic_vdev;
} indirect_child_t;
  62
  63 /*
  64  * The indirect_split_t represents one mapped segment of an i/o to the
  65  * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
  66  * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
  67  * For split blocks, there will be several of these.
  68  */
typedef struct indirect_split {
        list_node_t is_node; /* link on iv_splits */

        /*
         * is_split_offset is the offset into the i/o.
         * This is the sum of the previous splits' is_size's.
         */
        uint64_t is_split_offset;

        vdev_t *is_vdev; /* top-level vdev */
        uint64_t is_target_offset; /* offset on is_vdev */
        uint64_t is_size; /* size of this segment */
        int is_children; /* number of entries in is_child[] */

        /*
         * is_good_child is the child that we are currently using to
         * attempt reconstruction.
         */
        int is_good_child;

        /*
         * Declared with a single element, but the structure is allocated
         * with room for is_children elements (see the offsetof() sizing in
         * vdev_indirect_gather_splits()).
         */
        indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;
  91
  92 /*
  93  * The indirect_vsd_t is associated with each i/o to the indirect vdev.
  94  * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
  95  */
typedef struct indirect_vsd {
        /* Set to B_TRUE by vdev_indirect_read() when the i/o is split. */
        boolean_t iv_split_block;
        /* Not written anywhere in this section beyond the initial zeroing. */
        boolean_t iv_reconstruct;

        list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;
 102
/*
 * List of all vdevs, chained through v_alllink.
 *
 * Initialised empty by zfs_init().  vdev_create() appends every vdev that
 * has a read function (the root vdev has none and is not listed), and
 * vdev_find() searches the list by guid.
 */
static vdev_list_t zfs_vdevs;
 107
/*
 * List of ZFS features supported for read
 *
 * NULL-terminated.  nvlist_check_features_for_read() compares every name
 * found in a pool's features-for-read nvlist against this table and
 * reports names that are not listed.
 */
static const char *features_for_read[] = {
        "org.illumos:lz4_compress",
        "com.delphix:hole_birth",
        "com.delphix:extensible_dataset",
        "com.delphix:embedded_data",
        "org.open-zfs:large_blocks",
        "org.illumos:sha512",
        "org.illumos:skein",
        "org.zfsonlinux:large_dnode",
        "com.joyent:multi_vdev_crash_dump",
        "com.delphix:spacemap_histogram",
        "com.delphix:zpool_checkpoint",
        "com.delphix:spacemap_v2",
        "com.datto:encryption",
        "org.zfsonlinux:allocation_classes",
        "com.datto:resilver_defer",
        "com.delphix:device_removal",
        "com.delphix:obsolete_counts",
        "com.intel:allocation_classes",
        NULL    /* end-of-table marker */
};
 132
/*
 * List of all pools, chained through spa_link.
 * Initialised empty by zfs_init().
 */
static spa_list_t zfs_pools;

/*
 * Dnode block cache.  dnode_cache_buf is allocated by zfs_init() with
 * SPA_MAXBLOCKSIZE bytes.
 * NOTE(review): presumably dnode_cache_obj and dnode_cache_bn identify the
 * dnode and the block number currently held in dnode_cache_buf; the code
 * that uses them is outside this section -- confirm there.
 */
static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;
 141
/*
 * Forward declarations for functions that are referenced before they are
 * defined.
 */
static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
/* The two indirect-mapping helpers below are not static. */
vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
    uint64_t);
vdev_indirect_mapping_entry_phys_t *
    vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
    uint64_t, uint64_t *);
 160
 161 static void
 162 zfs_init(void)
 163 {
 164         STAILQ_INIT(&zfs_vdevs);
 165         STAILQ_INIT(&zfs_pools);
 166
 167         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
 168
 169         zfs_init_crc();
 170 }
 171
 172 static int
 173 nvlist_check_features_for_read(nvlist_t *nvl)
 174 {
 175         nvlist_t *features = NULL;
 176         nvs_data_t *data;
 177         nvp_header_t *nvp;
 178         nv_string_t *nvp_name;
 179         int rc;
 180
 181         rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ,
 182             DATA_TYPE_NVLIST, NULL, &features, NULL);
 183         if (rc != 0)
 184                 return (rc);
 185
 186         data = (nvs_data_t *)features->nv_data;
 187         nvp = &data->nvl_pair;  /* first pair in nvlist */
 188
 189         while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
 190                 int i, found;
 191
 192                 nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
 193                 found = 0;
 194
 195                 for (i = 0; features_for_read[i] != NULL; i++) {
 196                         if (memcmp(nvp_name->nv_data, features_for_read[i],
 197                             nvp_name->nv_size) == 0) {
 198                                 found = 1;
 199                                 break;
 200                         }
 201                 }
 202
 203                 if (!found) {
 204                         printf("ZFS: unsupported feature: %.*s\n",
 205                             nvp_name->nv_size, nvp_name->nv_data);
 206                         rc = EIO;
 207                 }
 208                 nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
 209         }
 210         nvlist_destroy(features);
 211
 212         return (rc);
 213 }
 214
 215 static int
 216 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
 217     off_t offset, size_t size)
 218 {
 219         size_t psize;
 220         int rc;
 221
 222         if (!vdev->v_phys_read)
 223                 return (EIO);
 224
 225         if (bp) {
 226                 psize = BP_GET_PSIZE(bp);
 227         } else {
 228                 psize = size;
 229         }
 230
 231         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
 232         if (rc == 0) {
 233                 if (bp != NULL)
 234                         rc = zio_checksum_verify(vdev->v_spa, bp, buf);
 235         }
 236
 237         return (rc);
 238 }
 239
/*
 * One pending unit of work for vdev_indirect_remap(): an extent on an
 * indirect vdev that still has to be translated through that vdev's
 * mapping.
 */
typedef struct remap_segment {
        vdev_t *rs_vd;                  /* vdev whose mapping is consulted */
        uint64_t rs_offset;             /* current offset on rs_vd */
        uint64_t rs_asize;              /* bytes left to remap */
        uint64_t rs_split_offset;       /* matching offset in the i/o */
        list_node_t rs_node;            /* link on the remap work stack */
} remap_segment_t;
 247
 248 static remap_segment_t *
 249 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
 250 {
 251         remap_segment_t *rs = malloc(sizeof (remap_segment_t));
 252
 253         if (rs != NULL) {
 254                 rs->rs_vd = vd;
 255                 rs->rs_offset = offset;
 256                 rs->rs_asize = asize;
 257                 rs->rs_split_offset = split_offset;
 258         }
 259
 260         return (rs);
 261 }
 262
 263 vdev_indirect_mapping_t *
 264 vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
 265     uint64_t mapping_object)
 266 {
 267         vdev_indirect_mapping_t *vim;
 268         vdev_indirect_mapping_phys_t *vim_phys;
 269         int rc;
 270
 271         vim = calloc(1, sizeof (*vim));
 272         if (vim == NULL)
 273                 return (NULL);
 274
 275         vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
 276         if (vim->vim_dn == NULL) {
 277                 free(vim);
 278                 return (NULL);
 279         }
 280
 281         rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
 282         if (rc != 0) {
 283                 free(vim->vim_dn);
 284                 free(vim);
 285                 return (NULL);
 286         }
 287
 288         vim->vim_spa = spa;
 289         vim->vim_phys = malloc(sizeof (*vim->vim_phys));
 290         if (vim->vim_phys == NULL) {
 291                 free(vim->vim_dn);
 292                 free(vim);
 293                 return (NULL);
 294         }
 295
 296         vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
 297         *vim->vim_phys = *vim_phys;
 298
 299         vim->vim_objset = os;
 300         vim->vim_object = mapping_object;
 301         vim->vim_entries = NULL;
 302
 303         vim->vim_havecounts =
 304             (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
 305
 306         return (vim);
 307 }
 308
 309 /*
 310  * Compare an offset with an indirect mapping entry; there are three
 311  * possible scenarios:
 312  *
 313  *     1. The offset is "less than" the mapping entry; meaning the
 314  *        offset is less than the source offset of the mapping entry. In
 315  *        this case, there is no overlap between the offset and the
 316  *        mapping entry and -1 will be returned.
 317  *
 318  *     2. The offset is "greater than" the mapping entry; meaning the
 319  *        offset is greater than the mapping entry's source offset plus
 320  *        the entry's size. In this case, there is no overlap between
 321  *        the offset and the mapping entry and 1 will be returned.
 322  *
 323  *        NOTE: If the offset is actually equal to the entry's offset
 324  *        plus size, this is considered to be "greater" than the entry,
 325  *        and this case applies (i.e. 1 will be returned). Thus, the
 326  *        entry's "range" can be considered to be inclusive at its
 327  *        start, but exclusive at its end: e.g. [src, src + size).
 328  *
 329  *     3. The last case to consider is if the offset actually falls
 330  *        within the mapping entry's range. If this is the case, the
 331  *        offset is considered to be "equal to" the mapping entry and
 332  *        0 will be returned.
 333  *
 334  *        NOTE: If the offset is equal to the entry's source offset,
 335  *        this case applies and 0 will be returned. If the offset is
 336  *        equal to the entry's source plus its size, this case does
 337  *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
 338  *        returned.
 339  */
 340 static int
 341 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
 342 {
 343         const uint64_t *key = v_key;
 344         const vdev_indirect_mapping_entry_phys_t *array_elem =
 345             v_array_elem;
 346         uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
 347
 348         if (*key < src_offset) {
 349                 return (-1);
 350         } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
 351                 return (0);
 352         } else {
 353                 return (1);
 354         }
 355 }
 356
 357 /*
 358  * Return array entry.
 359  */
 360 static vdev_indirect_mapping_entry_phys_t *
 361 vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
 362 {
 363         uint64_t size;
 364         off_t offset = 0;
 365         int rc;
 366
 367         if (vim->vim_phys->vimp_num_entries == 0)
 368                 return (NULL);
 369
 370         if (vim->vim_entries == NULL) {
 371                 uint64_t bsize;
 372
 373                 bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 374                 size = vim->vim_phys->vimp_num_entries *
 375                     sizeof (*vim->vim_entries);
 376                 if (size > bsize) {
 377                         size = bsize / sizeof (*vim->vim_entries);
 378                         size *= sizeof (*vim->vim_entries);
 379                 }
 380                 vim->vim_entries = malloc(size);
 381                 if (vim->vim_entries == NULL)
 382                         return (NULL);
 383                 vim->vim_num_entries = size / sizeof (*vim->vim_entries);
 384                 offset = index * sizeof (*vim->vim_entries);
 385         }
 386
 387         /* We have data in vim_entries */
 388         if (offset == 0) {
 389                 if (index >= vim->vim_entry_offset &&
 390                     index <= vim->vim_entry_offset + vim->vim_num_entries) {
 391                         index -= vim->vim_entry_offset;
 392                         return (&vim->vim_entries[index]);
 393                 }
 394                 offset = index * sizeof (*vim->vim_entries);
 395         }
 396
 397         vim->vim_entry_offset = index;
 398         size = vim->vim_num_entries * sizeof (*vim->vim_entries);
 399         rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
 400             size);
 401         if (rc != 0) {
 402                 /* Read error, invalidate vim_entries. */
 403                 free(vim->vim_entries);
 404                 vim->vim_entries = NULL;
 405                 return (NULL);
 406         }
 407         index -= vim->vim_entry_offset;
 408         return (&vim->vim_entries[index]);
 409 }
 410
 411 /*
 412  * Returns the mapping entry for the given offset.
 413  *
 414  * It's possible that the given offset will not be in the mapping table
 415  * (i.e. no mapping entries contain this offset), in which case, the
 416  * return value value depends on the "next_if_missing" parameter.
 417  *
 418  * If the offset is not found in the table and "next_if_missing" is
 419  * B_FALSE, then NULL will always be returned. The behavior is intended
 420  * to allow consumers to get the entry corresponding to the offset
 421  * parameter, iff the offset overlaps with an entry in the table.
 422  *
 423  * If the offset is not found in the table and "next_if_missing" is
 424  * B_TRUE, then the entry nearest to the given offset will be returned,
 425  * such that the entry's source offset is greater than the offset
 426  * passed in (i.e. the "next" mapping entry in the table is returned, if
 427  * the offset is missing from the table). If there are no entries whose
 428  * source offset is greater than the passed in offset, NULL is returned.
 429  */
 430 static vdev_indirect_mapping_entry_phys_t *
 431 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
 432     uint64_t offset)
 433 {
 434         ASSERT(vim->vim_phys->vimp_num_entries > 0);
 435
 436         vdev_indirect_mapping_entry_phys_t *entry;
 437
 438         uint64_t last = vim->vim_phys->vimp_num_entries - 1;
 439         uint64_t base = 0;
 440
 441         /*
 442          * We don't define these inside of the while loop because we use
 443          * their value in the case that offset isn't in the mapping.
 444          */
 445         uint64_t mid;
 446         int result;
 447
 448         while (last >= base) {
 449                 mid = base + ((last - base) >> 1);
 450
 451                 entry = vdev_indirect_mapping_entry(vim, mid);
 452                 if (entry == NULL)
 453                         break;
 454                 result = dva_mapping_overlap_compare(&offset, entry);
 455
 456                 if (result == 0) {
 457                         break;
 458                 } else if (result < 0) {
 459                         last = mid - 1;
 460                 } else {
 461                         base = mid + 1;
 462                 }
 463         }
 464         return (entry);
 465 }
 466
 467 /*
 468  * Given an indirect vdev and an extent on that vdev, it duplicates the
 469  * physical entries of the indirect mapping that correspond to the extent
 470  * to a new array and returns a pointer to it. In addition, copied_entries
 471  * is populated with the number of mapping entries that were duplicated.
 472  *
 473  * Finally, since we are doing an allocation, it is up to the caller to
 474  * free the array allocated in this function.
 475  */
 476 vdev_indirect_mapping_entry_phys_t *
 477 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
 478     uint64_t asize, uint64_t *copied_entries)
 479 {
 480         vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
 481         vdev_indirect_mapping_t *vim = vd->v_mapping;
 482         uint64_t entries = 0;
 483
 484         vdev_indirect_mapping_entry_phys_t *first_mapping =
 485             vdev_indirect_mapping_entry_for_offset(vim, offset);
 486         ASSERT3P(first_mapping, !=, NULL);
 487
 488         vdev_indirect_mapping_entry_phys_t *m = first_mapping;
 489         while (asize > 0) {
 490                 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
 491                 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
 492                 uint64_t inner_size = MIN(asize, size - inner_offset);
 493
 494                 offset += inner_size;
 495                 asize -= inner_size;
 496                 entries++;
 497                 m++;
 498         }
 499
 500         size_t copy_length = entries * sizeof (*first_mapping);
 501         duplicate_mappings = malloc(copy_length);
 502         if (duplicate_mappings != NULL)
 503                 bcopy(first_mapping, duplicate_mappings, copy_length);
 504         else
 505                 entries = 0;
 506
 507         *copied_entries = entries;
 508
 509         return (duplicate_mappings);
 510 }
 511
 512 static vdev_t *
 513 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 514 {
 515         vdev_t *rvd;
 516         vdev_list_t *vlist;
 517
 518         vlist = &spa->spa_root_vdev->v_children;
 519         STAILQ_FOREACH(rvd, vlist, v_childlink)
 520                 if (rvd->v_id == vdev)
 521                         break;
 522
 523         return (rvd);
 524 }
 525
 526 /*
 527  * This is a callback for vdev_indirect_remap() which allocates an
 528  * indirect_split_t for each split segment and adds it to iv_splits.
 529  */
 530 static void
 531 vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
 532     uint64_t size, void *arg)
 533 {
 534         int n = 1;
 535         zio_t *zio = arg;
 536         indirect_vsd_t *iv = zio->io_vsd;
 537
 538         if (vd->v_read == vdev_indirect_read)
 539                 return;
 540
 541         if (vd->v_read == vdev_mirror_read)
 542                 n = vd->v_nchildren;
 543
 544         indirect_split_t *is =
 545             malloc(offsetof(indirect_split_t, is_child[n]));
 546         if (is == NULL) {
 547                 zio->io_error = ENOMEM;
 548                 return;
 549         }
 550         bzero(is, offsetof(indirect_split_t, is_child[n]));
 551
 552         is->is_children = n;
 553         is->is_size = size;
 554         is->is_split_offset = split_offset;
 555         is->is_target_offset = offset;
 556         is->is_vdev = vd;
 557
 558         /*
 559          * Note that we only consider multiple copies of the data for
 560          * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
 561          * though they use the same ops as mirror, because there's only one
 562          * "good" copy under the replacing/spare.
 563          */
 564         if (vd->v_read == vdev_mirror_read) {
 565                 int i = 0;
 566                 vdev_t *kid;
 567
 568                 STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
 569                         is->is_child[i++].ic_vdev = kid;
 570                 }
 571         } else {
 572                 is->is_child[0].ic_vdev = vd;
 573         }
 574
 575         list_insert_tail(&iv->iv_splits, is);
 576 }
 577
 578 static void
 579 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
 580 {
 581         list_t stack;
 582         spa_t *spa = vd->v_spa;
 583         zio_t *zio = arg;
 584         remap_segment_t *rs;
 585
 586         list_create(&stack, sizeof (remap_segment_t),
 587             offsetof(remap_segment_t, rs_node));
 588
 589         rs = rs_alloc(vd, offset, asize, 0);
 590         if (rs == NULL) {
 591                 printf("vdev_indirect_remap: out of memory.\n");
 592                 zio->io_error = ENOMEM;
 593         }
 594         for (; rs != NULL; rs = list_remove_head(&stack)) {
 595                 vdev_t *v = rs->rs_vd;
 596                 uint64_t num_entries = 0;
 597                 /* vdev_indirect_mapping_t *vim = v->v_mapping; */
 598                 vdev_indirect_mapping_entry_phys_t *mapping =
 599                     vdev_indirect_mapping_duplicate_adjacent_entries(v,
 600                     rs->rs_offset, rs->rs_asize, &num_entries);
 601
 602                 if (num_entries == 0)
 603                         zio->io_error = ENOMEM;
 604
 605                 for (uint64_t i = 0; i < num_entries; i++) {
 606                         vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
 607                         uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
 608                         uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
 609                         uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
 610                         uint64_t inner_offset = rs->rs_offset -
 611                             DVA_MAPPING_GET_SRC_OFFSET(m);
 612                         uint64_t inner_size =
 613                             MIN(rs->rs_asize, size - inner_offset);
 614                         vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
 615
 616                         if (dst_v->v_read == vdev_indirect_read) {
 617                                 remap_segment_t *o;
 618
 619                                 o = rs_alloc(dst_v, dst_offset + inner_offset,
 620                                     inner_size, rs->rs_split_offset);
 621                                 if (o == NULL) {
 622                                         printf("vdev_indirect_remap: "
 623                                             "out of memory.\n");
 624                                         zio->io_error = ENOMEM;
 625                                         break;
 626                                 }
 627
 628                                 list_insert_head(&stack, o);
 629                         }
 630                         vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
 631                             dst_offset + inner_offset,
 632                             inner_size, arg);
 633
 634                         /*
 635                          * vdev_indirect_gather_splits can have memory
 636                          * allocation error, we can not recover from it.
 637                          */
 638                         if (zio->io_error != 0)
 639                                 break;
 640                         rs->rs_offset += inner_size;
 641                         rs->rs_asize -= inner_size;
 642                         rs->rs_split_offset += inner_size;
 643                 }
 644
 645                 free(mapping);
 646                 free(rs);
 647                 if (zio->io_error != 0)
 648                         break;
 649         }
 650
 651         list_destroy(&stack);
 652 }
 653
 654 static void
 655 vdev_indirect_map_free(zio_t *zio)
 656 {
 657         indirect_vsd_t *iv = zio->io_vsd;
 658         indirect_split_t *is;
 659
 660         while ((is = list_head(&iv->iv_splits)) != NULL) {
 661                 for (int c = 0; c < is->is_children; c++) {
 662                         indirect_child_t *ic = &is->is_child[c];
 663                         free(ic->ic_data);
 664                 }
 665                 list_remove(&iv->iv_splits, is);
 666                 free(is);
 667         }
 668         free(iv);
 669 }
 670
 671 static int
 672 vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
 673     off_t offset, size_t bytes)
 674 {
 675         zio_t zio;
 676         spa_t *spa = vdev->v_spa;
 677         indirect_vsd_t *iv;
 678         indirect_split_t *first;
 679         int rc = EIO;
 680
 681         iv = calloc(1, sizeof(*iv));
 682         if (iv == NULL)
 683                 return (ENOMEM);
 684
 685         list_create(&iv->iv_splits,
 686             sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
 687
 688         bzero(&zio, sizeof(zio));
 689         zio.io_spa = spa;
 690         zio.io_bp = (blkptr_t *)bp;
 691         zio.io_data = buf;
 692         zio.io_size = bytes;
 693         zio.io_offset = offset;
 694         zio.io_vd = vdev;
 695         zio.io_vsd = iv;
 696
 697         if (vdev->v_mapping == NULL) {
 698                 vdev_indirect_config_t *vic;
 699
 700                 vic = &vdev->vdev_indirect_config;
 701                 vdev->v_mapping = vdev_indirect_mapping_open(spa,
 702                     &spa->spa_mos, vic->vic_mapping_object);
 703         }
 704
 705         vdev_indirect_remap(vdev, offset, bytes, &zio);
 706         if (zio.io_error != 0)
 707                 return (zio.io_error);
 708
 709         first = list_head(&iv->iv_splits);
 710         if (first->is_size == zio.io_size) {
 711                 /*
 712                  * This is not a split block; we are pointing to the entire
 713                  * data, which will checksum the same as the original data.
 714                  * Pass the BP down so that the child i/o can verify the
 715                  * checksum, and try a different location if available
 716                  * (e.g. on a mirror).
 717                  *
 718                  * While this special case could be handled the same as the
 719                  * general (split block) case, doing it this way ensures
 720                  * that the vast majority of blocks on indirect vdevs
 721                  * (which are not split) are handled identically to blocks
 722                  * on non-indirect vdevs.  This allows us to be less strict
 723                  * about performance in the general (but rare) case.
 724                  */
 725                 rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
 726                     zio.io_data, first->is_target_offset, bytes);
 727         } else {
 728                 iv->iv_split_block = B_TRUE;
 729                 /*
 730                  * Read one copy of each split segment, from the
 731                  * top-level vdev.  Since we don't know the
 732                  * checksum of each split individually, the child
 733                  * zio can't ensure that we get the right data.
 734                  * E.g. if it's a mirror, it will just read from a
 735                  * random (healthy) leaf vdev.  We have to verify
 736                  * the checksum in vdev_indirect_io_done().
 737                  */
 738                 for (indirect_split_t *is = list_head(&iv->iv_splits);
 739                     is != NULL; is = list_next(&iv->iv_splits, is)) {
 740                         char *ptr = zio.io_data;
 741
 742                         rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
 743                             ptr + is->is_split_offset, is->is_target_offset,
 744                             is->is_size);
 745                 }
 746                 if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
 747                         rc = ECKSUM;
 748                 else
 749                         rc = 0;
 750         }
 751
 752         vdev_indirect_map_free(&zio);
 753         if (rc == 0)
 754                 rc = zio.io_error;
 755
 756         return (rc);
 757 }
 758
 759 static int
 760 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
 761     off_t offset, size_t bytes)
 762 {
 763
 764         return (vdev_read_phys(vdev, bp, buf,
 765             offset + VDEV_LABEL_START_SIZE, bytes));
 766 }
 767
/*
 * Read routine that never reads: all arguments are ignored and ENOTSUP is
 * returned.
 * NOTE(review): presumably installed for vdevs that are known to be
 * absent; the place where it is assigned is outside this section.
 */
static int
vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused,
    void *buf __unused, off_t offset __unused, size_t bytes __unused)
{

        return (ENOTSUP);
}
 775
 776 static int
 777 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
 778     off_t offset, size_t bytes)
 779 {
 780         vdev_t *kid;
 781         int rc;
 782
 783         rc = EIO;
 784         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
 785                 if (kid->v_state != VDEV_STATE_HEALTHY)
 786                         continue;
 787                 rc = kid->v_read(kid, bp, buf, offset, bytes);
 788                 if (!rc)
 789                         return (0);
 790         }
 791
 792         return (rc);
 793 }
 794
 795 static int
 796 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
 797     off_t offset, size_t bytes)
 798 {
 799         vdev_t *kid;
 800
 801         /*
 802          * Here we should have two kids:
 803          * First one which is the one we are replacing and we can trust
 804          * only this one to have valid data, but it might not be present.
 805          * Second one is that one we are replacing with. It is most likely
 806          * healthy, but we can't trust it has needed data, so we won't use it.
 807          */
 808         kid = STAILQ_FIRST(&vdev->v_children);
 809         if (kid == NULL)
 810                 return (EIO);
 811         if (kid->v_state != VDEV_STATE_HEALTHY)
 812                 return (EIO);
 813         return (kid->v_read(kid, bp, buf, offset, bytes));
 814 }
 815
 816 static vdev_t *
 817 vdev_find(uint64_t guid)
 818 {
 819         vdev_t *vdev;
 820
 821         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
 822                 if (vdev->v_guid == guid)
 823                         return (vdev);
 824
 825         return (0);
 826 }
 827
 828 static vdev_t *
 829 vdev_create(uint64_t guid, vdev_read_t *_read)
 830 {
 831         vdev_t *vdev;
 832         vdev_indirect_config_t *vic;
 833
 834         vdev = calloc(1, sizeof(vdev_t));
 835         if (vdev != NULL) {
 836                 STAILQ_INIT(&vdev->v_children);
 837                 vdev->v_guid = guid;
 838                 vdev->v_read = _read;
 839
 840                 /*
 841                  * root vdev has no read function, we use this fact to
 842                  * skip setting up data we do not need for root vdev.
 843                  * We only point root vdev from spa.
 844                  */
 845                 if (_read != NULL) {
 846                         vic = &vdev->vdev_indirect_config;
 847                         vic->vic_prev_indirect_vdev = UINT64_MAX;
 848                         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
 849                 }
 850         }
 851
 852         return (vdev);
 853 }
 854
 855 static void
 856 vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist)
 857 {
 858         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
 859         uint64_t is_log;
 860
 861         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
 862         is_log = 0;
 863         (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
 864             &is_offline, NULL);
 865         (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
 866             &is_removed, NULL);
 867         (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
 868             &is_faulted, NULL);
 869         (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
 870             NULL, &is_degraded, NULL);
 871         (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
 872             NULL, &isnt_present, NULL);
 873         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
 874             &is_log, NULL);
 875
 876         if (is_offline != 0)
 877                 vdev->v_state = VDEV_STATE_OFFLINE;
 878         else if (is_removed != 0)
 879                 vdev->v_state = VDEV_STATE_REMOVED;
 880         else if (is_faulted != 0)
 881                 vdev->v_state = VDEV_STATE_FAULTED;
 882         else if (is_degraded != 0)
 883                 vdev->v_state = VDEV_STATE_DEGRADED;
 884         else if (isnt_present != 0)
 885                 vdev->v_state = VDEV_STATE_CANT_OPEN;
 886
 887         vdev->v_islog = is_log != 0;
 888 }
 889
 890 static int
 891 vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
 892 {
 893         uint64_t id, ashift, asize, nparity;
 894         const char *path;
 895         const char *type;
 896         int len, pathlen;
 897         char *name;
 898         vdev_t *vdev;
 899
 900         if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id,
 901             NULL) ||
 902             nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL,
 903             &type, &len)) {
 904                 return (ENOENT);
 905         }
 906
 907         if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
 908             memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
 909 #ifdef ZFS_TEST
 910             memcmp(type, VDEV_TYPE_FILE, len) != 0 &&
 911 #endif
 912             memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 &&
 913             memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 &&
 914             memcmp(type, VDEV_TYPE_REPLACING, len) != 0 &&
 915             memcmp(type, VDEV_TYPE_HOLE, len) != 0) {
 916                 printf("ZFS: can only boot from disk, mirror, raidz1, "
 917                     "raidz2 and raidz3 vdevs, got: %.*s\n", len, type);
 918                 return (EIO);
 919         }
 920
 921         if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0)
 922                 vdev = vdev_create(guid, vdev_mirror_read);
 923         else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0)
 924                 vdev = vdev_create(guid, vdev_raidz_read);
 925         else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0)
 926                 vdev = vdev_create(guid, vdev_replacing_read);
 927         else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) {
 928                 vdev_indirect_config_t *vic;
 929
 930                 vdev = vdev_create(guid, vdev_indirect_read);
 931                 if (vdev != NULL) {
 932                         vdev->v_state = VDEV_STATE_HEALTHY;
 933                         vic = &vdev->vdev_indirect_config;
 934
 935                         nvlist_find(nvlist,
 936                             ZPOOL_CONFIG_INDIRECT_OBJECT,
 937                             DATA_TYPE_UINT64,
 938                             NULL, &vic->vic_mapping_object, NULL);
 939                         nvlist_find(nvlist,
 940                             ZPOOL_CONFIG_INDIRECT_BIRTHS,
 941                             DATA_TYPE_UINT64,
 942                             NULL, &vic->vic_births_object, NULL);
 943                         nvlist_find(nvlist,
 944                             ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 945                             DATA_TYPE_UINT64,
 946                             NULL, &vic->vic_prev_indirect_vdev, NULL);
 947                 }
 948         } else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) {
 949                 vdev = vdev_create(guid, vdev_missing_read);
 950         } else {
 951                 vdev = vdev_create(guid, vdev_disk_read);
 952         }
 953
 954         if (vdev == NULL)
 955                 return (ENOMEM);
 956
 957         vdev_set_initial_state(vdev, nvlist);
 958         vdev->v_id = id;
 959         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
 960             DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0)
 961                 vdev->v_ashift = ashift;
 962
 963         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
 964             DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
 965                 vdev->v_psize = asize +
 966                     VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 967         }
 968
 969         if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
 970             DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0)
 971                 vdev->v_nparity = nparity;
 972
 973         if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
 974             DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) {
 975                 char prefix[] = "/dev/";
 976
 977                 len = strlen(prefix);
 978                 if (len < pathlen && memcmp(path, prefix, len) == 0) {
 979                         path += len;
 980                         pathlen -= len;
 981                 }
 982                 name = malloc(pathlen + 1);
 983                 bcopy(path, name, pathlen);
 984                 name[pathlen] = '\0';
 985                 vdev->v_name = name;
 986         } else {
 987                 name = NULL;
 988                 if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
 989                         if (vdev->v_nparity < 1 ||
 990                             vdev->v_nparity > 3) {
 991                                 printf("ZFS: invalid raidz parity: %d\n",
 992                                     vdev->v_nparity);
 993                                 return (EIO);
 994                         }
 995                         (void) asprintf(&name, "%.*s%d-%" PRIu64, len, type,
 996                             vdev->v_nparity, id);
 997                 } else {
 998                         (void) asprintf(&name, "%.*s-%" PRIu64, len, type, id);
 999                 }
1000                 vdev->v_name = name;
1001         }
1002         *vdevp = vdev;
1003         return (0);
1004 }
1005
1006 /*
1007  * Find slot for vdev. We return either NULL to signal to use
1008  * STAILQ_INSERT_HEAD, or we return link element to be used with
1009  * STAILQ_INSERT_AFTER.
1010  */
1011 static vdev_t *
1012 vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
1013 {
1014         vdev_t *v, *previous;
1015
1016         if (STAILQ_EMPTY(&top_vdev->v_children))
1017                 return (NULL);
1018
1019         previous = NULL;
1020         STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
1021                 if (v->v_id > vdev->v_id)
1022                         return (previous);
1023
1024                 if (v->v_id == vdev->v_id)
1025                         return (v);
1026
1027                 if (v->v_id < vdev->v_id)
1028                         previous = v;
1029         }
1030         return (previous);
1031 }
1032
1033 static size_t
1034 vdev_child_count(vdev_t *vdev)
1035 {
1036         vdev_t *v;
1037         size_t count;
1038
1039         count = 0;
1040         STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
1041                 count++;
1042         }
1043         return (count);
1044 }
1045
1046 /*
1047  * Insert vdev into top_vdev children list. List is ordered by v_id.
1048  */
1049 static void
1050 vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
1051 {
1052         vdev_t *previous;
1053         size_t count;
1054
1055         /*
1056          * The top level vdev can appear in random order, depending how
1057          * the firmware is presenting the disk devices.
1058          * However, we will insert vdev to create list ordered by v_id,
1059          * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
1060          * as STAILQ does not have insert before.
1061          */
1062         previous = vdev_find_previous(top_vdev, vdev);
1063
1064         if (previous == NULL) {
1065                 STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
1066         } else if (previous->v_id == vdev->v_id) {
1067                 /*
1068                  * This vdev was configured from label config,
1069                  * do not insert duplicate.
1070                  */
1071                 return;
1072         } else {
1073                 STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
1074                     v_childlink);
1075         }
1076
1077         count = vdev_child_count(top_vdev);
1078         if (top_vdev->v_nchildren < count)
1079                 top_vdev->v_nchildren = count;
1080 }
1081
1082 static int
1083 vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
1084 {
1085         vdev_t *top_vdev, *vdev;
1086         nvlist_t *kids = NULL;
1087         int rc, nkids;
1088
1089         /* Get top vdev. */
1090         top_vdev = vdev_find(top_guid);
1091         if (top_vdev == NULL) {
1092                 rc = vdev_init(top_guid, nvlist, &top_vdev);
1093                 if (rc != 0)
1094                         return (rc);
1095                 top_vdev->v_spa = spa;
1096                 top_vdev->v_top = top_vdev;
1097                 vdev_insert(spa->spa_root_vdev, top_vdev);
1098         }
1099
1100         /* Add children if there are any. */
1101         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1102             &nkids, &kids, NULL);
1103         if (rc == 0) {
1104                 for (int i = 0; i < nkids; i++) {
1105                         uint64_t guid;
1106
1107                         rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
1108                             DATA_TYPE_UINT64, NULL, &guid, NULL);
1109                         if (rc != 0) {
1110                                 nvlist_destroy(kids);
1111                                 return (rc);
1112                         }
1113                         rc = vdev_init(guid, kids, &vdev);
1114                         if (rc != 0)
1115                                 return (rc);
1116
1117                         vdev->v_spa = spa;
1118                         vdev->v_top = top_vdev;
1119                         vdev_insert(top_vdev, vdev);
1120
1121                         rc = nvlist_next(kids);
1122                 }
1123         } else {
1124                 /*
1125                  * When there are no children, nvlist_find() does return
1126                  * error, reset it because leaf devices have no children.
1127                  */
1128                 rc = 0;
1129         }
1130         nvlist_destroy(kids);
1131
1132         return (rc);
1133 }
1134
1135 static int
1136 vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
1137 {
1138         uint64_t pool_guid, top_guid;
1139         nvlist_t *vdevs;
1140         int rc;
1141
1142         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1143             NULL, &pool_guid, NULL) ||
1144             nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
1145             NULL, &top_guid, NULL) ||
1146             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1147             NULL, &vdevs, NULL)) {
1148                 printf("ZFS: can't find vdev details\n");
1149                 return (ENOENT);
1150         }
1151
1152         rc = vdev_from_nvlist(spa, top_guid, vdevs);
1153         nvlist_destroy(vdevs);
1154         return (rc);
1155 }
1156
1157 static void
1158 vdev_set_state(vdev_t *vdev)
1159 {
1160         vdev_t *kid;
1161         int good_kids;
1162         int bad_kids;
1163
1164         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1165                 vdev_set_state(kid);
1166         }
1167
1168         /*
1169          * A mirror or raidz is healthy if all its kids are healthy. A
1170          * mirror is degraded if any of its kids is healthy; a raidz
1171          * is degraded if at most nparity kids are offline.
1172          */
1173         if (STAILQ_FIRST(&vdev->v_children)) {
1174                 good_kids = 0;
1175                 bad_kids = 0;
1176                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1177                         if (kid->v_state == VDEV_STATE_HEALTHY)
1178                                 good_kids++;
1179                         else
1180                                 bad_kids++;
1181                 }
1182                 if (bad_kids == 0) {
1183                         vdev->v_state = VDEV_STATE_HEALTHY;
1184                 } else {
1185                         if (vdev->v_read == vdev_mirror_read) {
1186                                 if (good_kids) {
1187                                         vdev->v_state = VDEV_STATE_DEGRADED;
1188                                 } else {
1189                                         vdev->v_state = VDEV_STATE_OFFLINE;
1190                                 }
1191                         } else if (vdev->v_read == vdev_raidz_read) {
1192                                 if (bad_kids > vdev->v_nparity) {
1193                                         vdev->v_state = VDEV_STATE_OFFLINE;
1194                                 } else {
1195                                         vdev->v_state = VDEV_STATE_DEGRADED;
1196                                 }
1197                         }
1198                 }
1199         }
1200 }
1201
1202 static int
1203 vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
1204 {
1205         vdev_t *vdev;
1206         nvlist_t *kids = NULL;
1207         int rc, nkids;
1208
1209         /* Update top vdev. */
1210         vdev = vdev_find(top_guid);
1211         if (vdev != NULL)
1212                 vdev_set_initial_state(vdev, nvlist);
1213
1214         /* Update children if there are any. */
1215         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1216             &nkids, &kids, NULL);
1217         if (rc == 0) {
1218                 for (int i = 0; i < nkids; i++) {
1219                         uint64_t guid;
1220
1221                         rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
1222                             DATA_TYPE_UINT64, NULL, &guid, NULL);
1223                         if (rc != 0)
1224                                 break;
1225
1226                         vdev = vdev_find(guid);
1227                         if (vdev != NULL)
1228                                 vdev_set_initial_state(vdev, kids);
1229
1230                         rc = nvlist_next(kids);
1231                 }
1232         } else {
1233                 rc = 0;
1234         }
1235         nvlist_destroy(kids);
1236
1237         return (rc);
1238 }
1239
1240 static int
1241 vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
1242 {
1243         uint64_t pool_guid, vdev_children;
1244         nvlist_t *vdevs = NULL, *kids = NULL;
1245         int rc, nkids;
1246
1247         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1248             NULL, &pool_guid, NULL) ||
1249             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
1250             NULL, &vdev_children, NULL) ||
1251             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1252             NULL, &vdevs, NULL)) {
1253                 printf("ZFS: can't find vdev details\n");
1254                 return (ENOENT);
1255         }
1256
1257         /* Wrong guid?! */
1258         if (spa->spa_guid != pool_guid) {
1259                 nvlist_destroy(vdevs);
1260                 return (EINVAL);
1261         }
1262
1263         spa->spa_root_vdev->v_nchildren = vdev_children;
1264
1265         rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1266             &nkids, &kids, NULL);
1267         nvlist_destroy(vdevs);
1268
1269         /*
1270          * MOS config has at least one child for root vdev.
1271          */
1272         if (rc != 0)
1273                 return (rc);
1274
1275         for (int i = 0; i < nkids; i++) {
1276                 uint64_t guid;
1277                 vdev_t *vdev;
1278
1279                 rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1280                     NULL, &guid, NULL);
1281                 if (rc != 0)
1282                         break;
1283                 vdev = vdev_find(guid);
1284                 /*
1285                  * Top level vdev is missing, create it.
1286                  */
1287                 if (vdev == NULL)
1288                         rc = vdev_from_nvlist(spa, guid, kids);
1289                 else
1290                         rc = vdev_update_from_nvlist(guid, kids);
1291                 if (rc != 0)
1292                         break;
1293                 nvlist_next(kids);
1294         }
1295         nvlist_destroy(kids);
1296
1297         /*
1298          * Re-evaluate top-level vdev state.
1299          */
1300         vdev_set_state(spa->spa_root_vdev);
1301
1302         return (rc);
1303 }
1304
1305 static spa_t *
1306 spa_find_by_guid(uint64_t guid)
1307 {
1308         spa_t *spa;
1309
1310         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
1311                 if (spa->spa_guid == guid)
1312                         return (spa);
1313
1314         return (NULL);
1315 }
1316
1317 static spa_t *
1318 spa_find_by_name(const char *name)
1319 {
1320         spa_t *spa;
1321
1322         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
1323                 if (strcmp(spa->spa_name, name) == 0)
1324                         return (spa);
1325
1326         return (NULL);
1327 }
1328
#ifdef BOOT2
/*
 * The primary pool is the first entry of zfs_pools (NULL if empty).
 */
static spa_t *
spa_get_primary(void)
{

        return (STAILQ_FIRST(&zfs_pools));
}

/*
 * Return the vdev reached by following the first child from the pool's
 * root vdev until a vdev without children is found.  A NULL spa selects
 * the primary pool.  Returns NULL when there is no pool or no root vdev.
 */
static vdev_t *
spa_get_primary_vdev(const spa_t *spa)
{
        vdev_t *leaf, *child;

        if (spa == NULL && (spa = spa_get_primary()) == NULL)
                return (NULL);

        leaf = spa->spa_root_vdev;
        if (leaf == NULL)
                return (NULL);

        while ((child = STAILQ_FIRST(&leaf->v_children)) != NULL)
                leaf = child;

        return (leaf);
}
#endif
1356
1357 static spa_t *
1358 spa_create(uint64_t guid, const char *name)
1359 {
1360         spa_t *spa;
1361
1362         if ((spa = calloc(1, sizeof(spa_t))) == NULL)
1363                 return (NULL);
1364         if ((spa->spa_name = strdup(name)) == NULL) {
1365                 free(spa);
1366                 return (NULL);
1367         }
1368         spa->spa_guid = guid;
1369         spa->spa_root_vdev = vdev_create(guid, NULL);
1370         if (spa->spa_root_vdev == NULL) {
1371                 free(spa->spa_name);
1372                 free(spa);
1373                 return (NULL);
1374         }
1375         spa->spa_root_vdev->v_name = strdup("root");
1376         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
1377
1378         return (spa);
1379 }
1380
1381 static const char *
1382 state_name(vdev_state_t state)
1383 {
1384         static const char *names[] = {
1385                 "UNKNOWN",
1386                 "CLOSED",
1387                 "OFFLINE",
1388                 "REMOVED",
1389                 "CANT_OPEN",
1390                 "FAULTED",
1391                 "DEGRADED",
1392                 "ONLINE"
1393         };
1394         return (names[state]);
1395 }
1396
1397 #ifdef BOOT2
1398
1399 #define pager_printf printf
1400
1401 #else
1402
/*
 * printf-style front end for pager_output(): the message is formatted
 * into an 80 byte line buffer (longer output is cut off by vsnprintf)
 * and handed to the pager.  Returns whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
        char text[80];
        va_list ap;

        va_start(ap, fmt);
        vsnprintf(text, sizeof(text), fmt, ap);
        va_end(ap);

        return (pager_output(text));
}
1414
1415 #endif
1416
1417 #define STATUS_FORMAT   "        %s %s\n"
1418
1419 static int
1420 print_state(int indent, const char *name, vdev_state_t state)
1421 {
1422         int i;
1423         char buf[512];
1424
1425         buf[0] = 0;
1426         for (i = 0; i < indent; i++)
1427                 strcat(buf, "  ");
1428         strcat(buf, name);
1429         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
1430 }
1431
1432 static int
1433 vdev_status(vdev_t *vdev, int indent)
1434 {
1435         vdev_t *kid;
1436         int ret;
1437
1438         if (vdev->v_islog) {
1439                 (void) pager_output("        logs\n");
1440                 indent++;
1441         }
1442
1443         ret = print_state(indent, vdev->v_name, vdev->v_state);
1444         if (ret != 0)
1445                 return (ret);
1446
1447         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1448                 ret = vdev_status(kid, indent + 1);
1449                 if (ret != 0)
1450                         return (ret);
1451         }
1452         return (ret);
1453 }
1454
1455 static int
1456 spa_status(spa_t *spa)
1457 {
1458         static char bootfs[ZFS_MAXNAMELEN];
1459         uint64_t rootid;
1460         vdev_list_t *vlist;
1461         vdev_t *vdev;
1462         int good_kids, bad_kids, degraded_kids, ret;
1463         vdev_state_t state;
1464
1465         ret = pager_printf("  pool: %s\n", spa->spa_name);
1466         if (ret != 0)
1467                 return (ret);
1468
1469         if (zfs_get_root(spa, &rootid) == 0 &&
1470             zfs_rlookup(spa, rootid, bootfs) == 0) {
1471                 if (bootfs[0] == '\0')
1472                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
1473                 else
1474                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
1475                             bootfs);
1476                 if (ret != 0)
1477                         return (ret);
1478         }
1479         ret = pager_printf("config:\n\n");
1480         if (ret != 0)
1481                 return (ret);
1482         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
1483         if (ret != 0)
1484                 return (ret);
1485
1486         good_kids = 0;
1487         degraded_kids = 0;
1488         bad_kids = 0;
1489         vlist = &spa->spa_root_vdev->v_children;
1490         STAILQ_FOREACH(vdev, vlist, v_childlink) {
1491                 if (vdev->v_state == VDEV_STATE_HEALTHY)
1492                         good_kids++;
1493                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
1494                         degraded_kids++;
1495                 else
1496                         bad_kids++;
1497         }
1498
1499         state = VDEV_STATE_CLOSED;
1500         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
1501                 state = VDEV_STATE_HEALTHY;
1502         else if ((good_kids + degraded_kids) > 0)
1503                 state = VDEV_STATE_DEGRADED;
1504
1505         ret = print_state(0, spa->spa_name, state);
1506         if (ret != 0)
1507                 return (ret);
1508
1509         STAILQ_FOREACH(vdev, vlist, v_childlink) {
1510                 ret = vdev_status(vdev, 1);
1511                 if (ret != 0)
1512                         return (ret);
1513         }
1514         return (ret);
1515 }
1516
1517 static int
1518 spa_all_status(void)
1519 {
1520         spa_t *spa;
1521         int first = 1, ret = 0;
1522
1523         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1524                 if (!first) {
1525                         ret = pager_printf("\n");
1526                         if (ret != 0)
1527                                 return (ret);
1528                 }
1529                 first = 0;
1530                 ret = spa_status(spa);
1531                 if (ret != 0)
1532                         return (ret);
1533         }
1534         return (ret);
1535 }
1536
1537 static uint64_t
1538 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
1539 {
1540         uint64_t label_offset;
1541
1542         if (l < VDEV_LABELS / 2)
1543                 label_offset = 0;
1544         else
1545                 label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
1546
1547         return (offset + l * sizeof (vdev_label_t) + label_offset);
1548 }
1549
1550 static int
1551 vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
1552 {
1553         unsigned int seq1 = 0;
1554         unsigned int seq2 = 0;
1555         int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
1556
1557         if (cmp != 0)
1558                 return (cmp);
1559
1560         cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
1561         if (cmp != 0)
1562                 return (cmp);
1563
1564         if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
1565                 seq1 = MMP_SEQ(ub1);
1566
1567         if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
1568                 seq2 = MMP_SEQ(ub2);
1569
1570         return (AVL_CMP(seq1, seq2));
1571 }
1572
1573 static int
1574 uberblock_verify(uberblock_t *ub)
1575 {
1576         if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
1577                 byteswap_uint64_array(ub, sizeof (uberblock_t));
1578         }
1579
1580         if (ub->ub_magic != UBERBLOCK_MAGIC ||
1581             !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
1582                 return (EINVAL);
1583
1584         return (0);
1585 }
1586
1587 static int
1588 vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
1589     size_t size)
1590 {
1591         blkptr_t bp;
1592         off_t off;
1593
1594         off = vdev_label_offset(vd->v_psize, l, offset);
1595
1596         BP_ZERO(&bp);
1597         BP_SET_LSIZE(&bp, size);
1598         BP_SET_PSIZE(&bp, size);
1599         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1600         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1601         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
1602         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1603
1604         return (vdev_read_phys(vd, &bp, buf, off, size));
1605 }
1606
/*
 * Read the config nvlist from the labels of vd and return the best one.
 *
 * Every label is tried in turn; labels that cannot be read or imported
 * are skipped.  Among the remaining configs the one with the highest
 * pool txg that does not exceed txg is kept.  A config without a pool
 * txg, or with a pool txg of 0, ends the search and is returned as is.
 * As a side effect, vd->v_psize is recomputed from the asize of a config
 * whenever that config becomes the best candidate.
 *
 * Returns NULL when no usable config was found or the label buffer could
 * not be allocated.  The returned nvlist is handed over to the caller.
 */
static nvlist_t *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
        vdev_phys_t *label;
        uint64_t best_txg = 0;
        uint64_t label_txg = 0;
        uint64_t asize;
        nvlist_t *nvl = NULL, *tmp;
        int error;

        label = malloc(sizeof (vdev_phys_t));
        if (label == NULL)
                return (NULL);

        for (int l = 0; l < VDEV_LABELS; l++) {
                const unsigned char *nvlist;

                if (vdev_label_read(vd, l, label,
                    offsetof(vdev_label_t, vl_vdev_phys),
                    sizeof (vdev_phys_t)))
                        continue;

                /*
                 * The first two bytes are passed to nvlist_import() as
                 * parameters and the payload starts at byte 4.
                 * NOTE(review): presumably a 4 byte header holding
                 * encoding and endianness - confirm against
                 * nvlist_import().
                 */
                nvlist = (const unsigned char *) label->vp_nvlist;
                tmp = nvlist_import(nvlist + 4, nvlist[0], nvlist[1]);
                if (tmp == NULL)
                        continue;

                error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG,
                    DATA_TYPE_UINT64, NULL, &label_txg, NULL);
                if (error != 0 || label_txg == 0) {
                        /*
                         * No usable txg: drop any earlier candidate and
                         * return this config right away.
                         */
                        nvlist_destroy(nvl);
                        nvl = tmp;
                        goto done;
                }

                if (label_txg <= txg && label_txg > best_txg) {
                        /* New best candidate; nvl takes over tmp. */
                        best_txg = label_txg;
                        nvlist_destroy(nvl);
                        nvl = tmp;
                        tmp = NULL;

                        /*
                         * Use asize from pool config. We need this
                         * because we can get bad value from BIOS.
                         */
                        if (nvlist_find(nvl, ZPOOL_CONFIG_ASIZE,
                            DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
                                vd->v_psize = asize +
                                    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
                        }
                }
                /* tmp is NULL here when it was adopted as nvl above. */
                nvlist_destroy(tmp);
        }

        /* No label carried an acceptable txg. */
        if (best_txg == 0) {
                nvlist_destroy(nvl);
                nvl = NULL;
        }
done:
        free(label);
        return (nvl);
}
1669
1670 static void
1671 vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
1672 {
1673         uberblock_t *buf;
1674
1675         buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
1676         if (buf == NULL)
1677                 return;
1678
1679         for (int l = 0; l < VDEV_LABELS; l++) {
1680                 for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
1681                         if (vdev_label_read(vd, l, buf,
1682                             VDEV_UBERBLOCK_OFFSET(vd, n),
1683                             VDEV_UBERBLOCK_SIZE(vd)))
1684                                 continue;
1685                         if (uberblock_verify(buf) != 0)
1686                                 continue;
1687
1688                         if (vdev_uberblock_compare(buf, ub) > 0)
1689                                 *ub = *buf;
1690                 }
1691         }
1692         free(buf);
1693 }
1694
1695 static int
1696 vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
1697 {
1698         vdev_t vtmp;
1699         spa_t *spa;
1700         vdev_t *vdev;
1701         nvlist_t *nvl;
1702         uint64_t val;
1703         uint64_t guid, vdev_children;
1704         uint64_t pool_txg, pool_guid;
1705         const char *pool_name;
1706         int rc, namelen;
1707
1708         /*
1709          * Load the vdev label and figure out which
1710          * uberblock is most current.
1711          */
1712         memset(&vtmp, 0, sizeof(vtmp));
1713         vtmp.v_phys_read = _read;
1714         vtmp.v_read_priv = read_priv;
1715         vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
1716             (uint64_t)sizeof (vdev_label_t));
1717
1718         /* Test for minimum device size. */
1719         if (vtmp.v_psize < SPA_MINDEVSIZE)
1720                 return (EIO);
1721
1722         nvl = vdev_label_read_config(&vtmp, UINT64_MAX);
1723         if (nvl == NULL)
1724                 return (EIO);
1725
1726         if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
1727             NULL, &val, NULL) != 0) {
1728                 nvlist_destroy(nvl);
1729                 return (EIO);
1730         }
1731
1732         if (!SPA_VERSION_IS_SUPPORTED(val)) {
1733                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1734                     (unsigned)val, (unsigned)SPA_VERSION);
1735                 nvlist_destroy(nvl);
1736                 return (EIO);
1737         }
1738
1739         /* Check ZFS features for read */
1740         rc = nvlist_check_features_for_read(nvl);
1741         if (rc != 0) {
1742                 nvlist_destroy(nvl);
1743                 return (EIO);
1744         }
1745
1746         if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
1747             NULL, &val, NULL) != 0) {
1748                 nvlist_destroy(nvl);
1749                 return (EIO);
1750         }
1751
1752         if (val == POOL_STATE_DESTROYED) {
1753                 /* We don't boot only from destroyed pools. */
1754                 nvlist_destroy(nvl);
1755                 return (EIO);
1756         }
1757
1758         if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
1759             NULL, &pool_txg, NULL) != 0 ||
1760             nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1761             NULL, &pool_guid, NULL) != 0 ||
1762             nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
1763             NULL, &pool_name, &namelen) != 0) {
1764                 /*
1765                  * Cache and spare devices end up here - just ignore
1766                  * them.
1767                  */
1768                 nvlist_destroy(nvl);
1769                 return (EIO);
1770         }
1771
1772         /*
1773          * Create the pool if this is the first time we've seen it.
1774          */
1775         spa = spa_find_by_guid(pool_guid);
1776         if (spa == NULL) {
1777                 char *name;
1778
1779                 nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN,
1780                     DATA_TYPE_UINT64, NULL, &vdev_children, NULL);
1781                 name = malloc(namelen + 1);
1782                 if (name == NULL) {
1783                         nvlist_destroy(nvl);
1784                         return (ENOMEM);
1785                 }
1786                 bcopy(pool_name, name, namelen);
1787                 name[namelen] = '\0';
1788                 spa = spa_create(pool_guid, name);
1789                 free(name);
1790                 if (spa == NULL) {
1791                         nvlist_destroy(nvl);
1792                         return (ENOMEM);
1793                 }
1794                 spa->spa_root_vdev->v_nchildren = vdev_children;
1795         }
1796         if (pool_txg > spa->spa_txg)
1797                 spa->spa_txg = pool_txg;
1798
1799         /*
1800          * Get the vdev tree and create our in-core copy of it.
1801          * If we already have a vdev with this guid, this must
1802          * be some kind of alias (overlapping slices, dangerously dedicated
1803          * disks etc).
1804          */
1805         if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1806             NULL, &guid, NULL) != 0) {
1807                 nvlist_destroy(nvl);
1808                 return (EIO);
1809         }
1810         vdev = vdev_find(guid);
1811         /* Has this vdev already been inited? */
1812         if (vdev && vdev->v_phys_read) {
1813                 nvlist_destroy(nvl);
1814                 return (EIO);
1815         }
1816
1817         rc = vdev_init_from_label(spa, nvl);
1818         nvlist_destroy(nvl);
1819         if (rc != 0)
1820                 return (rc);
1821
1822         /*
1823          * We should already have created an incomplete vdev for this
1824          * vdev. Find it and initialise it with our read proc.
1825          */
1826         vdev = vdev_find(guid);
1827         if (vdev != NULL) {
1828                 vdev->v_phys_read = _read;
1829                 vdev->v_read_priv = read_priv;
1830                 vdev->v_psize = vtmp.v_psize;
1831                 /*
1832                  * If no other state is set, mark vdev healthy.
1833                  */
1834                 if (vdev->v_state == VDEV_STATE_UNKNOWN)
1835                         vdev->v_state = VDEV_STATE_HEALTHY;
1836         } else {
1837                 printf("ZFS: inconsistent nvlist contents\n");
1838                 return (EIO);
1839         }
1840
1841         if (vdev->v_islog)
1842                 spa->spa_with_log = vdev->v_islog;
1843
1844         /*
1845          * Re-evaluate top-level vdev state.
1846          */
1847         vdev_set_state(vdev->v_top);
1848
1849         /*
1850          * Ok, we are happy with the pool so far. Lets find
1851          * the best uberblock and then we can actually access
1852          * the contents of the pool.
1853          */
1854         vdev_uberblock_load(vdev, &spa->spa_uberblock);
1855
1856         if (spap != NULL)
1857                 *spap = spa;
1858         return (0);
1859 }
1860
/*
 * Return the base-2 logarithm of n when n is an exact power of two
 * (a single set bit among bits 0..31), or -1 otherwise.
 */
static int
ilog2(int n)
{
        int v;

        /*
         * Compare as unsigned: testing bit 31 with a signed "1 << 31"
         * would shift into the sign bit, which is undefined behaviour.
         */
        for (v = 0; v < 32; v++)
                if ((unsigned int)n == (1U << v))
                        return (v);
        return (-1);
}
1871
1872 static int
1873 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1874 {
1875         blkptr_t gbh_bp;
1876         zio_gbh_phys_t zio_gb;
1877         char *pbuf;
1878         int i;
1879
1880         /* Artificial BP for gang block header. */
1881         gbh_bp = *bp;
1882         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1883         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1884         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1885         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1886         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1887                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1888
1889         /* Read gang header block using the artificial BP. */
1890         if (zio_read(spa, &gbh_bp, &zio_gb))
1891                 return (EIO);
1892
1893         pbuf = buf;
1894         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1895                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1896
1897                 if (BP_IS_HOLE(gbp))
1898                         continue;
1899                 if (zio_read(spa, gbp, pbuf))
1900                         return (EIO);
1901                 pbuf += BP_GET_PSIZE(gbp);
1902         }
1903
1904         if (zio_checksum_verify(spa, bp, buf))
1905                 return (EIO);
1906         return (0);
1907 }
1908
/*
 * Read the block described by bp into buf.
 *
 * Embedded block pointers carry their payload inside the BP itself and
 * are decoded directly.  Otherwise each non-empty DVA of the BP is tried
 * in order until one copy is read successfully.  When the BP names a
 * compression function, the physical data is staged in a temporary
 * buffer and decompressed into buf.
 *
 * Returns 0 on success, ENOMEM if a staging buffer cannot be allocated,
 * or the error from the last copy that was attempted (EIO if no copy
 * could be attempted at all).
 */
static int
zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
{
        int cpfunc = BP_GET_COMPRESS(bp);
        uint64_t align, size;
        void *pbuf;
        int i, error;

        /*
         * Process data embedded in block pointer
         */
        if (BP_IS_EMBEDDED(bp)) {
                ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);

                size = BPE_GET_PSIZE(bp);
                ASSERT(size <= BPE_PAYLOAD_SIZE);

                /* Compressed payloads are staged before decompression. */
                if (cpfunc != ZIO_COMPRESS_OFF)
                        pbuf = malloc(size);
                else
                        pbuf = buf;

                if (pbuf == NULL)
                        return (ENOMEM);

                decode_embedded_bp_compressed(bp, pbuf);
                error = 0;

                if (cpfunc != ZIO_COMPRESS_OFF) {
                        error = zio_decompress_data(cpfunc, pbuf,
                            size, buf, BP_GET_LSIZE(bp));
                        free(pbuf);
                }
                if (error != 0)
                        printf("ZFS: i/o error - unable to decompress "
                            "block pointer data, error %d\n", error);
                return (error);
        }

        /* Reported if no DVA below yields an attempt. */
        error = EIO;

        for (i = 0; i < SPA_DVAS_PER_BP; i++) {
                const dva_t *dva = &bp->blk_dva[i];
                vdev_t *vdev;
                vdev_list_t *vlist;
                uint64_t vdevid;
                off_t offset;

                /* Skip unused (all-zero) DVAs. */
                if (!dva->dva_word[0] && !dva->dva_word[1])
                        continue;

                /* Locate the top-level vdev holding this copy. */
                vdevid = DVA_GET_VDEV(dva);
                offset = DVA_GET_OFFSET(dva);
                vlist = &spa->spa_root_vdev->v_children;
                STAILQ_FOREACH(vdev, vlist, v_childlink) {
                        if (vdev->v_id == vdevid)
                                break;
                }
                if (!vdev || !vdev->v_read)
                        continue;

                /*
                 * For raidz the read size is rounded up to a multiple
                 * of 1 << v_ashift, which may exceed the BP's physical
                 * size.
                 */
                size = BP_GET_PSIZE(bp);
                if (vdev->v_read == vdev_raidz_read) {
                        align = 1ULL << vdev->v_ashift;
                        if (P2PHASE(size, align) != 0)
                                size = P2ROUNDUP(size, align);
                }
                /*
                 * Read straight into buf only when the data is neither
                 * padded nor compressed; otherwise use a staging buffer.
                 */
                if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
                        pbuf = malloc(size);
                else
                        pbuf = buf;

                if (pbuf == NULL) {
                        error = ENOMEM;
                        break;
                }

                if (DVA_GET_GANG(dva))
                        error = zio_read_gang(spa, bp, pbuf);
                else
                        error = vdev->v_read(vdev, bp, pbuf, offset, size);
                if (error == 0) {
                        if (cpfunc != ZIO_COMPRESS_OFF)
                                error = zio_decompress_data(cpfunc, pbuf,
                                    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
                        else if (size != BP_GET_PSIZE(bp))
                                bcopy(pbuf, buf, BP_GET_PSIZE(bp));
                } else {
                        printf("zio_read error: %d\n", error);
                }
                /* Release the staging buffer, if one was used. */
                if (buf != pbuf)
                        free(pbuf);
                /* The first good copy ends the search. */
                if (error == 0)
                        break;
        }
        if (error != 0)
                printf("ZFS: i/o error - all block copies unavailable\n");

        return (error);
}
2009
2010 static int
2011 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset,
2012     void *buf, size_t buflen)
2013 {
2014         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
2015         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2016         int nlevels = dnode->dn_nlevels;
2017         int i, rc;
2018
2019         if (bsize > SPA_MAXBLOCKSIZE) {
2020                 printf("ZFS: I/O error - blocks larger than %llu are not "
2021                     "supported\n", SPA_MAXBLOCKSIZE);
2022                 return (EIO);
2023         }
2024
2025         /*
2026          * Note: bsize may not be a power of two here so we need to do an
2027          * actual divide rather than a bitshift.
2028          */
2029         while (buflen > 0) {
2030                 uint64_t bn = offset / bsize;
2031                 int boff = offset % bsize;
2032                 int ibn;
2033                 const blkptr_t *indbp;
2034                 blkptr_t bp;
2035
2036                 if (bn > dnode->dn_maxblkid)
2037                         return (EIO);
2038
2039                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
2040                         goto cached;
2041
2042                 indbp = dnode->dn_blkptr;
2043                 for (i = 0; i < nlevels; i++) {
2044                         /*
2045                          * Copy the bp from the indirect array so that
2046                          * we can re-use the scratch buffer for multi-level
2047                          * objects.
2048                          */
2049                         ibn = bn >> ((nlevels - i - 1) * ibshift);
2050                         ibn &= ((1 << ibshift) - 1);
2051                         bp = indbp[ibn];
2052                         if (BP_IS_HOLE(&bp)) {
2053                                 memset(dnode_cache_buf, 0, bsize);
2054                                 break;
2055                         }
2056                         rc = zio_read(spa, &bp, dnode_cache_buf);
2057                         if (rc)
2058                                 return (rc);
2059                         indbp = (const blkptr_t *) dnode_cache_buf;
2060                 }
2061                 dnode_cache_obj = dnode;
2062                 dnode_cache_bn = bn;
2063         cached:
2064
2065                 /*
2066                  * The buffer contains our data block. Copy what we
2067                  * need from it and loop.
2068                  */
2069                 i = bsize - boff;
2070                 if (i > buflen) i = buflen;
2071                 memcpy(buf, &dnode_cache_buf[boff], i);
2072                 buf = ((char *)buf) + i;
2073                 offset += i;
2074                 buflen -= i;
2075         }
2076
2077         return (0);
2078 }
2079
2080 /*
2081  * Lookup a value in a microzap directory.
2082  */
2083 static int
2084 mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name,
2085     uint64_t *value)
2086 {
2087         const mzap_ent_phys_t *mze;
2088         int chunks, i;
2089
2090         /*
2091          * Microzap objects use exactly one block. Read the whole
2092          * thing.
2093          */
2094         chunks = size / MZAP_ENT_LEN - 1;
2095         for (i = 0; i < chunks; i++) {
2096                 mze = &mz->mz_chunk[i];
2097                 if (strcmp(mze->mze_name, name) == 0) {
2098                         *value = mze->mze_value;
2099                         return (0);
2100                 }
2101         }
2102
2103         return (ENOENT);
2104 }
2105
2106 /*
2107  * Compare a name with a zap leaf entry. Return non-zero if the name
2108  * matches.
2109  */
2110 static int
2111 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2112     const char *name)
2113 {
2114         size_t namelen;
2115         const zap_leaf_chunk_t *nc;
2116         const char *p;
2117
2118         namelen = zc->l_entry.le_name_numints;
2119
2120         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2121         p = name;
2122         while (namelen > 0) {
2123                 size_t len;
2124
2125                 len = namelen;
2126                 if (len > ZAP_LEAF_ARRAY_BYTES)
2127                         len = ZAP_LEAF_ARRAY_BYTES;
2128                 if (memcmp(p, nc->l_array.la_array, len))
2129                         return (0);
2130                 p += len;
2131                 namelen -= len;
2132                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2133         }
2134
2135         return (1);
2136 }
2137
2138 /*
2139  * Extract a uint64_t value from a zap leaf entry.
2140  */
2141 static uint64_t
2142 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
2143 {
2144         const zap_leaf_chunk_t *vc;
2145         int i;
2146         uint64_t value;
2147         const uint8_t *p;
2148
2149         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
2150         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
2151                 value = (value << 8) | p[i];
2152         }
2153
2154         return (value);
2155 }
2156
/*
 * Store value at addr as an unsigned integer that is len bytes wide
 * (1, 2, 4 or 8), truncating as needed.  Any other len stores nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
        if (len == 1)
                *(uint8_t *)addr = value;
        else if (len == 2)
                *(uint16_t *)addr = value;
        else if (len == 4)
                *(uint32_t *)addr = value;
        else if (len == 8)
                *(uint64_t *)addr = value;
}
2175
/*
 * Extract an array of integers from a zap leaf entry.
 *
 * Copies at most num_integers integers (and no more than the entry
 * holds) from the value of entry zc in leaf zl into buf, storing each
 * one as an integer_size-byte integer.  The stored integers are
 * le_value_intlen bytes wide and are assembled most-significant byte
 * first while walking the entry's chain of array chunks.
 */
static void
fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
        uint64_t array_int_len = zc->l_entry.le_value_intlen;
        uint64_t value = 0;
        uint64_t *u64 = buf;
        char *p = buf;
        int len = MIN(zc->l_entry.le_value_numints, num_integers);
        int chunk = zc->l_entry.le_value_chunk;
        int byten = 0;

        /* Shortcut for a single 64-bit value. */
        if (integer_size == 8 && len == 1) {
                *u64 = fzap_leaf_value(zl, zc);
                return;
        }

        while (len > 0) {
                struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
                int i;

                ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
                for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
                        /*
                         * NOTE(review): value is not cleared between
                         * integers, so when the stored integers are
                         * narrower than integer_size, bytes of earlier
                         * integers stay in the upper bits - confirm
                         * whether callers always use equal widths.
                         */
                        value = (value << 8) | la->la_array[i];
                        byten++;
                        /* A complete stored integer has been gathered. */
                        if (byten == array_int_len) {
                                stv(integer_size, p, value);
                                byten = 0;
                                len--;
                                if (len == 0)
                                        return;
                                p += integer_size;
                        }
                }
                /* Continue with the next chunk of the chain. */
                chunk = la->la_next;
        }
}
2216
2217 static int
2218 fzap_check_size(uint64_t integer_size, uint64_t num_integers)
2219 {
2220
2221         switch (integer_size) {
2222         case 1:
2223         case 2:
2224         case 4:
2225         case 8:
2226                 break;
2227         default:
2228                 return (EINVAL);
2229         }
2230
2231         if (integer_size * num_integers > ZAP_MAXVALUELEN)
2232                 return (E2BIG);
2233
2234         return (0);
2235 }
2236
2237 static void
2238 zap_leaf_free(zap_leaf_t *leaf)
2239 {
2240         free(leaf->l_phys);
2241         free(leaf);
2242 }
2243
2244 static int
2245 zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp)
2246 {
2247         int bs = FZAP_BLOCK_SHIFT(zap);
2248         int err;
2249
2250         *lp = malloc(sizeof(**lp));
2251         if (*lp == NULL)
2252                 return (ENOMEM);
2253
2254         (*lp)->l_bs = bs;
2255         (*lp)->l_phys = malloc(1 << bs);
2256
2257         if ((*lp)->l_phys == NULL) {
2258                 free(*lp);
2259                 return (ENOMEM);
2260         }
2261         err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys,
2262             1 << bs);
2263         if (err != 0) {
2264                 zap_leaf_free(*lp);
2265         }
2266         return (err);
2267 }
2268
2269 static int
2270 zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx,
2271     uint64_t *valp)
2272 {
2273         int bs = FZAP_BLOCK_SHIFT(zap);
2274         uint64_t blk = idx >> (bs - 3);
2275         uint64_t off = idx & ((1 << (bs - 3)) - 1);
2276         uint64_t *buf;
2277         int rc;
2278
2279         buf = malloc(1 << zap->zap_block_shift);
2280         if (buf == NULL)
2281                 return (ENOMEM);
2282         rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs,
2283             buf, 1 << zap->zap_block_shift);
2284         if (rc == 0)
2285                 *valp = buf[off];
2286         free(buf);
2287         return (rc);
2288 }
2289
2290 static int
2291 zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp)
2292 {
2293         if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) {
2294                 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
2295                 return (0);
2296         } else {
2297                 return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl,
2298                     idx, valp));
2299         }
2300 }
2301
2302 #define ZAP_HASH_IDX(hash, n)   (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
2303 static int
2304 zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp)
2305 {
2306         uint64_t idx, blk;
2307         int err;
2308
2309         idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift);
2310         err = zap_idx_to_blk(zap, idx, &blk);
2311         if (err != 0)
2312                 return (err);
2313         return (zap_get_leaf_byblk(zap, blk, lp));
2314 }
2315
2316 #define CHAIN_END       0xffff  /* end of the chunk chain */
2317 #define LEAF_HASH(l, h) \
2318         ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
2319         ((h) >> \
2320         (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
2321 #define LEAF_HASH_ENTPTR(l, h)  (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
2322
2323 static int
2324 zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name,
2325     uint64_t integer_size, uint64_t num_integers, void *value)
2326 {
2327         int rc;
2328         uint16_t *chunkp;
2329         struct zap_leaf_entry *le;
2330
2331         /*
2332          * Make sure this chunk matches our hash.
2333          */
2334         if (zl->l_phys->l_hdr.lh_prefix_len > 0 &&
2335             zl->l_phys->l_hdr.lh_prefix !=
2336             hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len))
2337                 return (EIO);
2338
2339         rc = ENOENT;
2340         for (chunkp = LEAF_HASH_ENTPTR(zl, hash);
2341             *chunkp != CHAIN_END; chunkp = &le->le_next) {
2342                 zap_leaf_chunk_t *zc;
2343                 uint16_t chunk = *chunkp;
2344
2345                 le = ZAP_LEAF_ENTRY(zl, chunk);
2346                 if (le->le_hash != hash)
2347                         continue;
2348                 zc = &ZAP_LEAF_CHUNK(zl, chunk);
2349                 if (fzap_name_equal(zl, zc, name)) {
2350                         if (zc->l_entry.le_value_intlen > integer_size) {
2351                                 rc = EINVAL;
2352                         } else {
2353                                 fzap_leaf_array(zl, zc, integer_size,
2354                                     num_integers, value);
2355                                 rc = 0;
2356                         }
2357                         break;
2358                 }
2359         }
2360         return (rc);
2361 }
2362
2363 /*
2364  * Lookup a value in a fatzap directory.
2365  */
2366 static int
2367 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2368     const char *name, uint64_t integer_size, uint64_t num_integers,
2369     void *value)
2370 {
2371         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2372         fat_zap_t z;
2373         zap_leaf_t *zl;
2374         uint64_t hash;
2375         int rc;
2376
2377         if (zh->zap_magic != ZAP_MAGIC)
2378                 return (EIO);
2379
2380         if ((rc = fzap_check_size(integer_size, num_integers)) != 0)
2381                 return (rc);
2382
2383         z.zap_block_shift = ilog2(bsize);
2384         z.zap_phys = zh;
2385         z.zap_spa = spa;
2386         z.zap_dnode = dnode;
2387
2388         hash = zap_hash(zh->zap_salt, name);
2389         rc = zap_deref_leaf(&z, hash, &zl);
2390         if (rc != 0)
2391                 return (rc);
2392
2393         rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value);
2394
2395         zap_leaf_free(zl);
2396         return (rc);
2397 }
2398
2399 /*
2400  * Lookup a name in a zap object and return its value as a uint64_t.
2401  */
2402 static int
2403 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
2404     uint64_t integer_size, uint64_t num_integers, void *value)
2405 {
2406         int rc;
2407         zap_phys_t *zap;
2408         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2409
2410         zap = malloc(size);
2411         if (zap == NULL)
2412                 return (ENOMEM);
2413
2414         rc = dnode_read(spa, dnode, 0, zap, size);
2415         if (rc)
2416                 goto done;
2417
2418         switch (zap->zap_block_type) {
2419         case ZBT_MICRO:
2420                 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value);
2421                 break;
2422         case ZBT_HEADER:
2423                 rc = fzap_lookup(spa, dnode, zap, name, integer_size,
2424                     num_integers, value);
2425                 break;
2426         default:
2427                 printf("ZFS: invalid zap_type=%" PRIx64 "\n",
2428                     zap->zap_block_type);
2429                 rc = EIO;
2430         }
2431 done:
2432         free(zap);
2433         return (rc);
2434 }
2435
2436 /*
2437  * List a microzap directory.
2438  */
2439 static int
2440 mzap_list(const mzap_phys_t *mz, size_t size,
2441     int (*callback)(const char *, uint64_t))
2442 {
2443         const mzap_ent_phys_t *mze;
2444         int chunks, i, rc;
2445
2446         /*
2447          * Microzap objects use exactly one block. Read the whole
2448          * thing.
2449          */
2450         rc = 0;
2451         chunks = size / MZAP_ENT_LEN - 1;
2452         for (i = 0; i < chunks; i++) {
2453                 mze = &mz->mz_chunk[i];
2454                 if (mze->mze_name[0]) {
2455                         rc = callback(mze->mze_name, mze->mze_value);
2456                         if (rc != 0)
2457                                 break;
2458                 }
2459         }
2460
2461         return (rc);
2462 }
2463
2464 /*
2465  * List a fatzap directory.
2466  */
2467 static int
2468 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2469     int (*callback)(const char *, uint64_t))
2470 {
2471         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2472         fat_zap_t z;
2473         uint64_t i;
2474         int j, rc;
2475
2476         if (zh->zap_magic != ZAP_MAGIC)
2477                 return (EIO);
2478
2479         z.zap_block_shift = ilog2(bsize);
2480         z.zap_phys = zh;
2481
2482         /*
2483          * This assumes that the leaf blocks start at block 1. The
2484          * documentation isn't exactly clear on this.
2485          */
2486         zap_leaf_t zl;
2487         zl.l_bs = z.zap_block_shift;
2488         zl.l_phys = malloc(bsize);
2489         if (zl.l_phys == NULL)
2490                 return (ENOMEM);
2491
2492         for (i = 0; i < zh->zap_num_leafs; i++) {
2493                 off_t off = ((off_t)(i + 1)) << zl.l_bs;
2494                 char name[256], *p;
2495                 uint64_t value;
2496
2497                 if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) {
2498                         free(zl.l_phys);
2499                         return (EIO);
2500                 }
2501
2502                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
2503                         zap_leaf_chunk_t *zc, *nc;
2504                         int namelen;
2505
2506                         zc = &ZAP_LEAF_CHUNK(&zl, j);
2507                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
2508                                 continue;
2509                         namelen = zc->l_entry.le_name_numints;
2510                         if (namelen > sizeof(name))
2511                                 namelen = sizeof(name);
2512
2513                         /*
2514                          * Paste the name back together.
2515                          */
2516                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
2517                         p = name;
2518                         while (namelen > 0) {
2519                                 int len;
2520                                 len = namelen;
2521                                 if (len > ZAP_LEAF_ARRAY_BYTES)
2522                                         len = ZAP_LEAF_ARRAY_BYTES;
2523                                 memcpy(p, nc->l_array.la_array, len);
2524                                 p += len;
2525                                 namelen -= len;
2526                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
2527                         }
2528
2529                         /*
2530                          * Assume the first eight bytes of the value are
2531                          * a uint64_t.
2532                          */
2533                         value = fzap_leaf_value(&zl, zc);
2534
2535                         /* printf("%s 0x%jx\n", name, (uintmax_t)value); */
2536                         rc = callback((const char *)name, value);
2537                         if (rc != 0) {
2538                                 free(zl.l_phys);
2539                                 return (rc);
2540                         }
2541                 }
2542         }
2543
2544         free(zl.l_phys);
2545         return (0);
2546 }
2547
2548 static int zfs_printf(const char *name, uint64_t value __unused)
2549 {
2550
2551         printf("%s\n", name);
2552
2553         return (0);
2554 }
2555
2556 /*
2557  * List a zap directory.
2558  */
2559 static int
2560 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
2561 {
2562         zap_phys_t *zap;
2563         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2564         int rc;
2565
2566         zap = malloc(size);
2567         if (zap == NULL)
2568                 return (ENOMEM);
2569
2570         rc = dnode_read(spa, dnode, 0, zap, size);
2571         if (rc == 0) {
2572                 if (zap->zap_block_type == ZBT_MICRO)
2573                         rc = mzap_list((const mzap_phys_t *)zap, size,
2574                             zfs_printf);
2575                 else
2576                         rc = fzap_list(spa, dnode, zap, zfs_printf);
2577         }
2578         free(zap);
2579         return (rc);
2580 }
2581
2582 static int
2583 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum,
2584     dnode_phys_t *dnode)
2585 {
2586         off_t offset;
2587
2588         offset = objnum * sizeof(dnode_phys_t);
2589         return dnode_read(spa, &os->os_meta_dnode, offset,
2590                 dnode, sizeof(dnode_phys_t));
2591 }
2592
2593 /*
2594  * Lookup a name in a microzap directory.
2595  */
2596 static int
2597 mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value)
2598 {
2599         const mzap_ent_phys_t *mze;
2600         int chunks, i;
2601
2602         /*
2603          * Microzap objects use exactly one block. Read the whole
2604          * thing.
2605          */
2606         chunks = size / MZAP_ENT_LEN - 1;
2607         for (i = 0; i < chunks; i++) {
2608                 mze = &mz->mz_chunk[i];
2609                 if (value == mze->mze_value) {
2610                         strcpy(name, mze->mze_name);
2611                         return (0);
2612                 }
2613         }
2614
2615         return (ENOENT);
2616 }
2617
2618 static void
2619 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
2620 {
2621         size_t namelen;
2622         const zap_leaf_chunk_t *nc;
2623         char *p;
2624
2625         namelen = zc->l_entry.le_name_numints;
2626
2627         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2628         p = name;
2629         while (namelen > 0) {
2630                 size_t len;
2631                 len = namelen;
2632                 if (len > ZAP_LEAF_ARRAY_BYTES)
2633                         len = ZAP_LEAF_ARRAY_BYTES;
2634                 memcpy(p, nc->l_array.la_array, len);
2635                 p += len;
2636                 namelen -= len;
2637                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2638         }
2639
2640         *p = '\0';
2641 }
2642
2643 static int
2644 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2645     char *name, uint64_t value)
2646 {
2647         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2648         fat_zap_t z;
2649         uint64_t i;
2650         int j, rc;
2651
2652         if (zh->zap_magic != ZAP_MAGIC)
2653                 return (EIO);
2654
2655         z.zap_block_shift = ilog2(bsize);
2656         z.zap_phys = zh;
2657
2658         /*
2659          * This assumes that the leaf blocks start at block 1. The
2660          * documentation isn't exactly clear on this.
2661          */
2662         zap_leaf_t zl;
2663         zl.l_bs = z.zap_block_shift;
2664         zl.l_phys = malloc(bsize);
2665         if (zl.l_phys == NULL)
2666                 return (ENOMEM);
2667
2668         for (i = 0; i < zh->zap_num_leafs; i++) {
2669                 off_t off = ((off_t)(i + 1)) << zl.l_bs;
2670
2671                 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize);
2672                 if (rc != 0)
2673                         goto done;
2674
2675                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
2676                         zap_leaf_chunk_t *zc;
2677
2678                         zc = &ZAP_LEAF_CHUNK(&zl, j);
2679                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
2680                                 continue;
2681                         if (zc->l_entry.le_value_intlen != 8 ||
2682                             zc->l_entry.le_value_numints != 1)
2683                                 continue;
2684
2685                         if (fzap_leaf_value(&zl, zc) == value) {
2686                                 fzap_name_copy(&zl, zc, name);
2687                                 goto done;
2688                         }
2689                 }
2690         }
2691
2692         rc = ENOENT;
2693 done:
2694         free(zl.l_phys);
2695         return (rc);
2696 }
2697
2698 static int
2699 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
2700     uint64_t value)
2701 {
2702         zap_phys_t *zap;
2703         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2704         int rc;
2705
2706         zap = malloc(size);
2707         if (zap == NULL)
2708                 return (ENOMEM);
2709
2710         rc = dnode_read(spa, dnode, 0, zap, size);
2711         if (rc == 0) {
2712                 if (zap->zap_block_type == ZBT_MICRO)
2713                         rc = mzap_rlookup((const mzap_phys_t *)zap, size,
2714                             name, value);
2715                 else
2716                         rc = fzap_rlookup(spa, dnode, zap, name, value);
2717         }
2718         free(zap);
2719         return (rc);
2720 }
2721
2722 static int
2723 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
2724 {
2725         char name[256];
2726         char component[256];
2727         uint64_t dir_obj, parent_obj, child_dir_zapobj;
2728         dnode_phys_t child_dir_zap, dataset, dir, parent;
2729         dsl_dir_phys_t *dd;
2730         dsl_dataset_phys_t *ds;
2731         char *p;
2732         int len;
2733
2734         p = &name[sizeof(name) - 1];
2735         *p = '\0';
2736
2737         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2738                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2739                 return (EIO);
2740         }
2741         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
2742         dir_obj = ds->ds_dir_obj;
2743
2744         for (;;) {
2745                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
2746                         return (EIO);
2747                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2748
2749                 /* Actual loop condition. */
2750                 parent_obj = dd->dd_parent_obj;
2751                 if (parent_obj == 0)
2752                         break;
2753
2754                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj,
2755                     &parent) != 0)
2756                         return (EIO);
2757                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
2758                 child_dir_zapobj = dd->dd_child_dir_zapobj;
2759                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
2760                     &child_dir_zap) != 0)
2761                         return (EIO);
2762                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
2763                         return (EIO);
2764
2765                 len = strlen(component);
2766                 p -= len;
2767                 memcpy(p, component, len);
2768                 --p;
2769                 *p = '/';
2770
2771                 /* Actual loop iteration. */
2772                 dir_obj = parent_obj;
2773         }
2774
2775         if (*p != '\0')
2776                 ++p;
2777         strcpy(result, p);
2778
2779         return (0);
2780 }
2781
2782 static int
2783 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
2784 {
2785         char element[256];
2786         uint64_t dir_obj, child_dir_zapobj;
2787         dnode_phys_t child_dir_zap, dir;
2788         dsl_dir_phys_t *dd;
2789         const char *p, *q;
2790
2791         if (objset_get_dnode(spa, &spa->spa_mos,
2792             DMU_POOL_DIRECTORY_OBJECT, &dir))
2793                 return (EIO);
2794         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
2795             1, &dir_obj))
2796                 return (EIO);
2797
2798         p = name;
2799         for (;;) {
2800                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
2801                         return (EIO);
2802                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2803
2804                 while (*p == '/')
2805                         p++;
2806                 /* Actual loop condition #1. */
2807                 if (*p == '\0')
2808                         break;
2809
2810                 q = strchr(p, '/');
2811                 if (q) {
2812                         memcpy(element, p, q - p);
2813                         element[q - p] = '\0';
2814                         p = q + 1;
2815                 } else {
2816                         strcpy(element, p);
2817                         p += strlen(p);
2818                 }
2819
2820                 child_dir_zapobj = dd->dd_child_dir_zapobj;
2821                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
2822                     &child_dir_zap) != 0)
2823                         return (EIO);
2824
2825                 /* Actual loop condition #2. */
2826                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
2827                     1, &dir_obj) != 0)
2828                         return (ENOENT);
2829         }
2830
2831         *objnum = dd->dd_head_dataset_obj;
2832         return (0);
2833 }
2834
2835 #ifndef BOOT2
2836 static int
2837 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
2838 {
2839         uint64_t dir_obj, child_dir_zapobj;
2840         dnode_phys_t child_dir_zap, dir, dataset;
2841         dsl_dataset_phys_t *ds;
2842         dsl_dir_phys_t *dd;
2843
2844         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2845                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2846                 return (EIO);
2847         }
2848         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
2849         dir_obj = ds->ds_dir_obj;
2850
2851         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
2852                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2853                 return (EIO);
2854         }
2855         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2856
2857         child_dir_zapobj = dd->dd_child_dir_zapobj;
2858         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
2859             &child_dir_zap) != 0) {
2860                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2861                 return (EIO);
2862         }
2863
2864         return (zap_list(spa, &child_dir_zap) != 0);
2865 }
2866
2867 int
2868 zfs_callback_dataset(const spa_t *spa, uint64_t objnum,
2869     int (*callback)(const char *, uint64_t))
2870 {
2871         uint64_t dir_obj, child_dir_zapobj;
2872         dnode_phys_t child_dir_zap, dir, dataset;
2873         dsl_dataset_phys_t *ds;
2874         dsl_dir_phys_t *dd;
2875         zap_phys_t *zap;
2876         size_t size;
2877         int err;
2878
2879         err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
2880         if (err != 0) {
2881                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2882                 return (err);
2883         }
2884         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
2885         dir_obj = ds->ds_dir_obj;
2886
2887         err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
2888         if (err != 0) {
2889                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2890                 return (err);
2891         }
2892         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2893
2894         child_dir_zapobj = dd->dd_child_dir_zapobj;
2895         err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
2896             &child_dir_zap);
2897         if (err != 0) {
2898                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2899                 return (err);
2900         }
2901
2902         size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT;
2903         zap = malloc(size);
2904         if (zap != NULL) {
2905                 err = dnode_read(spa, &child_dir_zap, 0, zap, size);
2906                 if (err != 0)
2907                         goto done;
2908
2909                 if (zap->zap_block_type == ZBT_MICRO)
2910                         err = mzap_list((const mzap_phys_t *)zap, size,
2911                             callback);
2912                 else
2913                         err = fzap_list(spa, &child_dir_zap, zap, callback);
2914         } else {
2915                 err = ENOMEM;
2916         }
2917 done:
2918         free(zap);
2919         return (err);
2920 }
2921 #endif
2922
2923 /*
2924  * Find the object set given the object number of its dataset object
2925  * and return its details in *objset
2926  */
2927 static int
2928 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2929 {
2930         dnode_phys_t dataset;
2931         dsl_dataset_phys_t *ds;
2932
2933         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2934                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2935                 return (EIO);
2936         }
2937
2938         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
2939         if (zio_read(spa, &ds->ds_bp, objset)) {
2940                 printf("ZFS: can't read object set for dataset %ju\n",
2941                     (uintmax_t)objnum);
2942                 return (EIO);
2943         }
2944
2945         return (0);
2946 }
2947
2948 /*
2949  * Find the object set pointed to by the BOOTFS property or the root
2950  * dataset if there is none and return its details in *objset
2951  */
2952 static int
2953 zfs_get_root(const spa_t *spa, uint64_t *objid)
2954 {
2955         dnode_phys_t dir, propdir;
2956         uint64_t props, bootfs, root;
2957
2958         *objid = 0;
2959
2960         /*
2961          * Start with the MOS directory object.
2962          */
2963         if (objset_get_dnode(spa, &spa->spa_mos,
2964             DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2965                 printf("ZFS: can't read MOS object directory\n");
2966                 return (EIO);
2967         }
2968
2969         /*
2970          * Lookup the pool_props and see if we can find a bootfs.
2971          */
2972         if (zap_lookup(spa, &dir, DMU_POOL_PROPS,
2973             sizeof(props), 1, &props) == 0 &&
2974             objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 &&
2975             zap_lookup(spa, &propdir, "bootfs",
2976             sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) {
2977                 *objid = bootfs;
2978                 return (0);
2979         }
2980         /*
2981          * Lookup the root dataset directory
2982          */
2983         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET,
2984             sizeof(root), 1, &root) ||
2985             objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2986                 printf("ZFS: can't find root dsl_dir\n");
2987                 return (EIO);
2988         }
2989
2990         /*
2991          * Use the information from the dataset directory's bonus buffer
2992          * to find the dataset object and from that the object set itself.
2993          */
2994         dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2995         *objid = dd->dd_head_dataset_obj;
2996         return (0);
2997 }
2998
2999 static int
3000 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
3001 {
3002
3003         mount->spa = spa;
3004
3005         /*
3006          * Find the root object set if not explicitly provided
3007          */
3008         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
3009                 printf("ZFS: can't find root filesystem\n");
3010                 return (EIO);
3011         }
3012
3013         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
3014                 printf("ZFS: can't open root filesystem\n");
3015                 return (EIO);
3016         }
3017
3018         mount->rootobj = rootobj;
3019
3020         return (0);
3021 }
3022
3023 /*
3024  * callback function for feature name checks.
3025  */
3026 static int
3027 check_feature(const char *name, uint64_t value)
3028 {
3029         int i;
3030
3031         if (value == 0)
3032                 return (0);
3033         if (name[0] == '\0')
3034                 return (0);
3035
3036         for (i = 0; features_for_read[i] != NULL; i++) {
3037                 if (strcmp(name, features_for_read[i]) == 0)
3038                         return (0);
3039         }
3040         printf("ZFS: unsupported feature: %s\n", name);
3041         return (EIO);
3042 }
3043
3044 /*
3045  * Checks whether the MOS features that are active are supported.
3046  */
3047 static int
3048 check_mos_features(const spa_t *spa)
3049 {
3050         dnode_phys_t dir;
3051         zap_phys_t *zap;
3052         uint64_t objnum;
3053         size_t size;
3054         int rc;
3055
3056         if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
3057             &dir)) != 0)
3058                 return (rc);
3059         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
3060             sizeof (objnum), 1, &objnum)) != 0) {
3061                 /*
3062                  * It is older pool without features. As we have already
3063                  * tested the label, just return without raising the error.
3064                  */
3065                 return (0);
3066         }
3067
3068         if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
3069                 return (rc);
3070
3071         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
3072                 return (EIO);
3073
3074         size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3075         zap = malloc(size);
3076         if (zap == NULL)
3077                 return (ENOMEM);
3078
3079         if (dnode_read(spa, &dir, 0, zap, size)) {
3080                 free(zap);
3081                 return (EIO);
3082         }
3083
3084         if (zap->zap_block_type == ZBT_MICRO)
3085                 rc = mzap_list((const mzap_phys_t *)zap, size, check_feature);
3086         else
3087                 rc = fzap_list(spa, &dir, zap, check_feature);
3088
3089         free(zap);
3090         return (rc);
3091 }
3092
3093 static int
3094 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
3095 {
3096         dnode_phys_t dir;
3097         size_t size;
3098         int rc;
3099         unsigned char *nv;
3100
3101         *value = NULL;
3102         if ((rc = objset_get_dnode(spa, &spa->spa_mos, obj, &dir)) != 0)
3103                 return (rc);
3104         if (dir.dn_type != DMU_OT_PACKED_NVLIST &&
3105             dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
3106                 return (EIO);
3107         }
3108
3109         if (dir.dn_bonuslen != sizeof (uint64_t))
3110                 return (EIO);
3111
3112         size = *(uint64_t *)DN_BONUS(&dir);
3113         nv = malloc(size);
3114         if (nv == NULL)
3115                 return (ENOMEM);
3116
3117         rc = dnode_read(spa, &dir, 0, nv, size);
3118         if (rc != 0) {
3119                 free(nv);
3120                 nv = NULL;
3121                 return (rc);
3122         }
3123         *value = nvlist_import(nv + 4, nv[0], nv[1]);
3124         free(nv);
3125         return (rc);
3126 }
3127
3128 static int
3129 zfs_spa_init(spa_t *spa)
3130 {
3131         dnode_phys_t dir;
3132         uint64_t config_object;
3133         nvlist_t *nvlist;
3134         int rc;
3135
3136         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
3137                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
3138                 return (EIO);
3139         }
3140         if (spa->spa_mos.os_type != DMU_OST_META) {
3141                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
3142                 return (EIO);
3143         }
3144
3145         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
3146             &dir)) {
3147                 printf("ZFS: failed to read pool %s directory object\n",
3148                     spa->spa_name);
3149                 return (EIO);
3150         }
3151         /* this is allowed to fail, older pools do not have salt */
3152         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
3153             sizeof (spa->spa_cksum_salt.zcs_bytes),
3154             spa->spa_cksum_salt.zcs_bytes);
3155
3156         rc = check_mos_features(spa);
3157         if (rc != 0) {
3158                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
3159                 return (rc);
3160         }
3161
3162         rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG,
3163             sizeof (config_object), 1, &config_object);
3164         if (rc != 0) {
3165                 printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG);
3166                 return (EIO);
3167         }
3168         rc = load_nvlist(spa, config_object, &nvlist);
3169         if (rc != 0)
3170                 return (rc);
3171         /*
3172          * Update vdevs from MOS config. Note, we do skip encoding bytes
3173          * here. See also vdev_label_read_config().
3174          */
3175         rc = vdev_init_from_nvlist(spa, nvlist);
3176         nvlist_destroy(nvlist);
3177         return (rc);
3178 }
3179
3180 static int
3181 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
3182 {
3183
3184         if (dn->dn_bonustype != DMU_OT_SA) {
3185                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
3186
3187                 sb->st_mode = zp->zp_mode;
3188                 sb->st_uid = zp->zp_uid;
3189                 sb->st_gid = zp->zp_gid;
3190                 sb->st_size = zp->zp_size;
3191         } else {
3192                 sa_hdr_phys_t *sahdrp;
3193                 int hdrsize;
3194                 size_t size = 0;
3195                 void *buf = NULL;
3196
3197                 if (dn->dn_bonuslen != 0)
3198                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3199                 else {
3200                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
3201                                 blkptr_t *bp = DN_SPILL_BLKPTR(dn);
3202                                 int error;
3203
3204                                 size = BP_GET_LSIZE(bp);
3205                                 buf = malloc(size);
3206                                 if (buf == NULL)
3207                                         error = ENOMEM;
3208                                 else
3209                                         error = zio_read(spa, bp, buf);
3210
3211                                 if (error != 0) {
3212                                         free(buf);
3213                                         return (error);
3214                                 }
3215                                 sahdrp = buf;
3216                         } else {
3217                                 return (EIO);
3218                         }
3219                 }
3220                 hdrsize = SA_HDR_SIZE(sahdrp);
3221                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
3222                     SA_MODE_OFFSET);
3223                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
3224                     SA_UID_OFFSET);
3225                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
3226                     SA_GID_OFFSET);
3227                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
3228                     SA_SIZE_OFFSET);
3229                 free(buf);
3230         }
3231
3232         return (0);
3233 }
3234
3235 static int
3236 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
3237 {
3238         int rc = 0;
3239
3240         if (dn->dn_bonustype == DMU_OT_SA) {
3241                 sa_hdr_phys_t *sahdrp = NULL;
3242                 size_t size = 0;
3243                 void *buf = NULL;
3244                 int hdrsize;
3245                 char *p;
3246
3247                 if (dn->dn_bonuslen != 0) {
3248                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3249                 } else {
3250                         blkptr_t *bp;
3251
3252                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
3253                                 return (EIO);
3254                         bp = DN_SPILL_BLKPTR(dn);
3255
3256                         size = BP_GET_LSIZE(bp);
3257                         buf = malloc(size);
3258                         if (buf == NULL)
3259                                 rc = ENOMEM;
3260                         else
3261                                 rc = zio_read(spa, bp, buf);
3262                         if (rc != 0) {
3263                                 free(buf);
3264                                 return (rc);
3265                         }
3266                         sahdrp = buf;
3267                 }
3268                 hdrsize = SA_HDR_SIZE(sahdrp);
3269                 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
3270                 memcpy(path, p, psize);
3271                 free(buf);
3272                 return (0);
3273         }
3274         /*
3275          * Second test is purely to silence bogus compiler
3276          * warning about accessing past the end of dn_bonus.
3277          */
3278         if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
3279             sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
3280                 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
3281         } else {
3282                 rc = dnode_read(spa, dn, 0, path, psize);
3283         }
3284         return (rc);
3285 }
3286
/*
 * Element of the list of object numbers that zfs_lookup() maintains while
 * it resolves a path.
 */
struct obj_list {
        uint64_t                objnum;		/* object number */
        STAILQ_ENTRY(obj_list)  entry;		/* list linkage */
};
3291
3292 /*
3293  * Lookup a file and return its dnode.
3294  */
3295 static int
3296 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
3297 {
3298         int rc;
3299         uint64_t objnum;
3300         const spa_t *spa;
3301         dnode_phys_t dn;
3302         const char *p, *q;
3303         char element[256];
3304         char path[1024];
3305         int symlinks_followed = 0;
3306         struct stat sb;
3307         struct obj_list *entry, *tentry;
3308         STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
3309
3310         spa = mount->spa;
3311         if (mount->objset.os_type != DMU_OST_ZFS) {
3312                 printf("ZFS: unexpected object set type %ju\n",
3313                     (uintmax_t)mount->objset.os_type);
3314                 return (EIO);
3315         }
3316
3317         if ((entry = malloc(sizeof(struct obj_list))) == NULL)
3318                 return (ENOMEM);
3319
3320         /*
3321          * Get the root directory dnode.
3322          */
3323         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
3324         if (rc) {
3325                 free(entry);
3326                 return (rc);
3327         }
3328
3329         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum);
3330         if (rc) {
3331                 free(entry);
3332                 return (rc);
3333         }
3334         entry->objnum = objnum;
3335         STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3336
3337         rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3338         if (rc != 0)
3339                 goto done;
3340
3341         p = upath;
3342         while (p && *p) {
3343                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3344                 if (rc != 0)
3345                         goto done;
3346
3347                 while (*p == '/')
3348                         p++;
3349                 if (*p == '\0')
3350                         break;
3351                 q = p;
3352                 while (*q != '\0' && *q != '/')
3353                         q++;
3354
3355                 /* skip dot */
3356                 if (p + 1 == q && p[0] == '.') {
3357                         p++;
3358                         continue;
3359                 }
3360                 /* double dot */
3361                 if (p + 2 == q && p[0] == '.' && p[1] == '.') {
3362                         p += 2;
3363                         if (STAILQ_FIRST(&on_cache) ==
3364                             STAILQ_LAST(&on_cache, obj_list, entry)) {
3365                                 rc = ENOENT;
3366                                 goto done;
3367                         }
3368                         entry = STAILQ_FIRST(&on_cache);
3369                         STAILQ_REMOVE_HEAD(&on_cache, entry);
3370                         free(entry);
3371                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
3372                         continue;
3373                 }
3374                 if (q - p + 1 > sizeof(element)) {
3375                         rc = ENAMETOOLONG;
3376                         goto done;
3377                 }
3378                 memcpy(element, p, q - p);
3379                 element[q - p] = 0;
3380                 p = q;
3381
3382                 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
3383                         goto done;
3384                 if (!S_ISDIR(sb.st_mode)) {
3385                         rc = ENOTDIR;
3386                         goto done;
3387                 }
3388
3389                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
3390                 if (rc)
3391                         goto done;
3392                 objnum = ZFS_DIRENT_OBJ(objnum);
3393
3394                 if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
3395                         rc = ENOMEM;
3396                         goto done;
3397                 }
3398                 entry->objnum = objnum;
3399                 STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3400                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3401                 if (rc)
3402                         goto done;
3403
3404                 /*
3405                  * Check for symlink.
3406                  */
3407                 rc = zfs_dnode_stat(spa, &dn, &sb);
3408                 if (rc)
3409                         goto done;
3410                 if (S_ISLNK(sb.st_mode)) {
3411                         if (symlinks_followed > 10) {
3412                                 rc = EMLINK;
3413                                 goto done;
3414                         }
3415                         symlinks_followed++;
3416
3417                         /*
3418                          * Read the link value and copy the tail of our
3419                          * current path onto the end.
3420                          */
3421                         if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
3422                                 rc = ENAMETOOLONG;
3423                                 goto done;
3424                         }
3425                         strcpy(&path[sb.st_size], p);
3426
3427                         rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
3428                         if (rc != 0)
3429                                 goto done;
3430
3431                         /*
3432                          * Restart with the new path, starting either at
3433                          * the root or at the parent depending whether or
3434                          * not the link is relative.
3435                          */
3436                         p = path;
3437                         if (*p == '/') {
3438                                 while (STAILQ_FIRST(&on_cache) !=
3439                                     STAILQ_LAST(&on_cache, obj_list, entry)) {
3440                                         entry = STAILQ_FIRST(&on_cache);
3441                                         STAILQ_REMOVE_HEAD(&on_cache, entry);
3442                                         free(entry);
3443                                 }
3444                         } else {
3445                                 entry = STAILQ_FIRST(&on_cache);
3446                                 STAILQ_REMOVE_HEAD(&on_cache, entry);
3447                                 free(entry);
3448                         }
3449                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
3450                 }
3451         }
3452
3453         *dnode = dn;
3454 done:
3455         STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
3456                 free(entry);
3457         return (rc);
3458 }