]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - stand/libsa/zfs/zfsimpl.c
stand: Remove double words in source code comments
[FreeBSD/FreeBSD.git] / stand / libsa / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <stdbool.h>
35 #include <sys/endian.h>
36 #include <sys/stat.h>
37 #include <sys/stdint.h>
38 #include <sys/list.h>
39 #include <sys/zfs_bootenv.h>
40 #include <machine/_inttypes.h>
41
42 #include "zfsimpl.h"
43 #include "zfssubr.c"
44
45 #ifdef HAS_ZSTD_ZFS
46 extern int zstd_init(void);
47 #endif
48
49 struct zfsmount {
50         char                    *path;
51         const spa_t             *spa;
52         objset_phys_t           objset;
53         uint64_t                rootobj;
54         STAILQ_ENTRY(zfsmount)  next;
55 };
56
57 typedef STAILQ_HEAD(zfs_mnt_list, zfsmount) zfs_mnt_list_t;
58 static zfs_mnt_list_t zfsmount = STAILQ_HEAD_INITIALIZER(zfsmount);
59
60 /*
61  * The indirect_child_t represents the vdev that we will read from, when we
62  * need to read all copies of the data (e.g. for scrub or reconstruction).
63  * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
64  * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
65  * ic_vdev is a child of the mirror.
66  */
67 typedef struct indirect_child {
68         void *ic_data;
69         vdev_t *ic_vdev;
70 } indirect_child_t;
71
72 /*
73  * The indirect_split_t represents one mapped segment of an i/o to the
74  * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
75  * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
76  * For split blocks, there will be several of these.
77  */
78 typedef struct indirect_split {
79         list_node_t is_node; /* link on iv_splits */
80
81         /*
82          * is_split_offset is the offset into the i/o.
83          * This is the sum of the previous splits' is_size's.
84          */
85         uint64_t is_split_offset;
86
87         vdev_t *is_vdev; /* top-level vdev */
88         uint64_t is_target_offset; /* offset on is_vdev */
89         uint64_t is_size;
90         int is_children; /* number of entries in is_child[] */
91
92         /*
93          * is_good_child is the child that we are currently using to
94          * attempt reconstruction.
95          */
96         int is_good_child;
97
98         indirect_child_t is_child[1]; /* variable-length */
99 } indirect_split_t;
100
101 /*
102  * The indirect_vsd_t is associated with each i/o to the indirect vdev.
103  * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
104  */
105 typedef struct indirect_vsd {
106         boolean_t iv_split_block;
107         boolean_t iv_reconstruct;
108
109         list_t iv_splits; /* list of indirect_split_t's */
110 } indirect_vsd_t;
111
112 /*
113  * List of all vdevs, chained through v_alllink.
114  */
115 static vdev_list_t zfs_vdevs;
116
117 /*
118  * List of ZFS features supported for read
119  */
120 static const char *features_for_read[] = {
121         "com.datto:bookmark_v2",
122         "com.datto:encryption",
123         "com.datto:resilver_defer",
124         "com.delphix:bookmark_written",
125         "com.delphix:device_removal",
126         "com.delphix:embedded_data",
127         "com.delphix:extensible_dataset",
128         "com.delphix:head_errlog",
129         "com.delphix:hole_birth",
130         "com.delphix:obsolete_counts",
131         "com.delphix:spacemap_histogram",
132         "com.delphix:spacemap_v2",
133         "com.delphix:zpool_checkpoint",
134         "com.intel:allocation_classes",
135         "com.joyent:multi_vdev_crash_dump",
136         "org.freebsd:zstd_compress",
137         "org.illumos:lz4_compress",
138         "org.illumos:sha512",
139         "org.illumos:skein",
140         "org.open-zfs:large_blocks",
141         "org.openzfs:blake3",
142         "org.zfsonlinux:allocation_classes",
143         "org.zfsonlinux:large_dnode",
144         NULL
145 };
146
147 /*
148  * List of all pools, chained through spa_link.
149  */
150 static spa_list_t zfs_pools;
151
152 static const dnode_phys_t *dnode_cache_obj;
153 static uint64_t dnode_cache_bn;
154 static char *dnode_cache_buf;
155
156 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
157 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
158 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
159 static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
160     const char *name, uint64_t integer_size, uint64_t num_integers,
161     void *value);
162 static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
163     dnode_phys_t *);
164 static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
165     size_t);
166 static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
167     size_t);
168 static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
169 vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
170     uint64_t);
171 vdev_indirect_mapping_entry_phys_t *
172     vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
173     uint64_t, uint64_t *);
174
175 static void
176 zfs_init(void)
177 {
178         STAILQ_INIT(&zfs_vdevs);
179         STAILQ_INIT(&zfs_pools);
180
181         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
182
183         zfs_init_crc();
184 #ifdef HAS_ZSTD_ZFS
185         zstd_init();
186 #endif
187 }
188
189 static int
190 nvlist_check_features_for_read(nvlist_t *nvl)
191 {
192         nvlist_t *features = NULL;
193         nvs_data_t *data;
194         nvp_header_t *nvp;
195         nv_string_t *nvp_name;
196         int rc;
197
198         rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ,
199             DATA_TYPE_NVLIST, NULL, &features, NULL);
200         switch (rc) {
201         case 0:
202                 break;          /* Continue with checks */
203
204         case ENOENT:
205                 return (0);     /* All features are disabled */
206
207         default:
208                 return (rc);    /* Error while reading nvlist */
209         }
210
211         data = (nvs_data_t *)features->nv_data;
212         nvp = &data->nvl_pair;  /* first pair in nvlist */
213
214         while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
215                 int i, found;
216
217                 nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
218                 found = 0;
219
220                 for (i = 0; features_for_read[i] != NULL; i++) {
221                         if (memcmp(nvp_name->nv_data, features_for_read[i],
222                             nvp_name->nv_size) == 0) {
223                                 found = 1;
224                                 break;
225                         }
226                 }
227
228                 if (!found) {
229                         printf("ZFS: unsupported feature: %.*s\n",
230                             nvp_name->nv_size, nvp_name->nv_data);
231                         rc = EIO;
232                 }
233                 nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
234         }
235         nvlist_destroy(features);
236
237         return (rc);
238 }
239
240 static int
241 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
242     off_t offset, size_t size)
243 {
244         size_t psize;
245         int rc;
246
247         if (vdev->v_phys_read == NULL)
248                 return (ENOTSUP);
249
250         if (bp) {
251                 psize = BP_GET_PSIZE(bp);
252         } else {
253                 psize = size;
254         }
255
256         rc = vdev->v_phys_read(vdev, vdev->v_priv, offset, buf, psize);
257         if (rc == 0) {
258                 if (bp != NULL)
259                         rc = zio_checksum_verify(vdev->v_spa, bp, buf);
260         }
261
262         return (rc);
263 }
264
265 static int
266 vdev_write_phys(vdev_t *vdev, void *buf, off_t offset, size_t size)
267 {
268         if (vdev->v_phys_write == NULL)
269                 return (ENOTSUP);
270
271         return (vdev->v_phys_write(vdev, offset, buf, size));
272 }
273
274 typedef struct remap_segment {
275         vdev_t *rs_vd;
276         uint64_t rs_offset;
277         uint64_t rs_asize;
278         uint64_t rs_split_offset;
279         list_node_t rs_node;
280 } remap_segment_t;
281
282 static remap_segment_t *
283 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
284 {
285         remap_segment_t *rs = malloc(sizeof (remap_segment_t));
286
287         if (rs != NULL) {
288                 rs->rs_vd = vd;
289                 rs->rs_offset = offset;
290                 rs->rs_asize = asize;
291                 rs->rs_split_offset = split_offset;
292         }
293
294         return (rs);
295 }
296
297 vdev_indirect_mapping_t *
298 vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
299     uint64_t mapping_object)
300 {
301         vdev_indirect_mapping_t *vim;
302         vdev_indirect_mapping_phys_t *vim_phys;
303         int rc;
304
305         vim = calloc(1, sizeof (*vim));
306         if (vim == NULL)
307                 return (NULL);
308
309         vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
310         if (vim->vim_dn == NULL) {
311                 free(vim);
312                 return (NULL);
313         }
314
315         rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
316         if (rc != 0) {
317                 free(vim->vim_dn);
318                 free(vim);
319                 return (NULL);
320         }
321
322         vim->vim_spa = spa;
323         vim->vim_phys = malloc(sizeof (*vim->vim_phys));
324         if (vim->vim_phys == NULL) {
325                 free(vim->vim_dn);
326                 free(vim);
327                 return (NULL);
328         }
329
330         vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
331         *vim->vim_phys = *vim_phys;
332
333         vim->vim_objset = os;
334         vim->vim_object = mapping_object;
335         vim->vim_entries = NULL;
336
337         vim->vim_havecounts =
338             (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
339
340         return (vim);
341 }
342
343 /*
344  * Compare an offset with an indirect mapping entry; there are three
345  * possible scenarios:
346  *
347  *     1. The offset is "less than" the mapping entry; meaning the
348  *        offset is less than the source offset of the mapping entry. In
349  *        this case, there is no overlap between the offset and the
350  *        mapping entry and -1 will be returned.
351  *
352  *     2. The offset is "greater than" the mapping entry; meaning the
353  *        offset is greater than the mapping entry's source offset plus
354  *        the entry's size. In this case, there is no overlap between
355  *        the offset and the mapping entry and 1 will be returned.
356  *
357  *        NOTE: If the offset is actually equal to the entry's offset
358  *        plus size, this is considered to be "greater" than the entry,
359  *        and this case applies (i.e. 1 will be returned). Thus, the
360  *        entry's "range" can be considered to be inclusive at its
361  *        start, but exclusive at its end: e.g. [src, src + size).
362  *
363  *     3. The last case to consider is if the offset actually falls
364  *        within the mapping entry's range. If this is the case, the
365  *        offset is considered to be "equal to" the mapping entry and
366  *        0 will be returned.
367  *
368  *        NOTE: If the offset is equal to the entry's source offset,
369  *        this case applies and 0 will be returned. If the offset is
370  *        equal to the entry's source plus its size, this case does
371  *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
372  *        returned.
373  */
374 static int
375 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
376 {
377         const uint64_t *key = v_key;
378         const vdev_indirect_mapping_entry_phys_t *array_elem =
379             v_array_elem;
380         uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
381
382         if (*key < src_offset) {
383                 return (-1);
384         } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
385                 return (0);
386         } else {
387                 return (1);
388         }
389 }
390
391 /*
392  * Return array entry.
393  */
394 static vdev_indirect_mapping_entry_phys_t *
395 vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
396 {
397         uint64_t size;
398         off_t offset = 0;
399         int rc;
400
401         if (vim->vim_phys->vimp_num_entries == 0)
402                 return (NULL);
403
404         if (vim->vim_entries == NULL) {
405                 uint64_t bsize;
406
407                 bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
408                 size = vim->vim_phys->vimp_num_entries *
409                     sizeof (*vim->vim_entries);
410                 if (size > bsize) {
411                         size = bsize / sizeof (*vim->vim_entries);
412                         size *= sizeof (*vim->vim_entries);
413                 }
414                 vim->vim_entries = malloc(size);
415                 if (vim->vim_entries == NULL)
416                         return (NULL);
417                 vim->vim_num_entries = size / sizeof (*vim->vim_entries);
418                 offset = index * sizeof (*vim->vim_entries);
419         }
420
421         /* We have data in vim_entries */
422         if (offset == 0) {
423                 if (index >= vim->vim_entry_offset &&
424                     index <= vim->vim_entry_offset + vim->vim_num_entries) {
425                         index -= vim->vim_entry_offset;
426                         return (&vim->vim_entries[index]);
427                 }
428                 offset = index * sizeof (*vim->vim_entries);
429         }
430
431         vim->vim_entry_offset = index;
432         size = vim->vim_num_entries * sizeof (*vim->vim_entries);
433         rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
434             size);
435         if (rc != 0) {
436                 /* Read error, invalidate vim_entries. */
437                 free(vim->vim_entries);
438                 vim->vim_entries = NULL;
439                 return (NULL);
440         }
441         index -= vim->vim_entry_offset;
442         return (&vim->vim_entries[index]);
443 }
444
445 /*
446  * Returns the mapping entry for the given offset.
447  *
448  * It's possible that the given offset will not be in the mapping table
449  * (i.e. no mapping entries contain this offset), in which case, the
450  * return value depends on the "next_if_missing" parameter.
451  *
452  * If the offset is not found in the table and "next_if_missing" is
453  * B_FALSE, then NULL will always be returned. The behavior is intended
454  * to allow consumers to get the entry corresponding to the offset
455  * parameter, iff the offset overlaps with an entry in the table.
456  *
457  * If the offset is not found in the table and "next_if_missing" is
458  * B_TRUE, then the entry nearest to the given offset will be returned,
459  * such that the entry's source offset is greater than the offset
460  * passed in (i.e. the "next" mapping entry in the table is returned, if
461  * the offset is missing from the table). If there are no entries whose
462  * source offset is greater than the passed in offset, NULL is returned.
463  */
464 static vdev_indirect_mapping_entry_phys_t *
465 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
466     uint64_t offset)
467 {
468         ASSERT(vim->vim_phys->vimp_num_entries > 0);
469
470         vdev_indirect_mapping_entry_phys_t *entry;
471
472         uint64_t last = vim->vim_phys->vimp_num_entries - 1;
473         uint64_t base = 0;
474
475         /*
476          * We don't define these inside of the while loop because we use
477          * their value in the case that offset isn't in the mapping.
478          */
479         uint64_t mid;
480         int result;
481
482         while (last >= base) {
483                 mid = base + ((last - base) >> 1);
484
485                 entry = vdev_indirect_mapping_entry(vim, mid);
486                 if (entry == NULL)
487                         break;
488                 result = dva_mapping_overlap_compare(&offset, entry);
489
490                 if (result == 0) {
491                         break;
492                 } else if (result < 0) {
493                         last = mid - 1;
494                 } else {
495                         base = mid + 1;
496                 }
497         }
498         return (entry);
499 }
500
501 /*
502  * Given an indirect vdev and an extent on that vdev, it duplicates the
503  * physical entries of the indirect mapping that correspond to the extent
504  * to a new array and returns a pointer to it. In addition, copied_entries
505  * is populated with the number of mapping entries that were duplicated.
506  *
507  * Finally, since we are doing an allocation, it is up to the caller to
508  * free the array allocated in this function.
509  */
510 vdev_indirect_mapping_entry_phys_t *
511 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
512     uint64_t asize, uint64_t *copied_entries)
513 {
514         vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
515         vdev_indirect_mapping_t *vim = vd->v_mapping;
516         uint64_t entries = 0;
517
518         vdev_indirect_mapping_entry_phys_t *first_mapping =
519             vdev_indirect_mapping_entry_for_offset(vim, offset);
520         ASSERT3P(first_mapping, !=, NULL);
521
522         vdev_indirect_mapping_entry_phys_t *m = first_mapping;
523         while (asize > 0) {
524                 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
525                 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
526                 uint64_t inner_size = MIN(asize, size - inner_offset);
527
528                 offset += inner_size;
529                 asize -= inner_size;
530                 entries++;
531                 m++;
532         }
533
534         size_t copy_length = entries * sizeof (*first_mapping);
535         duplicate_mappings = malloc(copy_length);
536         if (duplicate_mappings != NULL)
537                 bcopy(first_mapping, duplicate_mappings, copy_length);
538         else
539                 entries = 0;
540
541         *copied_entries = entries;
542
543         return (duplicate_mappings);
544 }
545
546 static vdev_t *
547 vdev_lookup_top(spa_t *spa, uint64_t vdev)
548 {
549         vdev_t *rvd;
550         vdev_list_t *vlist;
551
552         vlist = &spa->spa_root_vdev->v_children;
553         STAILQ_FOREACH(rvd, vlist, v_childlink)
554                 if (rvd->v_id == vdev)
555                         break;
556
557         return (rvd);
558 }
559
560 /*
561  * This is a callback for vdev_indirect_remap() which allocates an
562  * indirect_split_t for each split segment and adds it to iv_splits.
563  */
564 static void
565 vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
566     uint64_t size, void *arg)
567 {
568         int n = 1;
569         zio_t *zio = arg;
570         indirect_vsd_t *iv = zio->io_vsd;
571
572         if (vd->v_read == vdev_indirect_read)
573                 return;
574
575         if (vd->v_read == vdev_mirror_read)
576                 n = vd->v_nchildren;
577
578         indirect_split_t *is =
579             malloc(offsetof(indirect_split_t, is_child[n]));
580         if (is == NULL) {
581                 zio->io_error = ENOMEM;
582                 return;
583         }
584         bzero(is, offsetof(indirect_split_t, is_child[n]));
585
586         is->is_children = n;
587         is->is_size = size;
588         is->is_split_offset = split_offset;
589         is->is_target_offset = offset;
590         is->is_vdev = vd;
591
592         /*
593          * Note that we only consider multiple copies of the data for
594          * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
595          * though they use the same ops as mirror, because there's only one
596          * "good" copy under the replacing/spare.
597          */
598         if (vd->v_read == vdev_mirror_read) {
599                 int i = 0;
600                 vdev_t *kid;
601
602                 STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
603                         is->is_child[i++].ic_vdev = kid;
604                 }
605         } else {
606                 is->is_child[0].ic_vdev = vd;
607         }
608
609         list_insert_tail(&iv->iv_splits, is);
610 }
611
612 static void
613 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
614 {
615         list_t stack;
616         spa_t *spa = vd->v_spa;
617         zio_t *zio = arg;
618         remap_segment_t *rs;
619
620         list_create(&stack, sizeof (remap_segment_t),
621             offsetof(remap_segment_t, rs_node));
622
623         rs = rs_alloc(vd, offset, asize, 0);
624         if (rs == NULL) {
625                 printf("vdev_indirect_remap: out of memory.\n");
626                 zio->io_error = ENOMEM;
627         }
628         for (; rs != NULL; rs = list_remove_head(&stack)) {
629                 vdev_t *v = rs->rs_vd;
630                 uint64_t num_entries = 0;
631                 /* vdev_indirect_mapping_t *vim = v->v_mapping; */
632                 vdev_indirect_mapping_entry_phys_t *mapping =
633                     vdev_indirect_mapping_duplicate_adjacent_entries(v,
634                     rs->rs_offset, rs->rs_asize, &num_entries);
635
636                 if (num_entries == 0)
637                         zio->io_error = ENOMEM;
638
639                 for (uint64_t i = 0; i < num_entries; i++) {
640                         vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
641                         uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
642                         uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
643                         uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
644                         uint64_t inner_offset = rs->rs_offset -
645                             DVA_MAPPING_GET_SRC_OFFSET(m);
646                         uint64_t inner_size =
647                             MIN(rs->rs_asize, size - inner_offset);
648                         vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
649
650                         if (dst_v->v_read == vdev_indirect_read) {
651                                 remap_segment_t *o;
652
653                                 o = rs_alloc(dst_v, dst_offset + inner_offset,
654                                     inner_size, rs->rs_split_offset);
655                                 if (o == NULL) {
656                                         printf("vdev_indirect_remap: "
657                                             "out of memory.\n");
658                                         zio->io_error = ENOMEM;
659                                         break;
660                                 }
661
662                                 list_insert_head(&stack, o);
663                         }
664                         vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
665                             dst_offset + inner_offset,
666                             inner_size, arg);
667
668                         /*
669                          * vdev_indirect_gather_splits can have memory
670                          * allocation error, we can not recover from it.
671                          */
672                         if (zio->io_error != 0)
673                                 break;
674                         rs->rs_offset += inner_size;
675                         rs->rs_asize -= inner_size;
676                         rs->rs_split_offset += inner_size;
677                 }
678
679                 free(mapping);
680                 free(rs);
681                 if (zio->io_error != 0)
682                         break;
683         }
684
685         list_destroy(&stack);
686 }
687
688 static void
689 vdev_indirect_map_free(zio_t *zio)
690 {
691         indirect_vsd_t *iv = zio->io_vsd;
692         indirect_split_t *is;
693
694         while ((is = list_head(&iv->iv_splits)) != NULL) {
695                 for (int c = 0; c < is->is_children; c++) {
696                         indirect_child_t *ic = &is->is_child[c];
697                         free(ic->ic_data);
698                 }
699                 list_remove(&iv->iv_splits, is);
700                 free(is);
701         }
702         free(iv);
703 }
704
705 static int
706 vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
707     off_t offset, size_t bytes)
708 {
709         zio_t zio;
710         spa_t *spa = vdev->v_spa;
711         indirect_vsd_t *iv;
712         indirect_split_t *first;
713         int rc = EIO;
714
715         iv = calloc(1, sizeof(*iv));
716         if (iv == NULL)
717                 return (ENOMEM);
718
719         list_create(&iv->iv_splits,
720             sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
721
722         bzero(&zio, sizeof(zio));
723         zio.io_spa = spa;
724         zio.io_bp = (blkptr_t *)bp;
725         zio.io_data = buf;
726         zio.io_size = bytes;
727         zio.io_offset = offset;
728         zio.io_vd = vdev;
729         zio.io_vsd = iv;
730
731         if (vdev->v_mapping == NULL) {
732                 vdev_indirect_config_t *vic;
733
734                 vic = &vdev->vdev_indirect_config;
735                 vdev->v_mapping = vdev_indirect_mapping_open(spa,
736                     spa->spa_mos, vic->vic_mapping_object);
737         }
738
739         vdev_indirect_remap(vdev, offset, bytes, &zio);
740         if (zio.io_error != 0)
741                 return (zio.io_error);
742
743         first = list_head(&iv->iv_splits);
744         if (first->is_size == zio.io_size) {
745                 /*
746                  * This is not a split block; we are pointing to the entire
747                  * data, which will checksum the same as the original data.
748                  * Pass the BP down so that the child i/o can verify the
749                  * checksum, and try a different location if available
750                  * (e.g. on a mirror).
751                  *
752                  * While this special case could be handled the same as the
753                  * general (split block) case, doing it this way ensures
754                  * that the vast majority of blocks on indirect vdevs
755                  * (which are not split) are handled identically to blocks
756                  * on non-indirect vdevs.  This allows us to be less strict
757                  * about performance in the general (but rare) case.
758                  */
759                 rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
760                     zio.io_data, first->is_target_offset, bytes);
761         } else {
762                 iv->iv_split_block = B_TRUE;
763                 /*
764                  * Read one copy of each split segment, from the
765                  * top-level vdev.  Since we don't know the
766                  * checksum of each split individually, the child
767                  * zio can't ensure that we get the right data.
768                  * E.g. if it's a mirror, it will just read from a
769                  * random (healthy) leaf vdev.  We have to verify
770                  * the checksum in vdev_indirect_io_done().
771                  */
772                 for (indirect_split_t *is = list_head(&iv->iv_splits);
773                     is != NULL; is = list_next(&iv->iv_splits, is)) {
774                         char *ptr = zio.io_data;
775
776                         rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
777                             ptr + is->is_split_offset, is->is_target_offset,
778                             is->is_size);
779                 }
780                 if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
781                         rc = ECKSUM;
782                 else
783                         rc = 0;
784         }
785
786         vdev_indirect_map_free(&zio);
787         if (rc == 0)
788                 rc = zio.io_error;
789
790         return (rc);
791 }
792
793 static int
794 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
795     off_t offset, size_t bytes)
796 {
797
798         return (vdev_read_phys(vdev, bp, buf,
799             offset + VDEV_LABEL_START_SIZE, bytes));
800 }
801
802 static int
803 vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused,
804     void *buf __unused, off_t offset __unused, size_t bytes __unused)
805 {
806
807         return (ENOTSUP);
808 }
809
810 static int
811 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
812     off_t offset, size_t bytes)
813 {
814         vdev_t *kid;
815         int rc;
816
817         rc = EIO;
818         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
819                 if (kid->v_state != VDEV_STATE_HEALTHY)
820                         continue;
821                 rc = kid->v_read(kid, bp, buf, offset, bytes);
822                 if (!rc)
823                         return (0);
824         }
825
826         return (rc);
827 }
828
829 static int
830 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
831     off_t offset, size_t bytes)
832 {
833         vdev_t *kid;
834
835         /*
836          * Here we should have two kids:
837          * First one which is the one we are replacing and we can trust
838          * only this one to have valid data, but it might not be present.
839          * Second one is that one we are replacing with. It is most likely
840          * healthy, but we can't trust it has needed data, so we won't use it.
841          */
842         kid = STAILQ_FIRST(&vdev->v_children);
843         if (kid == NULL)
844                 return (EIO);
845         if (kid->v_state != VDEV_STATE_HEALTHY)
846                 return (EIO);
847         return (kid->v_read(kid, bp, buf, offset, bytes));
848 }
849
850 static vdev_t *
851 vdev_find(uint64_t guid)
852 {
853         vdev_t *vdev;
854
855         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
856                 if (vdev->v_guid == guid)
857                         return (vdev);
858
859         return (0);
860 }
861
862 static vdev_t *
863 vdev_create(uint64_t guid, vdev_read_t *_read)
864 {
865         vdev_t *vdev;
866         vdev_indirect_config_t *vic;
867
868         vdev = calloc(1, sizeof(vdev_t));
869         if (vdev != NULL) {
870                 STAILQ_INIT(&vdev->v_children);
871                 vdev->v_guid = guid;
872                 vdev->v_read = _read;
873
874                 /*
875                  * root vdev has no read function, we use this fact to
876                  * skip setting up data we do not need for root vdev.
877                  * We only point root vdev from spa.
878                  */
879                 if (_read != NULL) {
880                         vic = &vdev->vdev_indirect_config;
881                         vic->vic_prev_indirect_vdev = UINT64_MAX;
882                         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
883                 }
884         }
885
886         return (vdev);
887 }
888
889 static void
890 vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist)
891 {
892         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
893         uint64_t is_log;
894
895         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
896         is_log = 0;
897         (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
898             &is_offline, NULL);
899         (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
900             &is_removed, NULL);
901         (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
902             &is_faulted, NULL);
903         (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
904             NULL, &is_degraded, NULL);
905         (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
906             NULL, &isnt_present, NULL);
907         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
908             &is_log, NULL);
909
910         if (is_offline != 0)
911                 vdev->v_state = VDEV_STATE_OFFLINE;
912         else if (is_removed != 0)
913                 vdev->v_state = VDEV_STATE_REMOVED;
914         else if (is_faulted != 0)
915                 vdev->v_state = VDEV_STATE_FAULTED;
916         else if (is_degraded != 0)
917                 vdev->v_state = VDEV_STATE_DEGRADED;
918         else if (isnt_present != 0)
919                 vdev->v_state = VDEV_STATE_CANT_OPEN;
920
921         vdev->v_islog = is_log != 0;
922 }
923
924 static int
925 vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
926 {
927         uint64_t id, ashift, asize, nparity;
928         const char *path;
929         const char *type;
930         int len, pathlen;
931         char *name;
932         vdev_t *vdev;
933
934         if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id,
935             NULL) ||
936             nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL,
937             &type, &len)) {
938                 return (ENOENT);
939         }
940
941         if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
942             memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
943 #ifdef ZFS_TEST
944             memcmp(type, VDEV_TYPE_FILE, len) != 0 &&
945 #endif
946             memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 &&
947             memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 &&
948             memcmp(type, VDEV_TYPE_REPLACING, len) != 0 &&
949             memcmp(type, VDEV_TYPE_HOLE, len) != 0) {
950                 printf("ZFS: can only boot from disk, mirror, raidz1, "
951                     "raidz2 and raidz3 vdevs, got: %.*s\n", len, type);
952                 return (EIO);
953         }
954
955         if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0)
956                 vdev = vdev_create(guid, vdev_mirror_read);
957         else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0)
958                 vdev = vdev_create(guid, vdev_raidz_read);
959         else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0)
960                 vdev = vdev_create(guid, vdev_replacing_read);
961         else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) {
962                 vdev_indirect_config_t *vic;
963
964                 vdev = vdev_create(guid, vdev_indirect_read);
965                 if (vdev != NULL) {
966                         vdev->v_state = VDEV_STATE_HEALTHY;
967                         vic = &vdev->vdev_indirect_config;
968
969                         nvlist_find(nvlist,
970                             ZPOOL_CONFIG_INDIRECT_OBJECT,
971                             DATA_TYPE_UINT64,
972                             NULL, &vic->vic_mapping_object, NULL);
973                         nvlist_find(nvlist,
974                             ZPOOL_CONFIG_INDIRECT_BIRTHS,
975                             DATA_TYPE_UINT64,
976                             NULL, &vic->vic_births_object, NULL);
977                         nvlist_find(nvlist,
978                             ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
979                             DATA_TYPE_UINT64,
980                             NULL, &vic->vic_prev_indirect_vdev, NULL);
981                 }
982         } else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) {
983                 vdev = vdev_create(guid, vdev_missing_read);
984         } else {
985                 vdev = vdev_create(guid, vdev_disk_read);
986         }
987
988         if (vdev == NULL)
989                 return (ENOMEM);
990
991         vdev_set_initial_state(vdev, nvlist);
992         vdev->v_id = id;
993         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
994             DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0)
995                 vdev->v_ashift = ashift;
996
997         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
998             DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
999                 vdev->v_psize = asize +
1000                     VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
1001         }
1002
1003         if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
1004             DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0)
1005                 vdev->v_nparity = nparity;
1006
1007         if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
1008             DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) {
1009                 char prefix[] = "/dev/";
1010
1011                 len = strlen(prefix);
1012                 if (len < pathlen && memcmp(path, prefix, len) == 0) {
1013                         path += len;
1014                         pathlen -= len;
1015                 }
1016                 name = malloc(pathlen + 1);
1017                 bcopy(path, name, pathlen);
1018                 name[pathlen] = '\0';
1019                 vdev->v_name = name;
1020         } else {
1021                 name = NULL;
1022                 if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
1023                         if (vdev->v_nparity < 1 ||
1024                             vdev->v_nparity > 3) {
1025                                 printf("ZFS: invalid raidz parity: %d\n",
1026                                     vdev->v_nparity);
1027                                 return (EIO);
1028                         }
1029                         (void) asprintf(&name, "%.*s%d-%" PRIu64, len, type,
1030                             vdev->v_nparity, id);
1031                 } else {
1032                         (void) asprintf(&name, "%.*s-%" PRIu64, len, type, id);
1033                 }
1034                 vdev->v_name = name;
1035         }
1036         *vdevp = vdev;
1037         return (0);
1038 }
1039
1040 /*
1041  * Find slot for vdev. We return either NULL to signal to use
1042  * STAILQ_INSERT_HEAD, or we return link element to be used with
1043  * STAILQ_INSERT_AFTER.
1044  */
1045 static vdev_t *
1046 vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
1047 {
1048         vdev_t *v, *previous;
1049
1050         if (STAILQ_EMPTY(&top_vdev->v_children))
1051                 return (NULL);
1052
1053         previous = NULL;
1054         STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
1055                 if (v->v_id > vdev->v_id)
1056                         return (previous);
1057
1058                 if (v->v_id == vdev->v_id)
1059                         return (v);
1060
1061                 if (v->v_id < vdev->v_id)
1062                         previous = v;
1063         }
1064         return (previous);
1065 }
1066
1067 static size_t
1068 vdev_child_count(vdev_t *vdev)
1069 {
1070         vdev_t *v;
1071         size_t count;
1072
1073         count = 0;
1074         STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
1075                 count++;
1076         }
1077         return (count);
1078 }
1079
1080 /*
1081  * Insert vdev into top_vdev children list. List is ordered by v_id.
1082  */
1083 static void
1084 vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
1085 {
1086         vdev_t *previous;
1087         size_t count;
1088
1089         /*
1090          * The top level vdev can appear in random order, depending how
1091          * the firmware is presenting the disk devices.
1092          * However, we will insert vdev to create list ordered by v_id,
1093          * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
1094          * as STAILQ does not have insert before.
1095          */
1096         previous = vdev_find_previous(top_vdev, vdev);
1097
1098         if (previous == NULL) {
1099                 STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
1100         } else if (previous->v_id == vdev->v_id) {
1101                 /*
1102                  * This vdev was configured from label config,
1103                  * do not insert duplicate.
1104                  */
1105                 return;
1106         } else {
1107                 STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
1108                     v_childlink);
1109         }
1110
1111         count = vdev_child_count(top_vdev);
1112         if (top_vdev->v_nchildren < count)
1113                 top_vdev->v_nchildren = count;
1114 }
1115
1116 static int
1117 vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
1118 {
1119         vdev_t *top_vdev, *vdev;
1120         nvlist_t **kids = NULL;
1121         int rc, nkids;
1122
1123         /* Get top vdev. */
1124         top_vdev = vdev_find(top_guid);
1125         if (top_vdev == NULL) {
1126                 rc = vdev_init(top_guid, nvlist, &top_vdev);
1127                 if (rc != 0)
1128                         return (rc);
1129                 top_vdev->v_spa = spa;
1130                 top_vdev->v_top = top_vdev;
1131                 vdev_insert(spa->spa_root_vdev, top_vdev);
1132         }
1133
1134         /* Add children if there are any. */
1135         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1136             &nkids, &kids, NULL);
1137         if (rc == 0) {
1138                 for (int i = 0; i < nkids; i++) {
1139                         uint64_t guid;
1140
1141                         rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
1142                             DATA_TYPE_UINT64, NULL, &guid, NULL);
1143                         if (rc != 0)
1144                                 goto done;
1145
1146                         rc = vdev_init(guid, kids[i], &vdev);
1147                         if (rc != 0)
1148                                 goto done;
1149
1150                         vdev->v_spa = spa;
1151                         vdev->v_top = top_vdev;
1152                         vdev_insert(top_vdev, vdev);
1153                 }
1154         } else {
1155                 /*
1156                  * When there are no children, nvlist_find() does return
1157                  * error, reset it because leaf devices have no children.
1158                  */
1159                 rc = 0;
1160         }
1161 done:
1162         if (kids != NULL) {
1163                 for (int i = 0; i < nkids; i++)
1164                         nvlist_destroy(kids[i]);
1165                 free(kids);
1166         }
1167
1168         return (rc);
1169 }
1170
1171 static int
1172 vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
1173 {
1174         uint64_t pool_guid, top_guid;
1175         nvlist_t *vdevs;
1176         int rc;
1177
1178         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1179             NULL, &pool_guid, NULL) ||
1180             nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
1181             NULL, &top_guid, NULL) ||
1182             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1183             NULL, &vdevs, NULL)) {
1184                 printf("ZFS: can't find vdev details\n");
1185                 return (ENOENT);
1186         }
1187
1188         rc = vdev_from_nvlist(spa, top_guid, vdevs);
1189         nvlist_destroy(vdevs);
1190         return (rc);
1191 }
1192
1193 static void
1194 vdev_set_state(vdev_t *vdev)
1195 {
1196         vdev_t *kid;
1197         int good_kids;
1198         int bad_kids;
1199
1200         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1201                 vdev_set_state(kid);
1202         }
1203
1204         /*
1205          * A mirror or raidz is healthy if all its kids are healthy. A
1206          * mirror is degraded if any of its kids is healthy; a raidz
1207          * is degraded if at most nparity kids are offline.
1208          */
1209         if (STAILQ_FIRST(&vdev->v_children)) {
1210                 good_kids = 0;
1211                 bad_kids = 0;
1212                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1213                         if (kid->v_state == VDEV_STATE_HEALTHY)
1214                                 good_kids++;
1215                         else
1216                                 bad_kids++;
1217                 }
1218                 if (bad_kids == 0) {
1219                         vdev->v_state = VDEV_STATE_HEALTHY;
1220                 } else {
1221                         if (vdev->v_read == vdev_mirror_read) {
1222                                 if (good_kids) {
1223                                         vdev->v_state = VDEV_STATE_DEGRADED;
1224                                 } else {
1225                                         vdev->v_state = VDEV_STATE_OFFLINE;
1226                                 }
1227                         } else if (vdev->v_read == vdev_raidz_read) {
1228                                 if (bad_kids > vdev->v_nparity) {
1229                                         vdev->v_state = VDEV_STATE_OFFLINE;
1230                                 } else {
1231                                         vdev->v_state = VDEV_STATE_DEGRADED;
1232                                 }
1233                         }
1234                 }
1235         }
1236 }
1237
1238 static int
1239 vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
1240 {
1241         vdev_t *vdev;
1242         nvlist_t **kids = NULL;
1243         int rc, nkids;
1244
1245         /* Update top vdev. */
1246         vdev = vdev_find(top_guid);
1247         if (vdev != NULL)
1248                 vdev_set_initial_state(vdev, nvlist);
1249
1250         /* Update children if there are any. */
1251         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1252             &nkids, &kids, NULL);
1253         if (rc == 0) {
1254                 for (int i = 0; i < nkids; i++) {
1255                         uint64_t guid;
1256
1257                         rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
1258                             DATA_TYPE_UINT64, NULL, &guid, NULL);
1259                         if (rc != 0)
1260                                 break;
1261
1262                         vdev = vdev_find(guid);
1263                         if (vdev != NULL)
1264                                 vdev_set_initial_state(vdev, kids[i]);
1265                 }
1266         } else {
1267                 rc = 0;
1268         }
1269         if (kids != NULL) {
1270                 for (int i = 0; i < nkids; i++)
1271                         nvlist_destroy(kids[i]);
1272                 free(kids);
1273         }
1274
1275         return (rc);
1276 }
1277
1278 static int
1279 vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
1280 {
1281         uint64_t pool_guid, vdev_children;
1282         nvlist_t *vdevs = NULL, **kids = NULL;
1283         int rc, nkids;
1284
1285         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1286             NULL, &pool_guid, NULL) ||
1287             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
1288             NULL, &vdev_children, NULL) ||
1289             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1290             NULL, &vdevs, NULL)) {
1291                 printf("ZFS: can't find vdev details\n");
1292                 return (ENOENT);
1293         }
1294
1295         /* Wrong guid?! */
1296         if (spa->spa_guid != pool_guid) {
1297                 nvlist_destroy(vdevs);
1298                 return (EINVAL);
1299         }
1300
1301         spa->spa_root_vdev->v_nchildren = vdev_children;
1302
1303         rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1304             &nkids, &kids, NULL);
1305         nvlist_destroy(vdevs);
1306
1307         /*
1308          * MOS config has at least one child for root vdev.
1309          */
1310         if (rc != 0)
1311                 return (rc);
1312
1313         for (int i = 0; i < nkids; i++) {
1314                 uint64_t guid;
1315                 vdev_t *vdev;
1316
1317                 rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1318                     NULL, &guid, NULL);
1319                 if (rc != 0)
1320                         break;
1321                 vdev = vdev_find(guid);
1322                 /*
1323                  * Top level vdev is missing, create it.
1324                  */
1325                 if (vdev == NULL)
1326                         rc = vdev_from_nvlist(spa, guid, kids[i]);
1327                 else
1328                         rc = vdev_update_from_nvlist(guid, kids[i]);
1329                 if (rc != 0)
1330                         break;
1331         }
1332         if (kids != NULL) {
1333                 for (int i = 0; i < nkids; i++)
1334                         nvlist_destroy(kids[i]);
1335                 free(kids);
1336         }
1337
1338         /*
1339          * Re-evaluate top-level vdev state.
1340          */
1341         vdev_set_state(spa->spa_root_vdev);
1342
1343         return (rc);
1344 }
1345
1346 static spa_t *
1347 spa_find_by_guid(uint64_t guid)
1348 {
1349         spa_t *spa;
1350
1351         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
1352                 if (spa->spa_guid == guid)
1353                         return (spa);
1354
1355         return (NULL);
1356 }
1357
1358 static spa_t *
1359 spa_find_by_name(const char *name)
1360 {
1361         spa_t *spa;
1362
1363         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
1364                 if (strcmp(spa->spa_name, name) == 0)
1365                         return (spa);
1366
1367         return (NULL);
1368 }
1369
1370 static spa_t *
1371 spa_find_by_dev(struct zfs_devdesc *dev)
1372 {
1373
1374         if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1375                 return (NULL);
1376
1377         if (dev->pool_guid == 0)
1378                 return (STAILQ_FIRST(&zfs_pools));
1379
1380         return (spa_find_by_guid(dev->pool_guid));
1381 }
1382
1383 static spa_t *
1384 spa_create(uint64_t guid, const char *name)
1385 {
1386         spa_t *spa;
1387
1388         if ((spa = calloc(1, sizeof(spa_t))) == NULL)
1389                 return (NULL);
1390         if ((spa->spa_name = strdup(name)) == NULL) {
1391                 free(spa);
1392                 return (NULL);
1393         }
1394         spa->spa_uberblock = &spa->spa_uberblock_master;
1395         spa->spa_mos = &spa->spa_mos_master;
1396         spa->spa_guid = guid;
1397         spa->spa_root_vdev = vdev_create(guid, NULL);
1398         if (spa->spa_root_vdev == NULL) {
1399                 free(spa->spa_name);
1400                 free(spa);
1401                 return (NULL);
1402         }
1403         spa->spa_root_vdev->v_name = strdup("root");
1404         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
1405
1406         return (spa);
1407 }
1408
1409 static const char *
1410 state_name(vdev_state_t state)
1411 {
1412         static const char *names[] = {
1413                 "UNKNOWN",
1414                 "CLOSED",
1415                 "OFFLINE",
1416                 "REMOVED",
1417                 "CANT_OPEN",
1418                 "FAULTED",
1419                 "DEGRADED",
1420                 "ONLINE"
1421         };
1422         return (names[state]);
1423 }
1424
#ifndef BOOT2

/*
 * printf-style front end for pager_output().
 *
 * The message is formatted into an 80-byte buffer, so longer output is
 * truncated by vsnprintf(). Returns whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
        char text[80];
        va_list ap;

        va_start(ap, fmt);
        vsnprintf(text, sizeof(text), fmt, ap);
        va_end(ap);

        return (pager_output(text));
}

#else

/* In BOOT2 builds pager_printf is plain printf. */
#define pager_printf printf

#endif
1444
1445 #define STATUS_FORMAT   "        %s %s\n"
1446
1447 static int
1448 print_state(int indent, const char *name, vdev_state_t state)
1449 {
1450         int i;
1451         char buf[512];
1452
1453         buf[0] = 0;
1454         for (i = 0; i < indent; i++)
1455                 strcat(buf, "  ");
1456         strcat(buf, name);
1457         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
1458 }
1459
1460 static int
1461 vdev_status(vdev_t *vdev, int indent)
1462 {
1463         vdev_t *kid;
1464         int ret;
1465
1466         if (vdev->v_islog) {
1467                 (void) pager_output("        logs\n");
1468                 indent++;
1469         }
1470
1471         ret = print_state(indent, vdev->v_name, vdev->v_state);
1472         if (ret != 0)
1473                 return (ret);
1474
1475         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1476                 ret = vdev_status(kid, indent + 1);
1477                 if (ret != 0)
1478                         return (ret);
1479         }
1480         return (ret);
1481 }
1482
1483 static int
1484 spa_status(spa_t *spa)
1485 {
1486         static char bootfs[ZFS_MAXNAMELEN];
1487         uint64_t rootid;
1488         vdev_list_t *vlist;
1489         vdev_t *vdev;
1490         int good_kids, bad_kids, degraded_kids, ret;
1491         vdev_state_t state;
1492
1493         ret = pager_printf("  pool: %s\n", spa->spa_name);
1494         if (ret != 0)
1495                 return (ret);
1496
1497         if (zfs_get_root(spa, &rootid) == 0 &&
1498             zfs_rlookup(spa, rootid, bootfs) == 0) {
1499                 if (bootfs[0] == '\0')
1500                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
1501                 else
1502                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
1503                             bootfs);
1504                 if (ret != 0)
1505                         return (ret);
1506         }
1507         ret = pager_printf("config:\n\n");
1508         if (ret != 0)
1509                 return (ret);
1510         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
1511         if (ret != 0)
1512                 return (ret);
1513
1514         good_kids = 0;
1515         degraded_kids = 0;
1516         bad_kids = 0;
1517         vlist = &spa->spa_root_vdev->v_children;
1518         STAILQ_FOREACH(vdev, vlist, v_childlink) {
1519                 if (vdev->v_state == VDEV_STATE_HEALTHY)
1520                         good_kids++;
1521                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
1522                         degraded_kids++;
1523                 else
1524                         bad_kids++;
1525         }
1526
1527         state = VDEV_STATE_CLOSED;
1528         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
1529                 state = VDEV_STATE_HEALTHY;
1530         else if ((good_kids + degraded_kids) > 0)
1531                 state = VDEV_STATE_DEGRADED;
1532
1533         ret = print_state(0, spa->spa_name, state);
1534         if (ret != 0)
1535                 return (ret);
1536
1537         STAILQ_FOREACH(vdev, vlist, v_childlink) {
1538                 ret = vdev_status(vdev, 1);
1539                 if (ret != 0)
1540                         return (ret);
1541         }
1542         return (ret);
1543 }
1544
1545 static int
1546 spa_all_status(void)
1547 {
1548         spa_t *spa;
1549         int first = 1, ret = 0;
1550
1551         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1552                 if (!first) {
1553                         ret = pager_printf("\n");
1554                         if (ret != 0)
1555                                 return (ret);
1556                 }
1557                 first = 0;
1558                 ret = spa_status(spa);
1559                 if (ret != 0)
1560                         return (ret);
1561         }
1562         return (ret);
1563 }
1564
1565 static uint64_t
1566 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
1567 {
1568         uint64_t label_offset;
1569
1570         if (l < VDEV_LABELS / 2)
1571                 label_offset = 0;
1572         else
1573                 label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
1574
1575         return (offset + l * sizeof (vdev_label_t) + label_offset);
1576 }
1577
1578 static int
1579 vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
1580 {
1581         unsigned int seq1 = 0;
1582         unsigned int seq2 = 0;
1583         int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
1584
1585         if (cmp != 0)
1586                 return (cmp);
1587
1588         cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
1589         if (cmp != 0)
1590                 return (cmp);
1591
1592         if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
1593                 seq1 = MMP_SEQ(ub1);
1594
1595         if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
1596                 seq2 = MMP_SEQ(ub2);
1597
1598         return (AVL_CMP(seq1, seq2));
1599 }
1600
1601 static int
1602 uberblock_verify(uberblock_t *ub)
1603 {
1604         if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
1605                 byteswap_uint64_array(ub, sizeof (uberblock_t));
1606         }
1607
1608         if (ub->ub_magic != UBERBLOCK_MAGIC ||
1609             !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
1610                 return (EINVAL);
1611
1612         return (0);
1613 }
1614
1615 static int
1616 vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
1617     size_t size)
1618 {
1619         blkptr_t bp;
1620         off_t off;
1621
1622         off = vdev_label_offset(vd->v_psize, l, offset);
1623
1624         BP_ZERO(&bp);
1625         BP_SET_LSIZE(&bp, size);
1626         BP_SET_PSIZE(&bp, size);
1627         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1628         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1629         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
1630         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1631
1632         return (vdev_read_phys(vd, &bp, buf, off, size));
1633 }
1634
1635 /*
1636  * We do need to be sure we write to correct location.
1637  * Our vdev label does consist of 4 fields:
1638  * pad1 (8k), reserved.
1639  * bootenv (8k), checksummed, previously reserved, may contian garbage.
1640  * vdev_phys (112k), checksummed
1641  * uberblock ring (128k), checksummed.
1642  *
1643  * Since bootenv area may contain garbage, we can not reliably read it, as
1644  * we can get checksum errors.
1645  * Next best thing is vdev_phys - it is just after bootenv. It still may
1646  * be corrupted, but in such case we will miss this one write.
1647  */
1648 static int
1649 vdev_label_write_validate(vdev_t *vd, int l, uint64_t offset)
1650 {
1651         uint64_t off, o_phys;
1652         void *buf;
1653         size_t size = VDEV_PHYS_SIZE;
1654         int rc;
1655
1656         o_phys = offsetof(vdev_label_t, vl_vdev_phys);
1657         off = vdev_label_offset(vd->v_psize, l, o_phys);
1658
1659         /* off should be 8K from bootenv */
1660         if (vdev_label_offset(vd->v_psize, l, offset) + VDEV_PAD_SIZE != off)
1661                 return (EINVAL);
1662
1663         buf = malloc(size);
1664         if (buf == NULL)
1665                 return (ENOMEM);
1666
1667         /* Read vdev_phys */
1668         rc = vdev_label_read(vd, l, buf, o_phys, size);
1669         free(buf);
1670         return (rc);
1671 }
1672
1673 static int
1674 vdev_label_write(vdev_t *vd, int l, vdev_boot_envblock_t *be, uint64_t offset)
1675 {
1676         zio_checksum_info_t *ci;
1677         zio_cksum_t cksum;
1678         off_t off;
1679         size_t size = VDEV_PAD_SIZE;
1680         int rc;
1681
1682         if (vd->v_phys_write == NULL)
1683                 return (ENOTSUP);
1684
1685         off = vdev_label_offset(vd->v_psize, l, offset);
1686
1687         rc = vdev_label_write_validate(vd, l, offset);
1688         if (rc != 0) {
1689                 return (rc);
1690         }
1691
1692         ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
1693         be->vbe_zbt.zec_magic = ZEC_MAGIC;
1694         zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
1695         ci->ci_func[0](be, size, NULL, &cksum);
1696         be->vbe_zbt.zec_cksum = cksum;
1697
1698         return (vdev_write_phys(vd, be, off, size));
1699 }
1700
1701 static int
1702 vdev_write_bootenv_impl(vdev_t *vdev, vdev_boot_envblock_t *be)
1703 {
1704         vdev_t *kid;
1705         int rv = 0, rc;
1706
1707         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1708                 if (kid->v_state != VDEV_STATE_HEALTHY)
1709                         continue;
1710                 rc = vdev_write_bootenv_impl(kid, be);
1711                 if (rv == 0)
1712                         rv = rc;
1713         }
1714
1715         /*
1716          * Non-leaf vdevs do not have v_phys_write.
1717          */
1718         if (vdev->v_phys_write == NULL)
1719                 return (rv);
1720
1721         for (int l = 0; l < VDEV_LABELS; l++) {
1722                 rc = vdev_label_write(vdev, l, be,
1723                     offsetof(vdev_label_t, vl_be));
1724                 if (rc != 0) {
1725                         printf("failed to write bootenv to %s label %d: %d\n",
1726                             vdev->v_name ? vdev->v_name : "unknown", l, rc);
1727                         rv = rc;
1728                 }
1729         }
1730         return (rv);
1731 }
1732
1733 int
1734 vdev_write_bootenv(vdev_t *vdev, nvlist_t *nvl)
1735 {
1736         vdev_boot_envblock_t *be;
1737         nvlist_t nv, *nvp;
1738         uint64_t version;
1739         int rv;
1740
1741         if (nvl->nv_size > sizeof(be->vbe_bootenv))
1742                 return (E2BIG);
1743
1744         version = VB_RAW;
1745         nvp = vdev_read_bootenv(vdev);
1746         if (nvp != NULL) {
1747                 nvlist_find(nvp, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL,
1748                     &version, NULL);
1749                 nvlist_destroy(nvp);
1750         }
1751
1752         be = calloc(1, sizeof(*be));
1753         if (be == NULL)
1754                 return (ENOMEM);
1755
1756         be->vbe_version = version;
1757         switch (version) {
1758         case VB_RAW:
1759                 /*
1760                  * If there is no envmap, we will just wipe bootenv.
1761                  */
1762                 nvlist_find(nvl, GRUB_ENVMAP, DATA_TYPE_STRING, NULL,
1763                     be->vbe_bootenv, NULL);
1764                 rv = 0;
1765                 break;
1766
1767         case VB_NVLIST:
1768                 nv.nv_header = nvl->nv_header;
1769                 nv.nv_asize = nvl->nv_asize;
1770                 nv.nv_size = nvl->nv_size;
1771
1772                 bcopy(&nv.nv_header, be->vbe_bootenv, sizeof(nv.nv_header));
1773                 nv.nv_data = be->vbe_bootenv + sizeof(nvs_header_t);
1774                 bcopy(nvl->nv_data, nv.nv_data, nv.nv_size);
1775                 rv = nvlist_export(&nv);
1776                 break;
1777
1778         default:
1779                 rv = EINVAL;
1780                 break;
1781         }
1782
1783         if (rv == 0) {
1784                 be->vbe_version = htobe64(be->vbe_version);
1785                 rv = vdev_write_bootenv_impl(vdev, be);
1786         }
1787         free(be);
1788         return (rv);
1789 }
1790
1791 /*
1792  * Read the bootenv area from pool label, return the nvlist from it.
1793  * We return from first successful read.
1794  */
1795 nvlist_t *
1796 vdev_read_bootenv(vdev_t *vdev)
1797 {
1798         vdev_t *kid;
1799         nvlist_t *benv;
1800         vdev_boot_envblock_t *be;
1801         char *command;
1802         bool ok;
1803         int rv;
1804
1805         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1806                 if (kid->v_state != VDEV_STATE_HEALTHY)
1807                         continue;
1808
1809                 benv = vdev_read_bootenv(kid);
1810                 if (benv != NULL)
1811                         return (benv);
1812         }
1813
1814         be = malloc(sizeof (*be));
1815         if (be == NULL)
1816                 return (NULL);
1817
1818         rv = 0;
1819         for (int l = 0; l < VDEV_LABELS; l++) {
1820                 rv = vdev_label_read(vdev, l, be,
1821                     offsetof(vdev_label_t, vl_be),
1822                     sizeof (*be));
1823                 if (rv == 0)
1824                         break;
1825         }
1826         if (rv != 0) {
1827                 free(be);
1828                 return (NULL);
1829         }
1830
1831         be->vbe_version = be64toh(be->vbe_version);
1832         switch (be->vbe_version) {
1833         case VB_RAW:
1834                 /*
1835                  * we have textual data in vbe_bootenv, create nvlist
1836                  * with key "envmap".
1837                  */
1838                 benv = nvlist_create(NV_UNIQUE_NAME);
1839                 if (benv != NULL) {
1840                         if (*be->vbe_bootenv == '\0') {
1841                                 nvlist_add_uint64(benv, BOOTENV_VERSION,
1842                                     VB_NVLIST);
1843                                 break;
1844                         }
1845                         nvlist_add_uint64(benv, BOOTENV_VERSION, VB_RAW);
1846                         be->vbe_bootenv[sizeof (be->vbe_bootenv) - 1] = '\0';
1847                         nvlist_add_string(benv, GRUB_ENVMAP, be->vbe_bootenv);
1848                 }
1849                 break;
1850
1851         case VB_NVLIST:
1852                 benv = nvlist_import(be->vbe_bootenv, sizeof(be->vbe_bootenv));
1853                 break;
1854
1855         default:
1856                 command = (char *)be;
1857                 ok = false;
1858
1859                 /* Check for legacy zfsbootcfg command string */
1860                 for (int i = 0; command[i] != '\0'; i++) {
1861                         if (iscntrl(command[i])) {
1862                                 ok = false;
1863                                 break;
1864                         } else {
1865                                 ok = true;
1866                         }
1867                 }
1868                 benv = nvlist_create(NV_UNIQUE_NAME);
1869                 if (benv != NULL) {
1870                         if (ok)
1871                                 nvlist_add_string(benv, FREEBSD_BOOTONCE,
1872                                     command);
1873                         else
1874                                 nvlist_add_uint64(benv, BOOTENV_VERSION,
1875                                     VB_NVLIST);
1876                 }
1877                 break;
1878         }
1879         free(be);
1880         return (benv);
1881 }
1882
1883 static uint64_t
1884 vdev_get_label_asize(nvlist_t *nvl)
1885 {
1886         nvlist_t *vdevs;
1887         uint64_t asize;
1888         const char *type;
1889         int len;
1890
1891         asize = 0;
1892         /* Get vdev tree */
1893         if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1894             NULL, &vdevs, NULL) != 0)
1895                 return (asize);
1896
1897         /*
1898          * Get vdev type. We will calculate asize for raidz, mirror and disk.
1899          * For raidz, the asize is raw size of all children.
1900          */
1901         if (nvlist_find(vdevs, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
1902             NULL, &type, &len) != 0)
1903                 goto done;
1904
1905         if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
1906             memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
1907             memcmp(type, VDEV_TYPE_RAIDZ, len) != 0)
1908                 goto done;
1909
1910         if (nvlist_find(vdevs, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64,
1911             NULL, &asize, NULL) != 0)
1912                 goto done;
1913
1914         if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
1915                 nvlist_t **kids;
1916                 int nkids;
1917
1918                 if (nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN,
1919                     DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL) != 0) {
1920                         asize = 0;
1921                         goto done;
1922                 }
1923
1924                 asize /= nkids;
1925                 for (int i = 0; i < nkids; i++)
1926                         nvlist_destroy(kids[i]);
1927                 free(kids);
1928         }
1929
1930         asize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
1931 done:
1932         nvlist_destroy(vdevs);
1933         return (asize);
1934 }
1935
1936 static nvlist_t *
1937 vdev_label_read_config(vdev_t *vd, uint64_t txg)
1938 {
1939         vdev_phys_t *label;
1940         uint64_t best_txg = 0;
1941         uint64_t label_txg = 0;
1942         uint64_t asize;
1943         nvlist_t *nvl = NULL, *tmp;
1944         int error;
1945
1946         label = malloc(sizeof (vdev_phys_t));
1947         if (label == NULL)
1948                 return (NULL);
1949
1950         for (int l = 0; l < VDEV_LABELS; l++) {
1951                 if (vdev_label_read(vd, l, label,
1952                     offsetof(vdev_label_t, vl_vdev_phys),
1953                     sizeof (vdev_phys_t)))
1954                         continue;
1955
1956                 tmp = nvlist_import(label->vp_nvlist,
1957                     sizeof(label->vp_nvlist));
1958                 if (tmp == NULL)
1959                         continue;
1960
1961                 error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG,
1962                     DATA_TYPE_UINT64, NULL, &label_txg, NULL);
1963                 if (error != 0 || label_txg == 0) {
1964                         nvlist_destroy(nvl);
1965                         nvl = tmp;
1966                         goto done;
1967                 }
1968
1969                 if (label_txg <= txg && label_txg > best_txg) {
1970                         best_txg = label_txg;
1971                         nvlist_destroy(nvl);
1972                         nvl = tmp;
1973                         tmp = NULL;
1974
1975                         /*
1976                          * Use asize from pool config. We need this
1977                          * because we can get bad value from BIOS.
1978                          */
1979                         asize = vdev_get_label_asize(nvl);
1980                         if (asize != 0) {
1981                                 vd->v_psize = asize;
1982                         }
1983                 }
1984                 nvlist_destroy(tmp);
1985         }
1986
1987         if (best_txg == 0) {
1988                 nvlist_destroy(nvl);
1989                 nvl = NULL;
1990         }
1991 done:
1992         free(label);
1993         return (nvl);
1994 }
1995
1996 static void
1997 vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
1998 {
1999         uberblock_t *buf;
2000
2001         buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
2002         if (buf == NULL)
2003                 return;
2004
2005         for (int l = 0; l < VDEV_LABELS; l++) {
2006                 for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
2007                         if (vdev_label_read(vd, l, buf,
2008                             VDEV_UBERBLOCK_OFFSET(vd, n),
2009                             VDEV_UBERBLOCK_SIZE(vd)))
2010                                 continue;
2011                         if (uberblock_verify(buf) != 0)
2012                                 continue;
2013
2014                         if (vdev_uberblock_compare(buf, ub) > 0)
2015                                 *ub = *buf;
2016                 }
2017         }
2018         free(buf);
2019 }
2020
2021 static int
2022 vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
2023     spa_t **spap)
2024 {
2025         vdev_t vtmp;
2026         spa_t *spa;
2027         vdev_t *vdev;
2028         nvlist_t *nvl;
2029         uint64_t val;
2030         uint64_t guid, vdev_children;
2031         uint64_t pool_txg, pool_guid;
2032         const char *pool_name;
2033         int rc, namelen;
2034
2035         /*
2036          * Load the vdev label and figure out which
2037          * uberblock is most current.
2038          */
2039         memset(&vtmp, 0, sizeof(vtmp));
2040         vtmp.v_phys_read = _read;
2041         vtmp.v_phys_write = _write;
2042         vtmp.v_priv = priv;
2043         vtmp.v_psize = P2ALIGN(ldi_get_size(priv),
2044             (uint64_t)sizeof (vdev_label_t));
2045
2046         /* Test for minimum device size. */
2047         if (vtmp.v_psize < SPA_MINDEVSIZE)
2048                 return (EIO);
2049
2050         nvl = vdev_label_read_config(&vtmp, UINT64_MAX);
2051         if (nvl == NULL)
2052                 return (EIO);
2053
2054         if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
2055             NULL, &val, NULL) != 0) {
2056                 nvlist_destroy(nvl);
2057                 return (EIO);
2058         }
2059
2060         if (!SPA_VERSION_IS_SUPPORTED(val)) {
2061                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
2062                     (unsigned)val, (unsigned)SPA_VERSION);
2063                 nvlist_destroy(nvl);
2064                 return (EIO);
2065         }
2066
2067         /* Check ZFS features for read */
2068         rc = nvlist_check_features_for_read(nvl);
2069         if (rc != 0) {
2070                 nvlist_destroy(nvl);
2071                 return (EIO);
2072         }
2073
2074         if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
2075             NULL, &val, NULL) != 0) {
2076                 nvlist_destroy(nvl);
2077                 return (EIO);
2078         }
2079
2080         if (val == POOL_STATE_DESTROYED) {
2081                 /* We don't boot only from destroyed pools. */
2082                 nvlist_destroy(nvl);
2083                 return (EIO);
2084         }
2085
2086         if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
2087             NULL, &pool_txg, NULL) != 0 ||
2088             nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
2089             NULL, &pool_guid, NULL) != 0 ||
2090             nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
2091             NULL, &pool_name, &namelen) != 0) {
2092                 /*
2093                  * Cache and spare devices end up here - just ignore
2094                  * them.
2095                  */
2096                 nvlist_destroy(nvl);
2097                 return (EIO);
2098         }
2099
2100         /*
2101          * Create the pool if this is the first time we've seen it.
2102          */
2103         spa = spa_find_by_guid(pool_guid);
2104         if (spa == NULL) {
2105                 char *name;
2106
2107                 nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN,
2108                     DATA_TYPE_UINT64, NULL, &vdev_children, NULL);
2109                 name = malloc(namelen + 1);
2110                 if (name == NULL) {
2111                         nvlist_destroy(nvl);
2112                         return (ENOMEM);
2113                 }
2114                 bcopy(pool_name, name, namelen);
2115                 name[namelen] = '\0';
2116                 spa = spa_create(pool_guid, name);
2117                 free(name);
2118                 if (spa == NULL) {
2119                         nvlist_destroy(nvl);
2120                         return (ENOMEM);
2121                 }
2122                 spa->spa_root_vdev->v_nchildren = vdev_children;
2123         }
2124         if (pool_txg > spa->spa_txg)
2125                 spa->spa_txg = pool_txg;
2126
2127         /*
2128          * Get the vdev tree and create our in-core copy of it.
2129          * If we already have a vdev with this guid, this must
2130          * be some kind of alias (overlapping slices, dangerously dedicated
2131          * disks etc).
2132          */
2133         if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
2134             NULL, &guid, NULL) != 0) {
2135                 nvlist_destroy(nvl);
2136                 return (EIO);
2137         }
2138         vdev = vdev_find(guid);
2139         /* Has this vdev already been inited? */
2140         if (vdev && vdev->v_phys_read) {
2141                 nvlist_destroy(nvl);
2142                 return (EIO);
2143         }
2144
2145         rc = vdev_init_from_label(spa, nvl);
2146         nvlist_destroy(nvl);
2147         if (rc != 0)
2148                 return (rc);
2149
2150         /*
2151          * We should already have created an incomplete vdev for this
2152          * vdev. Find it and initialise it with our read proc.
2153          */
2154         vdev = vdev_find(guid);
2155         if (vdev != NULL) {
2156                 vdev->v_phys_read = _read;
2157                 vdev->v_phys_write = _write;
2158                 vdev->v_priv = priv;
2159                 vdev->v_psize = vtmp.v_psize;
2160                 /*
2161                  * If no other state is set, mark vdev healthy.
2162                  */
2163                 if (vdev->v_state == VDEV_STATE_UNKNOWN)
2164                         vdev->v_state = VDEV_STATE_HEALTHY;
2165         } else {
2166                 printf("ZFS: inconsistent nvlist contents\n");
2167                 return (EIO);
2168         }
2169
2170         if (vdev->v_islog)
2171                 spa->spa_with_log = vdev->v_islog;
2172
2173         /*
2174          * Re-evaluate top-level vdev state.
2175          */
2176         vdev_set_state(vdev->v_top);
2177
2178         /*
2179          * Ok, we are happy with the pool so far. Lets find
2180          * the best uberblock and then we can actually access
2181          * the contents of the pool.
2182          */
2183         vdev_uberblock_load(vdev, spa->spa_uberblock);
2184
2185         if (spap != NULL)
2186                 *spap = spa;
2187         return (0);
2188 }
2189
/*
 * Return the base-2 logarithm of n when n has exactly one of its low 32
 * bits set, and -1 otherwise (including n == 0).
 *
 * The probe bit is built with unsigned arithmetic: shifting a signed 1
 * into bit 31 is undefined behaviour.
 */
static int
ilog2(int n)
{
	int v;

	for (v = 0; v < 32; v++)
		if ((unsigned int)n == (1U << v))
			return (v);
	return (-1);
}
2200
2201 static int
2202 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
2203 {
2204         blkptr_t gbh_bp;
2205         zio_gbh_phys_t zio_gb;
2206         char *pbuf;
2207         int i;
2208
2209         /* Artificial BP for gang block header. */
2210         gbh_bp = *bp;
2211         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
2212         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
2213         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
2214         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
2215         for (i = 0; i < SPA_DVAS_PER_BP; i++)
2216                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
2217
2218         /* Read gang header block using the artificial BP. */
2219         if (zio_read(spa, &gbh_bp, &zio_gb))
2220                 return (EIO);
2221
2222         pbuf = buf;
2223         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
2224                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
2225
2226                 if (BP_IS_HOLE(gbp))
2227                         continue;
2228                 if (zio_read(spa, gbp, pbuf))
2229                         return (EIO);
2230                 pbuf += BP_GET_PSIZE(gbp);
2231         }
2232
2233         if (zio_checksum_verify(spa, bp, buf))
2234                 return (EIO);
2235         return (0);
2236 }
2237
2238 static int
2239 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
2240 {
2241         int cpfunc = BP_GET_COMPRESS(bp);
2242         uint64_t align, size;
2243         void *pbuf;
2244         int i, error;
2245
2246         /*
2247          * Process data embedded in block pointer
2248          */
2249         if (BP_IS_EMBEDDED(bp)) {
2250                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
2251
2252                 size = BPE_GET_PSIZE(bp);
2253                 ASSERT(size <= BPE_PAYLOAD_SIZE);
2254
2255                 if (cpfunc != ZIO_COMPRESS_OFF)
2256                         pbuf = malloc(size);
2257                 else
2258                         pbuf = buf;
2259
2260                 if (pbuf == NULL)
2261                         return (ENOMEM);
2262
2263                 decode_embedded_bp_compressed(bp, pbuf);
2264                 error = 0;
2265
2266                 if (cpfunc != ZIO_COMPRESS_OFF) {
2267                         error = zio_decompress_data(cpfunc, pbuf,
2268                             size, buf, BP_GET_LSIZE(bp));
2269                         free(pbuf);
2270                 }
2271                 if (error != 0)
2272                         printf("ZFS: i/o error - unable to decompress "
2273                             "block pointer data, error %d\n", error);
2274                 return (error);
2275         }
2276
2277         error = EIO;
2278
2279         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
2280                 const dva_t *dva = &bp->blk_dva[i];
2281                 vdev_t *vdev;
2282                 vdev_list_t *vlist;
2283                 uint64_t vdevid;
2284                 off_t offset;
2285
2286                 if (!dva->dva_word[0] && !dva->dva_word[1])
2287                         continue;
2288
2289                 vdevid = DVA_GET_VDEV(dva);
2290                 offset = DVA_GET_OFFSET(dva);
2291                 vlist = &spa->spa_root_vdev->v_children;
2292                 STAILQ_FOREACH(vdev, vlist, v_childlink) {
2293                         if (vdev->v_id == vdevid)
2294                                 break;
2295                 }
2296                 if (!vdev || !vdev->v_read)
2297                         continue;
2298
2299                 size = BP_GET_PSIZE(bp);
2300                 if (vdev->v_read == vdev_raidz_read) {
2301                         align = 1ULL << vdev->v_ashift;
2302                         if (P2PHASE(size, align) != 0)
2303                                 size = P2ROUNDUP(size, align);
2304                 }
2305                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
2306                         pbuf = malloc(size);
2307                 else
2308                         pbuf = buf;
2309
2310                 if (pbuf == NULL) {
2311                         error = ENOMEM;
2312                         break;
2313                 }
2314
2315                 if (DVA_GET_GANG(dva))
2316                         error = zio_read_gang(spa, bp, pbuf);
2317                 else
2318                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
2319                 if (error == 0) {
2320                         if (cpfunc != ZIO_COMPRESS_OFF)
2321                                 error = zio_decompress_data(cpfunc, pbuf,
2322                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
2323                         else if (size != BP_GET_PSIZE(bp))
2324                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
2325                 } else {
2326                         printf("zio_read error: %d\n", error);
2327                 }
2328                 if (buf != pbuf)
2329                         free(pbuf);
2330                 if (error == 0)
2331                         break;
2332         }
2333         if (error != 0)
2334                 printf("ZFS: i/o error - all block copies unavailable\n");
2335
2336         return (error);
2337 }
2338
2339 static int
2340 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset,
2341     void *buf, size_t buflen)
2342 {
2343         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
2344         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2345         int nlevels = dnode->dn_nlevels;
2346         int i, rc;
2347
2348         if (bsize > SPA_MAXBLOCKSIZE) {
2349                 printf("ZFS: I/O error - blocks larger than %llu are not "
2350                     "supported\n", SPA_MAXBLOCKSIZE);
2351                 return (EIO);
2352         }
2353
2354         /*
2355          * Handle odd block sizes, mirrors dmu_read_impl().  Data can't exist
2356          * past the first block, so we'll clip the read to the portion of the
2357          * buffer within bsize and zero out the remainder.
2358          */
2359         if (dnode->dn_maxblkid == 0) {
2360                 size_t newbuflen;
2361
2362                 newbuflen = offset > bsize ? 0 : MIN(buflen, bsize - offset);
2363                 bzero((char *)buf + newbuflen, buflen - newbuflen);
2364                 buflen = newbuflen;
2365         }
2366
2367         /*
2368          * Note: bsize may not be a power of two here so we need to do an
2369          * actual divide rather than a bitshift.
2370          */
2371         while (buflen > 0) {
2372                 uint64_t bn = offset / bsize;
2373                 int boff = offset % bsize;
2374                 int ibn;
2375                 const blkptr_t *indbp;
2376                 blkptr_t bp;
2377
2378                 if (bn > dnode->dn_maxblkid)
2379                         return (EIO);
2380
2381                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
2382                         goto cached;
2383
2384                 indbp = dnode->dn_blkptr;
2385                 for (i = 0; i < nlevels; i++) {
2386                         /*
2387                          * Copy the bp from the indirect array so that
2388                          * we can re-use the scratch buffer for multi-level
2389                          * objects.
2390                          */
2391                         ibn = bn >> ((nlevels - i - 1) * ibshift);
2392                         ibn &= ((1 << ibshift) - 1);
2393                         bp = indbp[ibn];
2394                         if (BP_IS_HOLE(&bp)) {
2395                                 memset(dnode_cache_buf, 0, bsize);
2396                                 break;
2397                         }
2398                         rc = zio_read(spa, &bp, dnode_cache_buf);
2399                         if (rc)
2400                                 return (rc);
2401                         indbp = (const blkptr_t *) dnode_cache_buf;
2402                 }
2403                 dnode_cache_obj = dnode;
2404                 dnode_cache_bn = bn;
2405         cached:
2406
2407                 /*
2408                  * The buffer contains our data block. Copy what we
2409                  * need from it and loop.
2410                  */
2411                 i = bsize - boff;
2412                 if (i > buflen) i = buflen;
2413                 memcpy(buf, &dnode_cache_buf[boff], i);
2414                 buf = ((char *)buf) + i;
2415                 offset += i;
2416                 buflen -= i;
2417         }
2418
2419         return (0);
2420 }
2421
2422 /*
2423  * Lookup a value in a microzap directory.
2424  */
2425 static int
2426 mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name,
2427     uint64_t *value)
2428 {
2429         const mzap_ent_phys_t *mze;
2430         int chunks, i;
2431
2432         /*
2433          * Microzap objects use exactly one block. Read the whole
2434          * thing.
2435          */
2436         chunks = size / MZAP_ENT_LEN - 1;
2437         for (i = 0; i < chunks; i++) {
2438                 mze = &mz->mz_chunk[i];
2439                 if (strcmp(mze->mze_name, name) == 0) {
2440                         *value = mze->mze_value;
2441                         return (0);
2442                 }
2443         }
2444
2445         return (ENOENT);
2446 }
2447
2448 /*
2449  * Compare a name with a zap leaf entry. Return non-zero if the name
2450  * matches.
2451  */
2452 static int
2453 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2454     const char *name)
2455 {
2456         size_t namelen;
2457         const zap_leaf_chunk_t *nc;
2458         const char *p;
2459
2460         namelen = zc->l_entry.le_name_numints;
2461
2462         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2463         p = name;
2464         while (namelen > 0) {
2465                 size_t len;
2466
2467                 len = namelen;
2468                 if (len > ZAP_LEAF_ARRAY_BYTES)
2469                         len = ZAP_LEAF_ARRAY_BYTES;
2470                 if (memcmp(p, nc->l_array.la_array, len))
2471                         return (0);
2472                 p += len;
2473                 namelen -= len;
2474                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2475         }
2476
2477         return (1);
2478 }
2479
2480 /*
2481  * Extract a uint64_t value from a zap leaf entry.
2482  */
2483 static uint64_t
2484 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
2485 {
2486         const zap_leaf_chunk_t *vc;
2487         int i;
2488         uint64_t value;
2489         const uint8_t *p;
2490
2491         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
2492         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
2493                 value = (value << 8) | p[i];
2494         }
2495
2496         return (value);
2497 }
2498
/*
 * Store `value` at addr as an unsigned integer of len bytes (1, 2, 4 or
 * 8), truncating it to that width.  Any other len stores nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
	if (len == 1)
		*(uint8_t *)addr = (uint8_t)value;
	else if (len == 2)
		*(uint16_t *)addr = (uint16_t)value;
	else if (len == 4)
		*(uint32_t *)addr = (uint32_t)value;
	else if (len == 8)
		*(uint64_t *)addr = value;
}
2517
2518 /*
2519  * Extract a array from a zap leaf entry.
2520  */
2521 static void
2522 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2523     uint64_t integer_size, uint64_t num_integers, void *buf)
2524 {
2525         uint64_t array_int_len = zc->l_entry.le_value_intlen;
2526         uint64_t value = 0;
2527         uint64_t *u64 = buf;
2528         char *p = buf;
2529         int len = MIN(zc->l_entry.le_value_numints, num_integers);
2530         int chunk = zc->l_entry.le_value_chunk;
2531         int byten = 0;
2532
2533         if (integer_size == 8 && len == 1) {
2534                 *u64 = fzap_leaf_value(zl, zc);
2535                 return;
2536         }
2537
2538         while (len > 0) {
2539                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
2540                 int i;
2541
2542                 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
2543                 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
2544                         value = (value << 8) | la->la_array[i];
2545                         byten++;
2546                         if (byten == array_int_len) {
2547                                 stv(integer_size, p, value);
2548                                 byten = 0;
2549                                 len--;
2550                                 if (len == 0)
2551                                         return;
2552                                 p += integer_size;
2553                         }
2554                 }
2555                 chunk = la->la_next;
2556         }
2557 }
2558
2559 static int
2560 fzap_check_size(uint64_t integer_size, uint64_t num_integers)
2561 {
2562
2563         switch (integer_size) {
2564         case 1:
2565         case 2:
2566         case 4:
2567         case 8:
2568                 break;
2569         default:
2570                 return (EINVAL);
2571         }
2572
2573         if (integer_size * num_integers > ZAP_MAXVALUELEN)
2574                 return (E2BIG);
2575
2576         return (0);
2577 }
2578
2579 static void
2580 zap_leaf_free(zap_leaf_t *leaf)
2581 {
2582         free(leaf->l_phys);
2583         free(leaf);
2584 }
2585
2586 static int
2587 zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp)
2588 {
2589         int bs = FZAP_BLOCK_SHIFT(zap);
2590         int err;
2591
2592         *lp = malloc(sizeof(**lp));
2593         if (*lp == NULL)
2594                 return (ENOMEM);
2595
2596         (*lp)->l_bs = bs;
2597         (*lp)->l_phys = malloc(1 << bs);
2598
2599         if ((*lp)->l_phys == NULL) {
2600                 free(*lp);
2601                 return (ENOMEM);
2602         }
2603         err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys,
2604             1 << bs);
2605         if (err != 0) {
2606                 zap_leaf_free(*lp);
2607         }
2608         return (err);
2609 }
2610
2611 static int
2612 zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx,
2613     uint64_t *valp)
2614 {
2615         int bs = FZAP_BLOCK_SHIFT(zap);
2616         uint64_t blk = idx >> (bs - 3);
2617         uint64_t off = idx & ((1 << (bs - 3)) - 1);
2618         uint64_t *buf;
2619         int rc;
2620
2621         buf = malloc(1 << zap->zap_block_shift);
2622         if (buf == NULL)
2623                 return (ENOMEM);
2624         rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs,
2625             buf, 1 << zap->zap_block_shift);
2626         if (rc == 0)
2627                 *valp = buf[off];
2628         free(buf);
2629         return (rc);
2630 }
2631
2632 static int
2633 zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp)
2634 {
2635         if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) {
2636                 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
2637                 return (0);
2638         } else {
2639                 return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl,
2640                     idx, valp));
2641         }
2642 }
2643
2644 #define ZAP_HASH_IDX(hash, n)   (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
2645 static int
2646 zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp)
2647 {
2648         uint64_t idx, blk;
2649         int err;
2650
2651         idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift);
2652         err = zap_idx_to_blk(zap, idx, &blk);
2653         if (err != 0)
2654                 return (err);
2655         return (zap_get_leaf_byblk(zap, blk, lp));
2656 }
2657
2658 #define CHAIN_END       0xffff  /* end of the chunk chain */
2659 #define LEAF_HASH(l, h) \
2660         ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
2661         ((h) >> \
2662         (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
2663 #define LEAF_HASH_ENTPTR(l, h)  (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
2664
2665 static int
2666 zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name,
2667     uint64_t integer_size, uint64_t num_integers, void *value)
2668 {
2669         int rc;
2670         uint16_t *chunkp;
2671         struct zap_leaf_entry *le;
2672
2673         /*
2674          * Make sure this chunk matches our hash.
2675          */
2676         if (zl->l_phys->l_hdr.lh_prefix_len > 0 &&
2677             zl->l_phys->l_hdr.lh_prefix !=
2678             hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len))
2679                 return (EIO);
2680
2681         rc = ENOENT;
2682         for (chunkp = LEAF_HASH_ENTPTR(zl, hash);
2683             *chunkp != CHAIN_END; chunkp = &le->le_next) {
2684                 zap_leaf_chunk_t *zc;
2685                 uint16_t chunk = *chunkp;
2686
2687                 le = ZAP_LEAF_ENTRY(zl, chunk);
2688                 if (le->le_hash != hash)
2689                         continue;
2690                 zc = &ZAP_LEAF_CHUNK(zl, chunk);
2691                 if (fzap_name_equal(zl, zc, name)) {
2692                         if (zc->l_entry.le_value_intlen > integer_size) {
2693                                 rc = EINVAL;
2694                         } else {
2695                                 fzap_leaf_array(zl, zc, integer_size,
2696                                     num_integers, value);
2697                                 rc = 0;
2698                         }
2699                         break;
2700                 }
2701         }
2702         return (rc);
2703 }
2704
2705 /*
2706  * Lookup a value in a fatzap directory.
2707  */
2708 static int
2709 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2710     const char *name, uint64_t integer_size, uint64_t num_integers,
2711     void *value)
2712 {
2713         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2714         fat_zap_t z;
2715         zap_leaf_t *zl;
2716         uint64_t hash;
2717         int rc;
2718
2719         if (zh->zap_magic != ZAP_MAGIC)
2720                 return (EIO);
2721
2722         if ((rc = fzap_check_size(integer_size, num_integers)) != 0) {
2723                 return (rc);
2724         }
2725
2726         z.zap_block_shift = ilog2(bsize);
2727         z.zap_phys = zh;
2728         z.zap_spa = spa;
2729         z.zap_dnode = dnode;
2730
2731         hash = zap_hash(zh->zap_salt, name);
2732         rc = zap_deref_leaf(&z, hash, &zl);
2733         if (rc != 0)
2734                 return (rc);
2735
2736         rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value);
2737
2738         zap_leaf_free(zl);
2739         return (rc);
2740 }
2741
2742 /*
2743  * Lookup a name in a zap object and return its value as a uint64_t.
2744  */
2745 static int
2746 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
2747     uint64_t integer_size, uint64_t num_integers, void *value)
2748 {
2749         int rc;
2750         zap_phys_t *zap;
2751         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2752
2753         zap = malloc(size);
2754         if (zap == NULL)
2755                 return (ENOMEM);
2756
2757         rc = dnode_read(spa, dnode, 0, zap, size);
2758         if (rc)
2759                 goto done;
2760
2761         switch (zap->zap_block_type) {
2762         case ZBT_MICRO:
2763                 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value);
2764                 break;
2765         case ZBT_HEADER:
2766                 rc = fzap_lookup(spa, dnode, zap, name, integer_size,
2767                     num_integers, value);
2768                 break;
2769         default:
2770                 printf("ZFS: invalid zap_type=%" PRIx64 "\n",
2771                     zap->zap_block_type);
2772                 rc = EIO;
2773         }
2774 done:
2775         free(zap);
2776         return (rc);
2777 }
2778
2779 /*
2780  * List a microzap directory.
2781  */
2782 static int
2783 mzap_list(const mzap_phys_t *mz, size_t size,
2784     int (*callback)(const char *, uint64_t))
2785 {
2786         const mzap_ent_phys_t *mze;
2787         int chunks, i, rc;
2788
2789         /*
2790          * Microzap objects use exactly one block. Read the whole
2791          * thing.
2792          */
2793         rc = 0;
2794         chunks = size / MZAP_ENT_LEN - 1;
2795         for (i = 0; i < chunks; i++) {
2796                 mze = &mz->mz_chunk[i];
2797                 if (mze->mze_name[0]) {
2798                         rc = callback(mze->mze_name, mze->mze_value);
2799                         if (rc != 0)
2800                                 break;
2801                 }
2802         }
2803
2804         return (rc);
2805 }
2806
2807 /*
2808  * List a fatzap directory.
2809  */
2810 static int
2811 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2812     int (*callback)(const char *, uint64_t))
2813 {
2814         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2815         fat_zap_t z;
2816         uint64_t i;
2817         int j, rc;
2818
2819         if (zh->zap_magic != ZAP_MAGIC)
2820                 return (EIO);
2821
2822         z.zap_block_shift = ilog2(bsize);
2823         z.zap_phys = zh;
2824
2825         /*
2826          * This assumes that the leaf blocks start at block 1. The
2827          * documentation isn't exactly clear on this.
2828          */
2829         zap_leaf_t zl;
2830         zl.l_bs = z.zap_block_shift;
2831         zl.l_phys = malloc(bsize);
2832         if (zl.l_phys == NULL)
2833                 return (ENOMEM);
2834
2835         for (i = 0; i < zh->zap_num_leafs; i++) {
2836                 off_t off = ((off_t)(i + 1)) << zl.l_bs;
2837                 char name[256], *p;
2838                 uint64_t value;
2839
2840                 if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) {
2841                         free(zl.l_phys);
2842                         return (EIO);
2843                 }
2844
2845                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
2846                         zap_leaf_chunk_t *zc, *nc;
2847                         int namelen;
2848
2849                         zc = &ZAP_LEAF_CHUNK(&zl, j);
2850                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
2851                                 continue;
2852                         namelen = zc->l_entry.le_name_numints;
2853                         if (namelen > sizeof(name))
2854                                 namelen = sizeof(name);
2855
2856                         /*
2857                          * Paste the name back together.
2858                          */
2859                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
2860                         p = name;
2861                         while (namelen > 0) {
2862                                 int len;
2863                                 len = namelen;
2864                                 if (len > ZAP_LEAF_ARRAY_BYTES)
2865                                         len = ZAP_LEAF_ARRAY_BYTES;
2866                                 memcpy(p, nc->l_array.la_array, len);
2867                                 p += len;
2868                                 namelen -= len;
2869                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
2870                         }
2871
2872                         /*
2873                          * Assume the first eight bytes of the value are
2874                          * a uint64_t.
2875                          */
2876                         value = fzap_leaf_value(&zl, zc);
2877
2878                         /* printf("%s 0x%jx\n", name, (uintmax_t)value); */
2879                         rc = callback((const char *)name, value);
2880                         if (rc != 0) {
2881                                 free(zl.l_phys);
2882                                 return (rc);
2883                         }
2884                 }
2885         }
2886
2887         free(zl.l_phys);
2888         return (0);
2889 }
2890
2891 static int zfs_printf(const char *name, uint64_t value __unused)
2892 {
2893
2894         printf("%s\n", name);
2895
2896         return (0);
2897 }
2898
2899 /*
2900  * List a zap directory.
2901  */
2902 static int
2903 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
2904 {
2905         zap_phys_t *zap;
2906         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2907         int rc;
2908
2909         zap = malloc(size);
2910         if (zap == NULL)
2911                 return (ENOMEM);
2912
2913         rc = dnode_read(spa, dnode, 0, zap, size);
2914         if (rc == 0) {
2915                 if (zap->zap_block_type == ZBT_MICRO)
2916                         rc = mzap_list((const mzap_phys_t *)zap, size,
2917                             zfs_printf);
2918                 else
2919                         rc = fzap_list(spa, dnode, zap, zfs_printf);
2920         }
2921         free(zap);
2922         return (rc);
2923 }
2924
2925 static int
2926 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum,
2927     dnode_phys_t *dnode)
2928 {
2929         off_t offset;
2930
2931         offset = objnum * sizeof(dnode_phys_t);
2932         return dnode_read(spa, &os->os_meta_dnode, offset,
2933                 dnode, sizeof(dnode_phys_t));
2934 }
2935
2936 /*
2937  * Lookup a name in a microzap directory.
2938  */
2939 static int
2940 mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value)
2941 {
2942         const mzap_ent_phys_t *mze;
2943         int chunks, i;
2944
2945         /*
2946          * Microzap objects use exactly one block. Read the whole
2947          * thing.
2948          */
2949         chunks = size / MZAP_ENT_LEN - 1;
2950         for (i = 0; i < chunks; i++) {
2951                 mze = &mz->mz_chunk[i];
2952                 if (value == mze->mze_value) {
2953                         strcpy(name, mze->mze_name);
2954                         return (0);
2955                 }
2956         }
2957
2958         return (ENOENT);
2959 }
2960
2961 static void
2962 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
2963 {
2964         size_t namelen;
2965         const zap_leaf_chunk_t *nc;
2966         char *p;
2967
2968         namelen = zc->l_entry.le_name_numints;
2969
2970         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2971         p = name;
2972         while (namelen > 0) {
2973                 size_t len;
2974                 len = namelen;
2975                 if (len > ZAP_LEAF_ARRAY_BYTES)
2976                         len = ZAP_LEAF_ARRAY_BYTES;
2977                 memcpy(p, nc->l_array.la_array, len);
2978                 p += len;
2979                 namelen -= len;
2980                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2981         }
2982
2983         *p = '\0';
2984 }
2985
2986 static int
2987 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2988     char *name, uint64_t value)
2989 {
2990         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2991         fat_zap_t z;
2992         uint64_t i;
2993         int j, rc;
2994
2995         if (zh->zap_magic != ZAP_MAGIC)
2996                 return (EIO);
2997
2998         z.zap_block_shift = ilog2(bsize);
2999         z.zap_phys = zh;
3000
3001         /*
3002          * This assumes that the leaf blocks start at block 1. The
3003          * documentation isn't exactly clear on this.
3004          */
3005         zap_leaf_t zl;
3006         zl.l_bs = z.zap_block_shift;
3007         zl.l_phys = malloc(bsize);
3008         if (zl.l_phys == NULL)
3009                 return (ENOMEM);
3010
3011         for (i = 0; i < zh->zap_num_leafs; i++) {
3012                 off_t off = ((off_t)(i + 1)) << zl.l_bs;
3013
3014                 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize);
3015                 if (rc != 0)
3016                         goto done;
3017
3018                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
3019                         zap_leaf_chunk_t *zc;
3020
3021                         zc = &ZAP_LEAF_CHUNK(&zl, j);
3022                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
3023                                 continue;
3024                         if (zc->l_entry.le_value_intlen != 8 ||
3025                             zc->l_entry.le_value_numints != 1)
3026                                 continue;
3027
3028                         if (fzap_leaf_value(&zl, zc) == value) {
3029                                 fzap_name_copy(&zl, zc, name);
3030                                 goto done;
3031                         }
3032                 }
3033         }
3034
3035         rc = ENOENT;
3036 done:
3037         free(zl.l_phys);
3038         return (rc);
3039 }
3040
3041 static int
3042 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
3043     uint64_t value)
3044 {
3045         zap_phys_t *zap;
3046         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
3047         int rc;
3048
3049         zap = malloc(size);
3050         if (zap == NULL)
3051                 return (ENOMEM);
3052
3053         rc = dnode_read(spa, dnode, 0, zap, size);
3054         if (rc == 0) {
3055                 if (zap->zap_block_type == ZBT_MICRO)
3056                         rc = mzap_rlookup((const mzap_phys_t *)zap, size,
3057                             name, value);
3058                 else
3059                         rc = fzap_rlookup(spa, dnode, zap, name, value);
3060         }
3061         free(zap);
3062         return (rc);
3063 }
3064
3065 static int
3066 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
3067 {
3068         char name[256];
3069         char component[256];
3070         uint64_t dir_obj, parent_obj, child_dir_zapobj;
3071         dnode_phys_t child_dir_zap, snapnames_zap, dataset, dir, parent;
3072         dsl_dir_phys_t *dd;
3073         dsl_dataset_phys_t *ds;
3074         char *p;
3075         int len;
3076         boolean_t issnap = B_FALSE;
3077
3078         p = &name[sizeof(name) - 1];
3079         *p = '\0';
3080
3081         if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
3082                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3083                 return (EIO);
3084         }
3085         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3086         dir_obj = ds->ds_dir_obj;
3087         if (ds->ds_snapnames_zapobj == 0)
3088                 issnap = B_TRUE;
3089
3090         for (;;) {
3091                 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0)
3092                         return (EIO);
3093                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3094
3095                 /* Actual loop condition. */
3096                 parent_obj = dd->dd_parent_obj;
3097                 if (parent_obj == 0)
3098                         break;
3099
3100                 if (objset_get_dnode(spa, spa->spa_mos, parent_obj,
3101                     &parent) != 0)
3102                         return (EIO);
3103                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
3104                 if (issnap == B_TRUE) {
3105                         /*
3106                          * The dataset we are looking up is a snapshot
3107                          * the dir_obj is the parent already, we don't want
3108                          * the grandparent just yet. Reset to the parent.
3109                          */
3110                         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3111                         /* Lookup the dataset to get the snapname ZAP */
3112                         if (objset_get_dnode(spa, spa->spa_mos,
3113                             dd->dd_head_dataset_obj, &dataset))
3114                                 return (EIO);
3115                         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3116                         if (objset_get_dnode(spa, spa->spa_mos,
3117                             ds->ds_snapnames_zapobj, &snapnames_zap) != 0)
3118                                 return (EIO);
3119                         /* Get the name of the snapshot */
3120                         if (zap_rlookup(spa, &snapnames_zap, component,
3121                             objnum) != 0)
3122                                 return (EIO);
3123                         len = strlen(component);
3124                         p -= len;
3125                         memcpy(p, component, len);
3126                         --p;
3127                         *p = '@';
3128                         issnap = B_FALSE;
3129                         continue;
3130                 }
3131
3132                 child_dir_zapobj = dd->dd_child_dir_zapobj;
3133                 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3134                     &child_dir_zap) != 0)
3135                         return (EIO);
3136                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
3137                         return (EIO);
3138
3139                 len = strlen(component);
3140                 p -= len;
3141                 memcpy(p, component, len);
3142                 --p;
3143                 *p = '/';
3144
3145                 /* Actual loop iteration. */
3146                 dir_obj = parent_obj;
3147         }
3148
3149         if (*p != '\0')
3150                 ++p;
3151         strcpy(result, p);
3152
3153         return (0);
3154 }
3155
3156 static int
3157 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
3158 {
3159         char element[256];
3160         uint64_t dir_obj, child_dir_zapobj;
3161         dnode_phys_t child_dir_zap, snapnames_zap, dir, dataset;
3162         dsl_dir_phys_t *dd;
3163         dsl_dataset_phys_t *ds;
3164         const char *p, *q;
3165         boolean_t issnap = B_FALSE;
3166
3167         if (objset_get_dnode(spa, spa->spa_mos,
3168             DMU_POOL_DIRECTORY_OBJECT, &dir))
3169                 return (EIO);
3170         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
3171             1, &dir_obj))
3172                 return (EIO);
3173
3174         p = name;
3175         for (;;) {
3176                 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir))
3177                         return (EIO);
3178                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3179
3180                 while (*p == '/')
3181                         p++;
3182                 /* Actual loop condition #1. */
3183                 if (*p == '\0')
3184                         break;
3185
3186                 q = strchr(p, '/');
3187                 if (q) {
3188                         memcpy(element, p, q - p);
3189                         element[q - p] = '\0';
3190                         p = q + 1;
3191                 } else {
3192                         strcpy(element, p);
3193                         p += strlen(p);
3194                 }
3195
3196                 if (issnap == B_TRUE) {
3197                         if (objset_get_dnode(spa, spa->spa_mos,
3198                             dd->dd_head_dataset_obj, &dataset))
3199                                 return (EIO);
3200                         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3201                         if (objset_get_dnode(spa, spa->spa_mos,
3202                             ds->ds_snapnames_zapobj, &snapnames_zap) != 0)
3203                                 return (EIO);
3204                         /* Actual loop condition #2. */
3205                         if (zap_lookup(spa, &snapnames_zap, element,
3206                             sizeof (dir_obj), 1, &dir_obj) != 0)
3207                                 return (ENOENT);
3208                         *objnum = dir_obj;
3209                         return (0);
3210                 } else if ((q = strchr(element, '@')) != NULL) {
3211                         issnap = B_TRUE;
3212                         element[q - element] = '\0';
3213                         p = q + 1;
3214                 }
3215                 child_dir_zapobj = dd->dd_child_dir_zapobj;
3216                 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3217                     &child_dir_zap) != 0)
3218                         return (EIO);
3219
3220                 /* Actual loop condition #2. */
3221                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
3222                     1, &dir_obj) != 0)
3223                         return (ENOENT);
3224         }
3225
3226         *objnum = dd->dd_head_dataset_obj;
3227         return (0);
3228 }
3229
3230 #ifndef BOOT2
3231 static int
3232 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
3233 {
3234         uint64_t dir_obj, child_dir_zapobj;
3235         dnode_phys_t child_dir_zap, dir, dataset;
3236         dsl_dataset_phys_t *ds;
3237         dsl_dir_phys_t *dd;
3238
3239         if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
3240                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3241                 return (EIO);
3242         }
3243         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3244         dir_obj = ds->ds_dir_obj;
3245
3246         if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) {
3247                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
3248                 return (EIO);
3249         }
3250         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3251
3252         child_dir_zapobj = dd->dd_child_dir_zapobj;
3253         if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3254             &child_dir_zap) != 0) {
3255                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
3256                 return (EIO);
3257         }
3258
3259         return (zap_list(spa, &child_dir_zap) != 0);
3260 }
3261
3262 int
3263 zfs_callback_dataset(const spa_t *spa, uint64_t objnum,
3264     int (*callback)(const char *, uint64_t))
3265 {
3266         uint64_t dir_obj, child_dir_zapobj;
3267         dnode_phys_t child_dir_zap, dir, dataset;
3268         dsl_dataset_phys_t *ds;
3269         dsl_dir_phys_t *dd;
3270         zap_phys_t *zap;
3271         size_t size;
3272         int err;
3273
3274         err = objset_get_dnode(spa, spa->spa_mos, objnum, &dataset);
3275         if (err != 0) {
3276                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3277                 return (err);
3278         }
3279         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3280         dir_obj = ds->ds_dir_obj;
3281
3282         err = objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir);
3283         if (err != 0) {
3284                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
3285                 return (err);
3286         }
3287         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3288
3289         child_dir_zapobj = dd->dd_child_dir_zapobj;
3290         err = objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3291             &child_dir_zap);
3292         if (err != 0) {
3293                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
3294                 return (err);
3295         }
3296
3297         size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3298         zap = malloc(size);
3299         if (zap != NULL) {
3300                 err = dnode_read(spa, &child_dir_zap, 0, zap, size);
3301                 if (err != 0)
3302                         goto done;
3303
3304                 if (zap->zap_block_type == ZBT_MICRO)
3305                         err = mzap_list((const mzap_phys_t *)zap, size,
3306                             callback);
3307                 else
3308                         err = fzap_list(spa, &child_dir_zap, zap, callback);
3309         } else {
3310                 err = ENOMEM;
3311         }
3312 done:
3313         free(zap);
3314         return (err);
3315 }
3316 #endif
3317
3318 /*
3319  * Find the object set given the object number of its dataset object
3320  * and return its details in *objset
3321  */
3322 static int
3323 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
3324 {
3325         dnode_phys_t dataset;
3326         dsl_dataset_phys_t *ds;
3327
3328         if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
3329                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3330                 return (EIO);
3331         }
3332
3333         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3334         if (zio_read(spa, &ds->ds_bp, objset)) {
3335                 printf("ZFS: can't read object set for dataset %ju\n",
3336                     (uintmax_t)objnum);
3337                 return (EIO);
3338         }
3339
3340         return (0);
3341 }
3342
3343 /*
3344  * Find the object set pointed to by the BOOTFS property or the root
3345  * dataset if there is none and return its details in *objset
3346  */
3347 static int
3348 zfs_get_root(const spa_t *spa, uint64_t *objid)
3349 {
3350         dnode_phys_t dir, propdir;
3351         uint64_t props, bootfs, root;
3352
3353         *objid = 0;
3354
3355         /*
3356          * Start with the MOS directory object.
3357          */
3358         if (objset_get_dnode(spa, spa->spa_mos,
3359             DMU_POOL_DIRECTORY_OBJECT, &dir)) {
3360                 printf("ZFS: can't read MOS object directory\n");
3361                 return (EIO);
3362         }
3363
3364         /*
3365          * Lookup the pool_props and see if we can find a bootfs.
3366          */
3367         if (zap_lookup(spa, &dir, DMU_POOL_PROPS,
3368             sizeof(props), 1, &props) == 0 &&
3369             objset_get_dnode(spa, spa->spa_mos, props, &propdir) == 0 &&
3370             zap_lookup(spa, &propdir, "bootfs",
3371             sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) {
3372                 *objid = bootfs;
3373                 return (0);
3374         }
3375         /*
3376          * Lookup the root dataset directory
3377          */
3378         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET,
3379             sizeof(root), 1, &root) ||
3380             objset_get_dnode(spa, spa->spa_mos, root, &dir)) {
3381                 printf("ZFS: can't find root dsl_dir\n");
3382                 return (EIO);
3383         }
3384
3385         /*
3386          * Use the information from the dataset directory's bonus buffer
3387          * to find the dataset object and from that the object set itself.
3388          */
3389         dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3390         *objid = dd->dd_head_dataset_obj;
3391         return (0);
3392 }
3393
3394 static int
3395 zfs_mount_impl(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
3396 {
3397
3398         mount->spa = spa;
3399
3400         /*
3401          * Find the root object set if not explicitly provided
3402          */
3403         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
3404                 printf("ZFS: can't find root filesystem\n");
3405                 return (EIO);
3406         }
3407
3408         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
3409                 printf("ZFS: can't open root filesystem\n");
3410                 return (EIO);
3411         }
3412
3413         mount->rootobj = rootobj;
3414
3415         return (0);
3416 }
3417
3418 /*
3419  * callback function for feature name checks.
3420  */
3421 static int
3422 check_feature(const char *name, uint64_t value)
3423 {
3424         int i;
3425
3426         if (value == 0)
3427                 return (0);
3428         if (name[0] == '\0')
3429                 return (0);
3430
3431         for (i = 0; features_for_read[i] != NULL; i++) {
3432                 if (strcmp(name, features_for_read[i]) == 0)
3433                         return (0);
3434         }
3435         printf("ZFS: unsupported feature: %s\n", name);
3436         return (EIO);
3437 }
3438
3439 /*
3440  * Checks whether the MOS features that are active are supported.
3441  */
3442 static int
3443 check_mos_features(const spa_t *spa)
3444 {
3445         dnode_phys_t dir;
3446         zap_phys_t *zap;
3447         uint64_t objnum;
3448         size_t size;
3449         int rc;
3450
3451         if ((rc = objset_get_dnode(spa, spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
3452             &dir)) != 0)
3453                 return (rc);
3454         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
3455             sizeof (objnum), 1, &objnum)) != 0) {
3456                 /*
3457                  * It is older pool without features. As we have already
3458                  * tested the label, just return without raising the error.
3459                  */
3460                 return (0);
3461         }
3462
3463         if ((rc = objset_get_dnode(spa, spa->spa_mos, objnum, &dir)) != 0)
3464                 return (rc);
3465
3466         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
3467                 return (EIO);
3468
3469         size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3470         zap = malloc(size);
3471         if (zap == NULL)
3472                 return (ENOMEM);
3473
3474         if (dnode_read(spa, &dir, 0, zap, size)) {
3475                 free(zap);
3476                 return (EIO);
3477         }
3478
3479         if (zap->zap_block_type == ZBT_MICRO)
3480                 rc = mzap_list((const mzap_phys_t *)zap, size, check_feature);
3481         else
3482                 rc = fzap_list(spa, &dir, zap, check_feature);
3483
3484         free(zap);
3485         return (rc);
3486 }
3487
3488 static int
3489 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
3490 {
3491         dnode_phys_t dir;
3492         size_t size;
3493         int rc;
3494         char *nv;
3495
3496         *value = NULL;
3497         if ((rc = objset_get_dnode(spa, spa->spa_mos, obj, &dir)) != 0)
3498                 return (rc);
3499         if (dir.dn_type != DMU_OT_PACKED_NVLIST &&
3500             dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
3501                 return (EIO);
3502         }
3503
3504         if (dir.dn_bonuslen != sizeof (uint64_t))
3505                 return (EIO);
3506
3507         size = *(uint64_t *)DN_BONUS(&dir);
3508         nv = malloc(size);
3509         if (nv == NULL)
3510                 return (ENOMEM);
3511
3512         rc = dnode_read(spa, &dir, 0, nv, size);
3513         if (rc != 0) {
3514                 free(nv);
3515                 nv = NULL;
3516                 return (rc);
3517         }
3518         *value = nvlist_import(nv, size);
3519         free(nv);
3520         return (rc);
3521 }
3522
3523 static int
3524 zfs_spa_init(spa_t *spa)
3525 {
3526         struct uberblock checkpoint;
3527         dnode_phys_t dir;
3528         uint64_t config_object;
3529         nvlist_t *nvlist;
3530         int rc;
3531
3532         if (zio_read(spa, &spa->spa_uberblock->ub_rootbp, spa->spa_mos)) {
3533                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
3534                 return (EIO);
3535         }
3536         if (spa->spa_mos->os_type != DMU_OST_META) {
3537                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
3538                 return (EIO);
3539         }
3540
3541         if (objset_get_dnode(spa, &spa->spa_mos_master,
3542             DMU_POOL_DIRECTORY_OBJECT, &dir)) {
3543                 printf("ZFS: failed to read pool %s directory object\n",
3544                     spa->spa_name);
3545                 return (EIO);
3546         }
3547         /* this is allowed to fail, older pools do not have salt */
3548         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
3549             sizeof (spa->spa_cksum_salt.zcs_bytes),
3550             spa->spa_cksum_salt.zcs_bytes);
3551
3552         rc = check_mos_features(spa);
3553         if (rc != 0) {
3554                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
3555                 return (rc);
3556         }
3557
3558         rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG,
3559             sizeof (config_object), 1, &config_object);
3560         if (rc != 0) {
3561                 printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG);
3562                 return (EIO);
3563         }
3564         rc = load_nvlist(spa, config_object, &nvlist);
3565         if (rc != 0)
3566                 return (rc);
3567
3568         rc = zap_lookup(spa, &dir, DMU_POOL_ZPOOL_CHECKPOINT,
3569             sizeof(uint64_t), sizeof(checkpoint) / sizeof(uint64_t),
3570             &checkpoint);
3571         if (rc == 0 && checkpoint.ub_checkpoint_txg != 0) {
3572                 memcpy(&spa->spa_uberblock_checkpoint, &checkpoint,
3573                     sizeof(checkpoint));
3574                 if (zio_read(spa, &spa->spa_uberblock_checkpoint.ub_rootbp,
3575                     &spa->spa_mos_checkpoint)) {
3576                         printf("ZFS: can not read checkpoint data.\n");
3577                         return (EIO);
3578                 }
3579         }
3580
3581         /*
3582          * Update vdevs from MOS config. Note, we do skip encoding bytes
3583          * here. See also vdev_label_read_config().
3584          */
3585         rc = vdev_init_from_nvlist(spa, nvlist);
3586         nvlist_destroy(nvlist);
3587         return (rc);
3588 }
3589
3590 static int
3591 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
3592 {
3593
3594         if (dn->dn_bonustype != DMU_OT_SA) {
3595                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
3596
3597                 sb->st_mode = zp->zp_mode;
3598                 sb->st_uid = zp->zp_uid;
3599                 sb->st_gid = zp->zp_gid;
3600                 sb->st_size = zp->zp_size;
3601         } else {
3602                 sa_hdr_phys_t *sahdrp;
3603                 int hdrsize;
3604                 size_t size = 0;
3605                 void *buf = NULL;
3606
3607                 if (dn->dn_bonuslen != 0)
3608                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3609                 else {
3610                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
3611                                 blkptr_t *bp = DN_SPILL_BLKPTR(dn);
3612                                 int error;
3613
3614                                 size = BP_GET_LSIZE(bp);
3615                                 buf = malloc(size);
3616                                 if (buf == NULL)
3617                                         error = ENOMEM;
3618                                 else
3619                                         error = zio_read(spa, bp, buf);
3620
3621                                 if (error != 0) {
3622                                         free(buf);
3623                                         return (error);
3624                                 }
3625                                 sahdrp = buf;
3626                         } else {
3627                                 return (EIO);
3628                         }
3629                 }
3630                 hdrsize = SA_HDR_SIZE(sahdrp);
3631                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
3632                     SA_MODE_OFFSET);
3633                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
3634                     SA_UID_OFFSET);
3635                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
3636                     SA_GID_OFFSET);
3637                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
3638                     SA_SIZE_OFFSET);
3639                 free(buf);
3640         }
3641
3642         return (0);
3643 }
3644
3645 static int
3646 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
3647 {
3648         int rc = 0;
3649
3650         if (dn->dn_bonustype == DMU_OT_SA) {
3651                 sa_hdr_phys_t *sahdrp = NULL;
3652                 size_t size = 0;
3653                 void *buf = NULL;
3654                 int hdrsize;
3655                 char *p;
3656
3657                 if (dn->dn_bonuslen != 0) {
3658                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3659                 } else {
3660                         blkptr_t *bp;
3661
3662                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
3663                                 return (EIO);
3664                         bp = DN_SPILL_BLKPTR(dn);
3665
3666                         size = BP_GET_LSIZE(bp);
3667                         buf = malloc(size);
3668                         if (buf == NULL)
3669                                 rc = ENOMEM;
3670                         else
3671                                 rc = zio_read(spa, bp, buf);
3672                         if (rc != 0) {
3673                                 free(buf);
3674                                 return (rc);
3675                         }
3676                         sahdrp = buf;
3677                 }
3678                 hdrsize = SA_HDR_SIZE(sahdrp);
3679                 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
3680                 memcpy(path, p, psize);
3681                 free(buf);
3682                 return (0);
3683         }
3684         /*
3685          * Second test is purely to silence bogus compiler
3686          * warning about accessing past the end of dn_bonus.
3687          */
3688         if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
3689             sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
3690                 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
3691         } else {
3692                 rc = dnode_read(spa, dn, 0, path, psize);
3693         }
3694         return (rc);
3695 }
3696
/*
 * Element of the singly-linked tail queue that zfs_lookup() uses to
 * remember the object numbers it has walked through.
 */
struct obj_list {
	uint64_t objnum;		/* object number */
	STAILQ_ENTRY(obj_list) entry;	/* tail queue linkage */
};
3701
3702 /*
3703  * Lookup a file and return its dnode.
3704  */
3705 static int
3706 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
3707 {
3708         int rc;
3709         uint64_t objnum;
3710         const spa_t *spa;
3711         dnode_phys_t dn;
3712         const char *p, *q;
3713         char element[256];
3714         char path[1024];
3715         int symlinks_followed = 0;
3716         struct stat sb;
3717         struct obj_list *entry, *tentry;
3718         STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
3719
3720         spa = mount->spa;
3721         if (mount->objset.os_type != DMU_OST_ZFS) {
3722                 printf("ZFS: unexpected object set type %ju\n",
3723                     (uintmax_t)mount->objset.os_type);
3724                 return (EIO);
3725         }
3726
3727         if ((entry = malloc(sizeof(struct obj_list))) == NULL)
3728                 return (ENOMEM);
3729
3730         /*
3731          * Get the root directory dnode.
3732          */
3733         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
3734         if (rc) {
3735                 free(entry);
3736                 return (rc);
3737         }
3738
3739         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum);
3740         if (rc) {
3741                 free(entry);
3742                 return (rc);
3743         }
3744         entry->objnum = objnum;
3745         STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3746
3747         rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3748         if (rc != 0)
3749                 goto done;
3750
3751         p = upath;
3752         while (p && *p) {
3753                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3754                 if (rc != 0)
3755                         goto done;
3756
3757                 while (*p == '/')
3758                         p++;
3759                 if (*p == '\0')
3760                         break;
3761                 q = p;
3762                 while (*q != '\0' && *q != '/')
3763                         q++;
3764
3765                 /* skip dot */
3766                 if (p + 1 == q && p[0] == '.') {
3767                         p++;
3768                         continue;
3769                 }
3770                 /* double dot */
3771                 if (p + 2 == q && p[0] == '.' && p[1] == '.') {
3772                         p += 2;
3773                         if (STAILQ_FIRST(&on_cache) ==
3774                             STAILQ_LAST(&on_cache, obj_list, entry)) {
3775                                 rc = ENOENT;
3776                                 goto done;
3777                         }
3778                         entry = STAILQ_FIRST(&on_cache);
3779                         STAILQ_REMOVE_HEAD(&on_cache, entry);
3780                         free(entry);
3781                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
3782                         continue;
3783                 }
3784                 if (q - p + 1 > sizeof(element)) {
3785                         rc = ENAMETOOLONG;
3786                         goto done;
3787                 }
3788                 memcpy(element, p, q - p);
3789                 element[q - p] = 0;
3790                 p = q;
3791
3792                 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
3793                         goto done;
3794                 if (!S_ISDIR(sb.st_mode)) {
3795                         rc = ENOTDIR;
3796                         goto done;
3797                 }
3798
3799                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
3800                 if (rc)
3801                         goto done;
3802                 objnum = ZFS_DIRENT_OBJ(objnum);
3803
3804                 if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
3805                         rc = ENOMEM;
3806                         goto done;
3807                 }
3808                 entry->objnum = objnum;
3809                 STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3810                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3811                 if (rc)
3812                         goto done;
3813
3814                 /*
3815                  * Check for symlink.
3816                  */
3817                 rc = zfs_dnode_stat(spa, &dn, &sb);
3818                 if (rc)
3819                         goto done;
3820                 if (S_ISLNK(sb.st_mode)) {
3821                         if (symlinks_followed > 10) {
3822                                 rc = EMLINK;
3823                                 goto done;
3824                         }
3825                         symlinks_followed++;
3826
3827                         /*
3828                          * Read the link value and copy the tail of our
3829                          * current path onto the end.
3830                          */
3831                         if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
3832                                 rc = ENAMETOOLONG;
3833                                 goto done;
3834                         }
3835                         strcpy(&path[sb.st_size], p);
3836
3837                         rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
3838                         if (rc != 0)
3839                                 goto done;
3840
3841                         /*
3842                          * Restart with the new path, starting either at
3843                          * the root or at the parent depending whether or
3844                          * not the link is relative.
3845                          */
3846                         p = path;
3847                         if (*p == '/') {
3848                                 while (STAILQ_FIRST(&on_cache) !=
3849                                     STAILQ_LAST(&on_cache, obj_list, entry)) {
3850                                         entry = STAILQ_FIRST(&on_cache);
3851                                         STAILQ_REMOVE_HEAD(&on_cache, entry);
3852                                         free(entry);
3853                                 }
3854                         } else {
3855                                 entry = STAILQ_FIRST(&on_cache);
3856                                 STAILQ_REMOVE_HEAD(&on_cache, entry);
3857                                 free(entry);
3858                         }
3859                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
3860                 }
3861         }
3862
3863         *dnode = dn;
3864 done:
3865         STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
3866                 free(entry);
3867         return (rc);
3868 }