]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - stand/libsa/zfs/zfsimpl.c
zfs: merge openzfs/zfs@d96e29576
[FreeBSD/FreeBSD.git] / stand / libsa / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <stdbool.h>
35 #include <sys/endian.h>
36 #include <sys/stat.h>
37 #include <sys/stdint.h>
38 #include <sys/list.h>
39 #include <sys/zfs_bootenv.h>
40 #include <machine/_inttypes.h>
41
42 #include "zfsimpl.h"
43 #include "zfssubr.c"
44
/*
 * Prototype for zstd_init().  It is declared here, and called from
 * zfs_init(), only when the build defines HAS_ZSTD_ZFS.
 */
#ifdef HAS_ZSTD_ZFS
extern int zstd_init(void);
#endif
48
/*
 * One element of the zfsmount list, linked through "next".
 *
 * NOTE(review): the code that fills these fields in is outside this part
 * of the file; the per-field notes are inferred from names and types only
 * and should be confirmed against the users of the structure.
 */
struct zfsmount {
        char                    *path;          /* presumably the mount path */
        const spa_t             *spa;           /* pool this entry refers to */
        objset_phys_t           objset;         /* embedded copy, not a pointer */
        uint64_t                rootobj;        /* presumably root object number */
        STAILQ_ENTRY(zfsmount)  next;           /* list linkage */
};
56
/*
 * List head type for struct zfsmount entries and the file-scope list
 * itself, statically initialized empty.  Note that the variable shares its
 * name with the struct tag.
 */
typedef STAILQ_HEAD(zfs_mnt_list, zfsmount) zfs_mnt_list_t;
static zfs_mnt_list_t zfsmount = STAILQ_HEAD_INITIALIZER(zfsmount);
59
60 /*
61  * The indirect_child_t represents the vdev that we will read from, when we
62  * need to read all copies of the data (e.g. for scrub or reconstruction).
63  * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
64  * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
65  * ic_vdev is a child of the mirror.
66  */
typedef struct indirect_child {
        /* Released with free() by vdev_indirect_map_free(). */
        void *ic_data;
        /*
         * Set by vdev_indirect_gather_splits(): a child of the mirror for
         * mirror vdevs, otherwise the split's vdev itself.
         */
        vdev_t *ic_vdev;
} indirect_child_t;
71
72 /*
73  * The indirect_split_t represents one mapped segment of an i/o to the
74  * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
75  * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
76  * For split blocks, there will be several of these.
77  */
typedef struct indirect_split {
        list_node_t is_node; /* link on iv_splits */

        /*
         * is_split_offset is the offset into the i/o.
         * This is the sum of the previous splits' is_size's.
         */
        uint64_t is_split_offset;

        vdev_t *is_vdev; /* top-level vdev */
        uint64_t is_target_offset; /* offset on is_vdev */
        uint64_t is_size; /* length of this segment */
        /*
         * vdev_indirect_gather_splits() sets this to v_nchildren for
         * mirror vdevs and to 1 otherwise.
         */
        int is_children; /* number of entries in is_child[] */

        /*
         * is_good_child is the child that we are currently using to
         * attempt reconstruction.
         */
        int is_good_child;

        /*
         * Declared with one element, but allocated with room for
         * is_children elements via offsetof(indirect_split_t, is_child[n]).
         */
        indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;
100
101 /*
102  * The indirect_vsd_t is associated with each i/o to the indirect vdev.
103  * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
104  */
typedef struct indirect_vsd {
        /*
         * Set to B_TRUE by vdev_indirect_read() when the first split does
         * not cover the whole i/o.
         */
        boolean_t iv_split_block;
        /* NOTE(review): not referenced in this part of the file. */
        boolean_t iv_reconstruct;

        /* Filled by vdev_indirect_gather_splits(), in insertion order. */
        list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;
111
/*
 * List of all vdevs, chained through v_alllink.
 *
 * Initialized by zfs_init().  vdev_create() appends every vdev that has a
 * read function, and vdev_find() searches the list by guid.
 */
static vdev_list_t zfs_vdevs;
116
/*
 * List of ZFS features supported for read.
 *
 * NULL-terminated.  nvlist_check_features_for_read() compares each name in
 * a pool's ZPOOL_CONFIG_FEATURES_FOR_READ nvlist against this table and
 * reports the ones that are not listed.
 */
static const char *features_for_read[] = {
        "com.datto:bookmark_v2",
        "com.datto:encryption",
        "com.datto:resilver_defer",
        "com.delphix:bookmark_written",
        "com.delphix:device_removal",
        "com.delphix:embedded_data",
        "com.delphix:extensible_dataset",
        "com.delphix:head_errlog",
        "com.delphix:hole_birth",
        "com.delphix:obsolete_counts",
        "com.delphix:spacemap_histogram",
        "com.delphix:spacemap_v2",
        "com.delphix:zpool_checkpoint",
        "com.intel:allocation_classes",
        "com.joyent:multi_vdev_crash_dump",
        "org.freebsd:zstd_compress",
        "org.illumos:lz4_compress",
        "org.illumos:sha512",
        "org.illumos:skein",
        "org.open-zfs:large_blocks",
        "org.openzfs:blake3",
        "org.zfsonlinux:allocation_classes",
        "org.zfsonlinux:large_dnode",
        NULL    /* terminator */
};
146
/*
 * List of all pools, chained through spa_link.
 * Initialized by zfs_init().
 */
static spa_list_t zfs_pools;

/*
 * Dnode cache state.  dnode_cache_buf is allocated by zfs_init() with
 * SPA_MAXBLOCKSIZE bytes.
 * NOTE(review): dnode_cache_obj and dnode_cache_bn are not used in this
 * part of the file; presumably they identify what the buffer currently
 * holds -- confirm against the code that reads dnodes.
 */
static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;
155
/*
 * Forward declarations for functions that are used before they are
 * defined (for example, vdev_indirect_gather_splits() compares v_read
 * against vdev_indirect_read and vdev_mirror_read).
 */
static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
/* The two functions below are not static, i.e. they have external linkage. */
vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
    uint64_t);
vdev_indirect_mapping_entry_phys_t *
    vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
    uint64_t, uint64_t *);
174
175 static void
176 zfs_init(void)
177 {
178         STAILQ_INIT(&zfs_vdevs);
179         STAILQ_INIT(&zfs_pools);
180
181         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
182
183         zfs_init_crc();
184 #ifdef HAS_ZSTD_ZFS
185         zstd_init();
186 #endif
187 }
188
189 static int
190 nvlist_check_features_for_read(nvlist_t *nvl)
191 {
192         nvlist_t *features = NULL;
193         nvs_data_t *data;
194         nvp_header_t *nvp;
195         nv_string_t *nvp_name;
196         int rc;
197
198         rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ,
199             DATA_TYPE_NVLIST, NULL, &features, NULL);
200         switch (rc) {
201         case 0:
202                 break;          /* Continue with checks */
203
204         case ENOENT:
205                 return (0);     /* All features are disabled */
206
207         default:
208                 return (rc);    /* Error while reading nvlist */
209         }
210
211         data = (nvs_data_t *)features->nv_data;
212         nvp = &data->nvl_pair;  /* first pair in nvlist */
213
214         while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
215                 int i, found;
216
217                 nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
218                 found = 0;
219
220                 for (i = 0; features_for_read[i] != NULL; i++) {
221                         if (memcmp(nvp_name->nv_data, features_for_read[i],
222                             nvp_name->nv_size) == 0) {
223                                 found = 1;
224                                 break;
225                         }
226                 }
227
228                 if (!found) {
229                         printf("ZFS: unsupported feature: %.*s\n",
230                             nvp_name->nv_size, nvp_name->nv_data);
231                         rc = EIO;
232                 }
233                 nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
234         }
235         nvlist_destroy(features);
236
237         return (rc);
238 }
239
240 static int
241 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
242     off_t offset, size_t size)
243 {
244         size_t psize;
245         int rc;
246
247         if (vdev->v_phys_read == NULL)
248                 return (ENOTSUP);
249
250         if (bp) {
251                 psize = BP_GET_PSIZE(bp);
252         } else {
253                 psize = size;
254         }
255
256         rc = vdev->v_phys_read(vdev, vdev->v_priv, offset, buf, psize);
257         if (rc == 0) {
258                 if (bp != NULL)
259                         rc = zio_checksum_verify(vdev->v_spa, bp, buf);
260         }
261
262         return (rc);
263 }
264
265 static int
266 vdev_write_phys(vdev_t *vdev, void *buf, off_t offset, size_t size)
267 {
268         if (vdev->v_phys_write == NULL)
269                 return (ENOTSUP);
270
271         return (vdev->v_phys_write(vdev, offset, buf, size));
272 }
273
/*
 * One pending extent for vdev_indirect_remap().  Segments are created by
 * rs_alloc() and kept on that function's local "stack" list.
 */
typedef struct remap_segment {
        vdev_t *rs_vd;                  /* vdev the extent is on */
        uint64_t rs_offset;             /* start of the extent on rs_vd */
        uint64_t rs_asize;              /* bytes still to be remapped */
        uint64_t rs_split_offset;       /* offset of the extent within the i/o */
        list_node_t rs_node;            /* linkage on the remap stack */
} remap_segment_t;
281
282 static remap_segment_t *
283 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
284 {
285         remap_segment_t *rs = malloc(sizeof (remap_segment_t));
286
287         if (rs != NULL) {
288                 rs->rs_vd = vd;
289                 rs->rs_offset = offset;
290                 rs->rs_asize = asize;
291                 rs->rs_split_offset = split_offset;
292         }
293
294         return (rs);
295 }
296
297 vdev_indirect_mapping_t *
298 vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
299     uint64_t mapping_object)
300 {
301         vdev_indirect_mapping_t *vim;
302         vdev_indirect_mapping_phys_t *vim_phys;
303         int rc;
304
305         vim = calloc(1, sizeof (*vim));
306         if (vim == NULL)
307                 return (NULL);
308
309         vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
310         if (vim->vim_dn == NULL) {
311                 free(vim);
312                 return (NULL);
313         }
314
315         rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
316         if (rc != 0) {
317                 free(vim->vim_dn);
318                 free(vim);
319                 return (NULL);
320         }
321
322         vim->vim_spa = spa;
323         vim->vim_phys = malloc(sizeof (*vim->vim_phys));
324         if (vim->vim_phys == NULL) {
325                 free(vim->vim_dn);
326                 free(vim);
327                 return (NULL);
328         }
329
330         vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
331         *vim->vim_phys = *vim_phys;
332
333         vim->vim_objset = os;
334         vim->vim_object = mapping_object;
335         vim->vim_entries = NULL;
336
337         vim->vim_havecounts =
338             (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
339
340         return (vim);
341 }
342
343 /*
344  * Compare an offset with an indirect mapping entry; there are three
345  * possible scenarios:
346  *
347  *     1. The offset is "less than" the mapping entry; meaning the
348  *        offset is less than the source offset of the mapping entry. In
349  *        this case, there is no overlap between the offset and the
350  *        mapping entry and -1 will be returned.
351  *
352  *     2. The offset is "greater than" the mapping entry; meaning the
353  *        offset is greater than the mapping entry's source offset plus
354  *        the entry's size. In this case, there is no overlap between
355  *        the offset and the mapping entry and 1 will be returned.
356  *
357  *        NOTE: If the offset is actually equal to the entry's offset
358  *        plus size, this is considered to be "greater" than the entry,
359  *        and this case applies (i.e. 1 will be returned). Thus, the
360  *        entry's "range" can be considered to be inclusive at its
361  *        start, but exclusive at its end: e.g. [src, src + size).
362  *
363  *     3. The last case to consider is if the offset actually falls
364  *        within the mapping entry's range. If this is the case, the
365  *        offset is considered to be "equal to" the mapping entry and
366  *        0 will be returned.
367  *
368  *        NOTE: If the offset is equal to the entry's source offset,
369  *        this case applies and 0 will be returned. If the offset is
370  *        equal to the entry's source plus its size, this case does
371  *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
372  *        returned.
373  */
374 static int
375 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
376 {
377         const uint64_t *key = v_key;
378         const vdev_indirect_mapping_entry_phys_t *array_elem =
379             v_array_elem;
380         uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
381
382         if (*key < src_offset) {
383                 return (-1);
384         } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
385                 return (0);
386         } else {
387                 return (1);
388         }
389 }
390
391 /*
392  * Return array entry.
393  */
394 static vdev_indirect_mapping_entry_phys_t *
395 vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
396 {
397         uint64_t size;
398         off_t offset = 0;
399         int rc;
400
401         if (vim->vim_phys->vimp_num_entries == 0)
402                 return (NULL);
403
404         if (vim->vim_entries == NULL) {
405                 uint64_t bsize;
406
407                 bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
408                 size = vim->vim_phys->vimp_num_entries *
409                     sizeof (*vim->vim_entries);
410                 if (size > bsize) {
411                         size = bsize / sizeof (*vim->vim_entries);
412                         size *= sizeof (*vim->vim_entries);
413                 }
414                 vim->vim_entries = malloc(size);
415                 if (vim->vim_entries == NULL)
416                         return (NULL);
417                 vim->vim_num_entries = size / sizeof (*vim->vim_entries);
418                 offset = index * sizeof (*vim->vim_entries);
419         }
420
421         /* We have data in vim_entries */
422         if (offset == 0) {
423                 if (index >= vim->vim_entry_offset &&
424                     index <= vim->vim_entry_offset + vim->vim_num_entries) {
425                         index -= vim->vim_entry_offset;
426                         return (&vim->vim_entries[index]);
427                 }
428                 offset = index * sizeof (*vim->vim_entries);
429         }
430
431         vim->vim_entry_offset = index;
432         size = vim->vim_num_entries * sizeof (*vim->vim_entries);
433         rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
434             size);
435         if (rc != 0) {
436                 /* Read error, invalidate vim_entries. */
437                 free(vim->vim_entries);
438                 vim->vim_entries = NULL;
439                 return (NULL);
440         }
441         index -= vim->vim_entry_offset;
442         return (&vim->vim_entries[index]);
443 }
444
445 /*
446  * Returns the mapping entry for the given offset.
447  *
448  * It's possible that the given offset will not be in the mapping table
449  * (i.e. no mapping entries contain this offset), in which case, the
450  * return value depends on the "next_if_missing" parameter.
451  *
452  * If the offset is not found in the table and "next_if_missing" is
453  * B_FALSE, then NULL will always be returned. The behavior is intended
454  * to allow consumers to get the entry corresponding to the offset
455  * parameter, iff the offset overlaps with an entry in the table.
456  *
457  * If the offset is not found in the table and "next_if_missing" is
458  * B_TRUE, then the entry nearest to the given offset will be returned,
459  * such that the entry's source offset is greater than the offset
460  * passed in (i.e. the "next" mapping entry in the table is returned, if
461  * the offset is missing from the table). If there are no entries whose
462  * source offset is greater than the passed in offset, NULL is returned.
463  */
464 static vdev_indirect_mapping_entry_phys_t *
465 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
466     uint64_t offset)
467 {
468         ASSERT(vim->vim_phys->vimp_num_entries > 0);
469
470         vdev_indirect_mapping_entry_phys_t *entry;
471
472         uint64_t last = vim->vim_phys->vimp_num_entries - 1;
473         uint64_t base = 0;
474
475         /*
476          * We don't define these inside of the while loop because we use
477          * their value in the case that offset isn't in the mapping.
478          */
479         uint64_t mid;
480         int result;
481
482         while (last >= base) {
483                 mid = base + ((last - base) >> 1);
484
485                 entry = vdev_indirect_mapping_entry(vim, mid);
486                 if (entry == NULL)
487                         break;
488                 result = dva_mapping_overlap_compare(&offset, entry);
489
490                 if (result == 0) {
491                         break;
492                 } else if (result < 0) {
493                         last = mid - 1;
494                 } else {
495                         base = mid + 1;
496                 }
497         }
498         return (entry);
499 }
500
501 /*
502  * Given an indirect vdev and an extent on that vdev, it duplicates the
503  * physical entries of the indirect mapping that correspond to the extent
504  * to a new array and returns a pointer to it. In addition, copied_entries
505  * is populated with the number of mapping entries that were duplicated.
506  *
507  * Finally, since we are doing an allocation, it is up to the caller to
508  * free the array allocated in this function.
509  */
510 vdev_indirect_mapping_entry_phys_t *
511 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
512     uint64_t asize, uint64_t *copied_entries)
513 {
514         vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
515         vdev_indirect_mapping_t *vim = vd->v_mapping;
516         uint64_t entries = 0;
517
518         vdev_indirect_mapping_entry_phys_t *first_mapping =
519             vdev_indirect_mapping_entry_for_offset(vim, offset);
520         ASSERT3P(first_mapping, !=, NULL);
521
522         vdev_indirect_mapping_entry_phys_t *m = first_mapping;
523         while (asize > 0) {
524                 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
525                 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
526                 uint64_t inner_size = MIN(asize, size - inner_offset);
527
528                 offset += inner_size;
529                 asize -= inner_size;
530                 entries++;
531                 m++;
532         }
533
534         size_t copy_length = entries * sizeof (*first_mapping);
535         duplicate_mappings = malloc(copy_length);
536         if (duplicate_mappings != NULL)
537                 bcopy(first_mapping, duplicate_mappings, copy_length);
538         else
539                 entries = 0;
540
541         *copied_entries = entries;
542
543         return (duplicate_mappings);
544 }
545
546 static vdev_t *
547 vdev_lookup_top(spa_t *spa, uint64_t vdev)
548 {
549         vdev_t *rvd;
550         vdev_list_t *vlist;
551
552         vlist = &spa->spa_root_vdev->v_children;
553         STAILQ_FOREACH(rvd, vlist, v_childlink)
554                 if (rvd->v_id == vdev)
555                         break;
556
557         return (rvd);
558 }
559
560 /*
561  * This is a callback for vdev_indirect_remap() which allocates an
562  * indirect_split_t for each split segment and adds it to iv_splits.
563  */
564 static void
565 vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
566     uint64_t size, void *arg)
567 {
568         int n = 1;
569         zio_t *zio = arg;
570         indirect_vsd_t *iv = zio->io_vsd;
571
572         if (vd->v_read == vdev_indirect_read)
573                 return;
574
575         if (vd->v_read == vdev_mirror_read)
576                 n = vd->v_nchildren;
577
578         indirect_split_t *is =
579             malloc(offsetof(indirect_split_t, is_child[n]));
580         if (is == NULL) {
581                 zio->io_error = ENOMEM;
582                 return;
583         }
584         bzero(is, offsetof(indirect_split_t, is_child[n]));
585
586         is->is_children = n;
587         is->is_size = size;
588         is->is_split_offset = split_offset;
589         is->is_target_offset = offset;
590         is->is_vdev = vd;
591
592         /*
593          * Note that we only consider multiple copies of the data for
594          * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
595          * though they use the same ops as mirror, because there's only one
596          * "good" copy under the replacing/spare.
597          */
598         if (vd->v_read == vdev_mirror_read) {
599                 int i = 0;
600                 vdev_t *kid;
601
602                 STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
603                         is->is_child[i++].ic_vdev = kid;
604                 }
605         } else {
606                 is->is_child[0].ic_vdev = vd;
607         }
608
609         list_insert_tail(&iv->iv_splits, is);
610 }
611
612 static void
613 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
614 {
615         list_t stack;
616         spa_t *spa = vd->v_spa;
617         zio_t *zio = arg;
618         remap_segment_t *rs;
619
620         list_create(&stack, sizeof (remap_segment_t),
621             offsetof(remap_segment_t, rs_node));
622
623         rs = rs_alloc(vd, offset, asize, 0);
624         if (rs == NULL) {
625                 printf("vdev_indirect_remap: out of memory.\n");
626                 zio->io_error = ENOMEM;
627         }
628         for (; rs != NULL; rs = list_remove_head(&stack)) {
629                 vdev_t *v = rs->rs_vd;
630                 uint64_t num_entries = 0;
631                 /* vdev_indirect_mapping_t *vim = v->v_mapping; */
632                 vdev_indirect_mapping_entry_phys_t *mapping =
633                     vdev_indirect_mapping_duplicate_adjacent_entries(v,
634                     rs->rs_offset, rs->rs_asize, &num_entries);
635
636                 if (num_entries == 0)
637                         zio->io_error = ENOMEM;
638
639                 for (uint64_t i = 0; i < num_entries; i++) {
640                         vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
641                         uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
642                         uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
643                         uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
644                         uint64_t inner_offset = rs->rs_offset -
645                             DVA_MAPPING_GET_SRC_OFFSET(m);
646                         uint64_t inner_size =
647                             MIN(rs->rs_asize, size - inner_offset);
648                         vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
649
650                         if (dst_v->v_read == vdev_indirect_read) {
651                                 remap_segment_t *o;
652
653                                 o = rs_alloc(dst_v, dst_offset + inner_offset,
654                                     inner_size, rs->rs_split_offset);
655                                 if (o == NULL) {
656                                         printf("vdev_indirect_remap: "
657                                             "out of memory.\n");
658                                         zio->io_error = ENOMEM;
659                                         break;
660                                 }
661
662                                 list_insert_head(&stack, o);
663                         }
664                         vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
665                             dst_offset + inner_offset,
666                             inner_size, arg);
667
668                         /*
669                          * vdev_indirect_gather_splits can have memory
670                          * allocation error, we can not recover from it.
671                          */
672                         if (zio->io_error != 0)
673                                 break;
674                         rs->rs_offset += inner_size;
675                         rs->rs_asize -= inner_size;
676                         rs->rs_split_offset += inner_size;
677                 }
678
679                 free(mapping);
680                 free(rs);
681                 if (zio->io_error != 0)
682                         break;
683         }
684
685         list_destroy(&stack);
686 }
687
688 static void
689 vdev_indirect_map_free(zio_t *zio)
690 {
691         indirect_vsd_t *iv = zio->io_vsd;
692         indirect_split_t *is;
693
694         while ((is = list_head(&iv->iv_splits)) != NULL) {
695                 for (int c = 0; c < is->is_children; c++) {
696                         indirect_child_t *ic = &is->is_child[c];
697                         free(ic->ic_data);
698                 }
699                 list_remove(&iv->iv_splits, is);
700                 free(is);
701         }
702         free(iv);
703 }
704
705 static int
706 vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
707     off_t offset, size_t bytes)
708 {
709         zio_t zio;
710         spa_t *spa = vdev->v_spa;
711         indirect_vsd_t *iv;
712         indirect_split_t *first;
713         int rc = EIO;
714
715         iv = calloc(1, sizeof(*iv));
716         if (iv == NULL)
717                 return (ENOMEM);
718
719         list_create(&iv->iv_splits,
720             sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
721
722         bzero(&zio, sizeof(zio));
723         zio.io_spa = spa;
724         zio.io_bp = (blkptr_t *)bp;
725         zio.io_data = buf;
726         zio.io_size = bytes;
727         zio.io_offset = offset;
728         zio.io_vd = vdev;
729         zio.io_vsd = iv;
730
731         if (vdev->v_mapping == NULL) {
732                 vdev_indirect_config_t *vic;
733
734                 vic = &vdev->vdev_indirect_config;
735                 vdev->v_mapping = vdev_indirect_mapping_open(spa,
736                     spa->spa_mos, vic->vic_mapping_object);
737         }
738
739         vdev_indirect_remap(vdev, offset, bytes, &zio);
740         if (zio.io_error != 0)
741                 return (zio.io_error);
742
743         first = list_head(&iv->iv_splits);
744         if (first->is_size == zio.io_size) {
745                 /*
746                  * This is not a split block; we are pointing to the entire
747                  * data, which will checksum the same as the original data.
748                  * Pass the BP down so that the child i/o can verify the
749                  * checksum, and try a different location if available
750                  * (e.g. on a mirror).
751                  *
752                  * While this special case could be handled the same as the
753                  * general (split block) case, doing it this way ensures
754                  * that the vast majority of blocks on indirect vdevs
755                  * (which are not split) are handled identically to blocks
756                  * on non-indirect vdevs.  This allows us to be less strict
757                  * about performance in the general (but rare) case.
758                  */
759                 rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
760                     zio.io_data, first->is_target_offset, bytes);
761         } else {
762                 iv->iv_split_block = B_TRUE;
763                 /*
764                  * Read one copy of each split segment, from the
765                  * top-level vdev.  Since we don't know the
766                  * checksum of each split individually, the child
767                  * zio can't ensure that we get the right data.
768                  * E.g. if it's a mirror, it will just read from a
769                  * random (healthy) leaf vdev.  We have to verify
770                  * the checksum in vdev_indirect_io_done().
771                  */
772                 for (indirect_split_t *is = list_head(&iv->iv_splits);
773                     is != NULL; is = list_next(&iv->iv_splits, is)) {
774                         char *ptr = zio.io_data;
775
776                         rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
777                             ptr + is->is_split_offset, is->is_target_offset,
778                             is->is_size);
779                 }
780                 if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
781                         rc = ECKSUM;
782                 else
783                         rc = 0;
784         }
785
786         vdev_indirect_map_free(&zio);
787         if (rc == 0)
788                 rc = zio.io_error;
789
790         return (rc);
791 }
792
793 static int
794 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
795     off_t offset, size_t bytes)
796 {
797
798         return (vdev_read_phys(vdev, bp, buf,
799             offset + VDEV_LABEL_START_SIZE, bytes));
800 }
801
802 static int
803 vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused,
804     void *buf __unused, off_t offset __unused, size_t bytes __unused)
805 {
806
807         return (ENOTSUP);
808 }
809
810 static int
811 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
812     off_t offset, size_t bytes)
813 {
814         vdev_t *kid;
815         int rc;
816
817         rc = EIO;
818         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
819                 if (kid->v_state != VDEV_STATE_HEALTHY)
820                         continue;
821                 rc = kid->v_read(kid, bp, buf, offset, bytes);
822                 if (!rc)
823                         return (0);
824         }
825
826         return (rc);
827 }
828
829 static int
830 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
831     off_t offset, size_t bytes)
832 {
833         vdev_t *kid;
834
835         /*
836          * Here we should have two kids:
837          * First one which is the one we are replacing and we can trust
838          * only this one to have valid data, but it might not be present.
839          * Second one is that one we are replacing with. It is most likely
840          * healthy, but we can't trust it has needed data, so we won't use it.
841          */
842         kid = STAILQ_FIRST(&vdev->v_children);
843         if (kid == NULL)
844                 return (EIO);
845         if (kid->v_state != VDEV_STATE_HEALTHY)
846                 return (EIO);
847         return (kid->v_read(kid, bp, buf, offset, bytes));
848 }
849
850 static vdev_t *
851 vdev_find(uint64_t guid)
852 {
853         vdev_t *vdev;
854
855         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
856                 if (vdev->v_guid == guid)
857                         return (vdev);
858
859         return (0);
860 }
861
862 static vdev_t *
863 vdev_create(uint64_t guid, vdev_read_t *_read)
864 {
865         vdev_t *vdev;
866         vdev_indirect_config_t *vic;
867
868         vdev = calloc(1, sizeof(vdev_t));
869         if (vdev != NULL) {
870                 STAILQ_INIT(&vdev->v_children);
871                 vdev->v_guid = guid;
872                 vdev->v_read = _read;
873
874                 /*
875                  * root vdev has no read function, we use this fact to
876                  * skip setting up data we do not need for root vdev.
877                  * We only point root vdev from spa.
878                  */
879                 if (_read != NULL) {
880                         vic = &vdev->vdev_indirect_config;
881                         vic->vic_prev_indirect_vdev = UINT64_MAX;
882                         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
883                 }
884         }
885
886         return (vdev);
887 }
888
889 static void
890 vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist)
891 {
892         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
893         uint64_t is_log;
894
895         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
896         is_log = 0;
897         (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
898             &is_offline, NULL);
899         (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
900             &is_removed, NULL);
901         (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
902             &is_faulted, NULL);
903         (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
904             NULL, &is_degraded, NULL);
905         (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
906             NULL, &isnt_present, NULL);
907         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
908             &is_log, NULL);
909
910         if (is_offline != 0)
911                 vdev->v_state = VDEV_STATE_OFFLINE;
912         else if (is_removed != 0)
913                 vdev->v_state = VDEV_STATE_REMOVED;
914         else if (is_faulted != 0)
915                 vdev->v_state = VDEV_STATE_FAULTED;
916         else if (is_degraded != 0)
917                 vdev->v_state = VDEV_STATE_DEGRADED;
918         else if (isnt_present != 0)
919                 vdev->v_state = VDEV_STATE_CANT_OPEN;
920
921         vdev->v_islog = is_log != 0;
922 }
923
924 static int
925 vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
926 {
927         uint64_t id, ashift, asize, nparity;
928         const char *path;
929         const char *type;
930         int len, pathlen;
931         char *name;
932         vdev_t *vdev;
933
934         if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id,
935             NULL) ||
936             nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL,
937             &type, &len)) {
938                 return (ENOENT);
939         }
940
941         if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
942             memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
943 #ifdef ZFS_TEST
944             memcmp(type, VDEV_TYPE_FILE, len) != 0 &&
945 #endif
946             memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 &&
947             memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 &&
948             memcmp(type, VDEV_TYPE_REPLACING, len) != 0 &&
949             memcmp(type, VDEV_TYPE_HOLE, len) != 0) {
950                 printf("ZFS: can only boot from disk, mirror, raidz1, "
951                     "raidz2 and raidz3 vdevs, got: %.*s\n", len, type);
952                 return (EIO);
953         }
954
955         if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0)
956                 vdev = vdev_create(guid, vdev_mirror_read);
957         else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0)
958                 vdev = vdev_create(guid, vdev_raidz_read);
959         else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0)
960                 vdev = vdev_create(guid, vdev_replacing_read);
961         else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) {
962                 vdev_indirect_config_t *vic;
963
964                 vdev = vdev_create(guid, vdev_indirect_read);
965                 if (vdev != NULL) {
966                         vdev->v_state = VDEV_STATE_HEALTHY;
967                         vic = &vdev->vdev_indirect_config;
968
969                         nvlist_find(nvlist,
970                             ZPOOL_CONFIG_INDIRECT_OBJECT,
971                             DATA_TYPE_UINT64,
972                             NULL, &vic->vic_mapping_object, NULL);
973                         nvlist_find(nvlist,
974                             ZPOOL_CONFIG_INDIRECT_BIRTHS,
975                             DATA_TYPE_UINT64,
976                             NULL, &vic->vic_births_object, NULL);
977                         nvlist_find(nvlist,
978                             ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
979                             DATA_TYPE_UINT64,
980                             NULL, &vic->vic_prev_indirect_vdev, NULL);
981                 }
982         } else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) {
983                 vdev = vdev_create(guid, vdev_missing_read);
984         } else {
985                 vdev = vdev_create(guid, vdev_disk_read);
986         }
987
988         if (vdev == NULL)
989                 return (ENOMEM);
990
991         vdev_set_initial_state(vdev, nvlist);
992         vdev->v_id = id;
993         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
994             DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0)
995                 vdev->v_ashift = ashift;
996
997         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
998             DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
999                 vdev->v_psize = asize +
1000                     VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
1001         }
1002
1003         if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
1004             DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0)
1005                 vdev->v_nparity = nparity;
1006
1007         if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
1008             DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) {
1009                 char prefix[] = "/dev/";
1010
1011                 len = strlen(prefix);
1012                 if (len < pathlen && memcmp(path, prefix, len) == 0) {
1013                         path += len;
1014                         pathlen -= len;
1015                 }
1016                 name = malloc(pathlen + 1);
1017                 bcopy(path, name, pathlen);
1018                 name[pathlen] = '\0';
1019                 vdev->v_name = name;
1020         } else {
1021                 name = NULL;
1022                 if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
1023                         if (vdev->v_nparity < 1 ||
1024                             vdev->v_nparity > 3) {
1025                                 printf("ZFS: invalid raidz parity: %d\n",
1026                                     vdev->v_nparity);
1027                                 return (EIO);
1028                         }
1029                         (void) asprintf(&name, "%.*s%d-%" PRIu64, len, type,
1030                             vdev->v_nparity, id);
1031                 } else {
1032                         (void) asprintf(&name, "%.*s-%" PRIu64, len, type, id);
1033                 }
1034                 vdev->v_name = name;
1035         }
1036         *vdevp = vdev;
1037         return (0);
1038 }
1039
1040 /*
1041  * Find slot for vdev. We return either NULL to signal to use
1042  * STAILQ_INSERT_HEAD, or we return link element to be used with
1043  * STAILQ_INSERT_AFTER.
1044  */
1045 static vdev_t *
1046 vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
1047 {
1048         vdev_t *v, *previous;
1049
1050         if (STAILQ_EMPTY(&top_vdev->v_children))
1051                 return (NULL);
1052
1053         previous = NULL;
1054         STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
1055                 if (v->v_id > vdev->v_id)
1056                         return (previous);
1057
1058                 if (v->v_id == vdev->v_id)
1059                         return (v);
1060
1061                 if (v->v_id < vdev->v_id)
1062                         previous = v;
1063         }
1064         return (previous);
1065 }
1066
1067 static size_t
1068 vdev_child_count(vdev_t *vdev)
1069 {
1070         vdev_t *v;
1071         size_t count;
1072
1073         count = 0;
1074         STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
1075                 count++;
1076         }
1077         return (count);
1078 }
1079
1080 /*
1081  * Insert vdev into top_vdev children list. List is ordered by v_id.
1082  */
1083 static void
1084 vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
1085 {
1086         vdev_t *previous;
1087         size_t count;
1088
1089         /*
1090          * The top level vdev can appear in random order, depending how
1091          * the firmware is presenting the disk devices.
1092          * However, we will insert vdev to create list ordered by v_id,
1093          * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
1094          * as STAILQ does not have insert before.
1095          */
1096         previous = vdev_find_previous(top_vdev, vdev);
1097
1098         if (previous == NULL) {
1099                 STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
1100         } else if (previous->v_id == vdev->v_id) {
1101                 /*
1102                  * This vdev was configured from label config,
1103                  * do not insert duplicate.
1104                  */
1105                 return;
1106         } else {
1107                 STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
1108                     v_childlink);
1109         }
1110
1111         count = vdev_child_count(top_vdev);
1112         if (top_vdev->v_nchildren < count)
1113                 top_vdev->v_nchildren = count;
1114 }
1115
1116 static int
1117 vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
1118 {
1119         vdev_t *top_vdev, *vdev;
1120         nvlist_t **kids = NULL;
1121         int rc, nkids;
1122
1123         /* Get top vdev. */
1124         top_vdev = vdev_find(top_guid);
1125         if (top_vdev == NULL) {
1126                 rc = vdev_init(top_guid, nvlist, &top_vdev);
1127                 if (rc != 0)
1128                         return (rc);
1129                 top_vdev->v_spa = spa;
1130                 top_vdev->v_top = top_vdev;
1131                 vdev_insert(spa->spa_root_vdev, top_vdev);
1132         }
1133
1134         /* Add children if there are any. */
1135         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1136             &nkids, &kids, NULL);
1137         if (rc == 0) {
1138                 for (int i = 0; i < nkids; i++) {
1139                         uint64_t guid;
1140
1141                         rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
1142                             DATA_TYPE_UINT64, NULL, &guid, NULL);
1143                         if (rc != 0)
1144                                 goto done;
1145
1146                         rc = vdev_init(guid, kids[i], &vdev);
1147                         if (rc != 0)
1148                                 goto done;
1149
1150                         vdev->v_spa = spa;
1151                         vdev->v_top = top_vdev;
1152                         vdev_insert(top_vdev, vdev);
1153                 }
1154         } else {
1155                 /*
1156                  * When there are no children, nvlist_find() does return
1157                  * error, reset it because leaf devices have no children.
1158                  */
1159                 rc = 0;
1160         }
1161 done:
1162         if (kids != NULL) {
1163                 for (int i = 0; i < nkids; i++)
1164                         nvlist_destroy(kids[i]);
1165                 free(kids);
1166         }
1167
1168         return (rc);
1169 }
1170
1171 static int
1172 vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
1173 {
1174         uint64_t pool_guid, top_guid;
1175         nvlist_t *vdevs;
1176         int rc;
1177
1178         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1179             NULL, &pool_guid, NULL) ||
1180             nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
1181             NULL, &top_guid, NULL) ||
1182             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1183             NULL, &vdevs, NULL)) {
1184                 printf("ZFS: can't find vdev details\n");
1185                 return (ENOENT);
1186         }
1187
1188         rc = vdev_from_nvlist(spa, top_guid, vdevs);
1189         nvlist_destroy(vdevs);
1190         return (rc);
1191 }
1192
1193 static void
1194 vdev_set_state(vdev_t *vdev)
1195 {
1196         vdev_t *kid;
1197         int good_kids;
1198         int bad_kids;
1199
1200         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1201                 vdev_set_state(kid);
1202         }
1203
1204         /*
1205          * A mirror or raidz is healthy if all its kids are healthy. A
1206          * mirror is degraded if any of its kids is healthy; a raidz
1207          * is degraded if at most nparity kids are offline.
1208          */
1209         if (STAILQ_FIRST(&vdev->v_children)) {
1210                 good_kids = 0;
1211                 bad_kids = 0;
1212                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1213                         if (kid->v_state == VDEV_STATE_HEALTHY)
1214                                 good_kids++;
1215                         else
1216                                 bad_kids++;
1217                 }
1218                 if (bad_kids == 0) {
1219                         vdev->v_state = VDEV_STATE_HEALTHY;
1220                 } else {
1221                         if (vdev->v_read == vdev_mirror_read) {
1222                                 if (good_kids) {
1223                                         vdev->v_state = VDEV_STATE_DEGRADED;
1224                                 } else {
1225                                         vdev->v_state = VDEV_STATE_OFFLINE;
1226                                 }
1227                         } else if (vdev->v_read == vdev_raidz_read) {
1228                                 if (bad_kids > vdev->v_nparity) {
1229                                         vdev->v_state = VDEV_STATE_OFFLINE;
1230                                 } else {
1231                                         vdev->v_state = VDEV_STATE_DEGRADED;
1232                                 }
1233                         }
1234                 }
1235         }
1236 }
1237
1238 static int
1239 vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
1240 {
1241         vdev_t *vdev;
1242         nvlist_t **kids = NULL;
1243         int rc, nkids;
1244
1245         /* Update top vdev. */
1246         vdev = vdev_find(top_guid);
1247         if (vdev != NULL)
1248                 vdev_set_initial_state(vdev, nvlist);
1249
1250         /* Update children if there are any. */
1251         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1252             &nkids, &kids, NULL);
1253         if (rc == 0) {
1254                 for (int i = 0; i < nkids; i++) {
1255                         uint64_t guid;
1256
1257                         rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
1258                             DATA_TYPE_UINT64, NULL, &guid, NULL);
1259                         if (rc != 0)
1260                                 break;
1261
1262                         vdev = vdev_find(guid);
1263                         if (vdev != NULL)
1264                                 vdev_set_initial_state(vdev, kids[i]);
1265                 }
1266         } else {
1267                 rc = 0;
1268         }
1269         if (kids != NULL) {
1270                 for (int i = 0; i < nkids; i++)
1271                         nvlist_destroy(kids[i]);
1272                 free(kids);
1273         }
1274
1275         return (rc);
1276 }
1277
1278 static int
1279 vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
1280 {
1281         uint64_t pool_guid, vdev_children;
1282         nvlist_t *vdevs = NULL, **kids = NULL;
1283         int rc, nkids;
1284
1285         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1286             NULL, &pool_guid, NULL) ||
1287             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
1288             NULL, &vdev_children, NULL) ||
1289             nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1290             NULL, &vdevs, NULL)) {
1291                 printf("ZFS: can't find vdev details\n");
1292                 return (ENOENT);
1293         }
1294
1295         /* Wrong guid?! */
1296         if (spa->spa_guid != pool_guid) {
1297                 nvlist_destroy(vdevs);
1298                 return (EINVAL);
1299         }
1300
1301         spa->spa_root_vdev->v_nchildren = vdev_children;
1302
1303         rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1304             &nkids, &kids, NULL);
1305         nvlist_destroy(vdevs);
1306
1307         /*
1308          * MOS config has at least one child for root vdev.
1309          */
1310         if (rc != 0)
1311                 return (rc);
1312
1313         for (int i = 0; i < nkids; i++) {
1314                 uint64_t guid;
1315                 vdev_t *vdev;
1316
1317                 rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1318                     NULL, &guid, NULL);
1319                 if (rc != 0)
1320                         break;
1321                 vdev = vdev_find(guid);
1322                 /*
1323                  * Top level vdev is missing, create it.
1324                  */
1325                 if (vdev == NULL)
1326                         rc = vdev_from_nvlist(spa, guid, kids[i]);
1327                 else
1328                         rc = vdev_update_from_nvlist(guid, kids[i]);
1329                 if (rc != 0)
1330                         break;
1331         }
1332         if (kids != NULL) {
1333                 for (int i = 0; i < nkids; i++)
1334                         nvlist_destroy(kids[i]);
1335                 free(kids);
1336         }
1337
1338         /*
1339          * Re-evaluate top-level vdev state.
1340          */
1341         vdev_set_state(spa->spa_root_vdev);
1342
1343         return (rc);
1344 }
1345
1346 static spa_t *
1347 spa_find_by_guid(uint64_t guid)
1348 {
1349         spa_t *spa;
1350
1351         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
1352                 if (spa->spa_guid == guid)
1353                         return (spa);
1354
1355         return (NULL);
1356 }
1357
1358 static spa_t *
1359 spa_find_by_name(const char *name)
1360 {
1361         spa_t *spa;
1362
1363         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
1364                 if (strcmp(spa->spa_name, name) == 0)
1365                         return (spa);
1366
1367         return (NULL);
1368 }
1369
1370 static spa_t *
1371 spa_create(uint64_t guid, const char *name)
1372 {
1373         spa_t *spa;
1374
1375         if ((spa = calloc(1, sizeof(spa_t))) == NULL)
1376                 return (NULL);
1377         if ((spa->spa_name = strdup(name)) == NULL) {
1378                 free(spa);
1379                 return (NULL);
1380         }
1381         spa->spa_uberblock = &spa->spa_uberblock_master;
1382         spa->spa_mos = &spa->spa_mos_master;
1383         spa->spa_guid = guid;
1384         spa->spa_root_vdev = vdev_create(guid, NULL);
1385         if (spa->spa_root_vdev == NULL) {
1386                 free(spa->spa_name);
1387                 free(spa);
1388                 return (NULL);
1389         }
1390         spa->spa_root_vdev->v_name = strdup("root");
1391         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
1392
1393         return (spa);
1394 }
1395
1396 static const char *
1397 state_name(vdev_state_t state)
1398 {
1399         static const char *names[] = {
1400                 "UNKNOWN",
1401                 "CLOSED",
1402                 "OFFLINE",
1403                 "REMOVED",
1404                 "CANT_OPEN",
1405                 "FAULTED",
1406                 "DEGRADED",
1407                 "ONLINE"
1408         };
1409         return (names[state]);
1410 }
1411
#ifndef BOOT2

/*
 * printf-style front end for pager_output().
 * The message is formatted into an 80 byte buffer, so longer output is
 * truncated.  Returns whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
	char text[80];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(text, sizeof(text), fmt, ap);
	va_end(ap);

	return (pager_output(text));
}

#else

/* With BOOT2 defined, output goes straight to printf. */
#define pager_printf printf

#endif
1431
1432 #define STATUS_FORMAT   "        %s %s\n"
1433
1434 static int
1435 print_state(int indent, const char *name, vdev_state_t state)
1436 {
1437         int i;
1438         char buf[512];
1439
1440         buf[0] = 0;
1441         for (i = 0; i < indent; i++)
1442                 strcat(buf, "  ");
1443         strcat(buf, name);
1444         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
1445 }
1446
1447 static int
1448 vdev_status(vdev_t *vdev, int indent)
1449 {
1450         vdev_t *kid;
1451         int ret;
1452
1453         if (vdev->v_islog) {
1454                 (void) pager_output("        logs\n");
1455                 indent++;
1456         }
1457
1458         ret = print_state(indent, vdev->v_name, vdev->v_state);
1459         if (ret != 0)
1460                 return (ret);
1461
1462         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1463                 ret = vdev_status(kid, indent + 1);
1464                 if (ret != 0)
1465                         return (ret);
1466         }
1467         return (ret);
1468 }
1469
1470 static int
1471 spa_status(spa_t *spa)
1472 {
1473         static char bootfs[ZFS_MAXNAMELEN];
1474         uint64_t rootid;
1475         vdev_list_t *vlist;
1476         vdev_t *vdev;
1477         int good_kids, bad_kids, degraded_kids, ret;
1478         vdev_state_t state;
1479
1480         ret = pager_printf("  pool: %s\n", spa->spa_name);
1481         if (ret != 0)
1482                 return (ret);
1483
1484         if (zfs_get_root(spa, &rootid) == 0 &&
1485             zfs_rlookup(spa, rootid, bootfs) == 0) {
1486                 if (bootfs[0] == '\0')
1487                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
1488                 else
1489                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
1490                             bootfs);
1491                 if (ret != 0)
1492                         return (ret);
1493         }
1494         ret = pager_printf("config:\n\n");
1495         if (ret != 0)
1496                 return (ret);
1497         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
1498         if (ret != 0)
1499                 return (ret);
1500
1501         good_kids = 0;
1502         degraded_kids = 0;
1503         bad_kids = 0;
1504         vlist = &spa->spa_root_vdev->v_children;
1505         STAILQ_FOREACH(vdev, vlist, v_childlink) {
1506                 if (vdev->v_state == VDEV_STATE_HEALTHY)
1507                         good_kids++;
1508                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
1509                         degraded_kids++;
1510                 else
1511                         bad_kids++;
1512         }
1513
1514         state = VDEV_STATE_CLOSED;
1515         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
1516                 state = VDEV_STATE_HEALTHY;
1517         else if ((good_kids + degraded_kids) > 0)
1518                 state = VDEV_STATE_DEGRADED;
1519
1520         ret = print_state(0, spa->spa_name, state);
1521         if (ret != 0)
1522                 return (ret);
1523
1524         STAILQ_FOREACH(vdev, vlist, v_childlink) {
1525                 ret = vdev_status(vdev, 1);
1526                 if (ret != 0)
1527                         return (ret);
1528         }
1529         return (ret);
1530 }
1531
1532 static int
1533 spa_all_status(void)
1534 {
1535         spa_t *spa;
1536         int first = 1, ret = 0;
1537
1538         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1539                 if (!first) {
1540                         ret = pager_printf("\n");
1541                         if (ret != 0)
1542                                 return (ret);
1543                 }
1544                 first = 0;
1545                 ret = spa_status(spa);
1546                 if (ret != 0)
1547                         return (ret);
1548         }
1549         return (ret);
1550 }
1551
1552 static uint64_t
1553 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
1554 {
1555         uint64_t label_offset;
1556
1557         if (l < VDEV_LABELS / 2)
1558                 label_offset = 0;
1559         else
1560                 label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
1561
1562         return (offset + l * sizeof (vdev_label_t) + label_offset);
1563 }
1564
1565 static int
1566 vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
1567 {
1568         unsigned int seq1 = 0;
1569         unsigned int seq2 = 0;
1570         int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
1571
1572         if (cmp != 0)
1573                 return (cmp);
1574
1575         cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
1576         if (cmp != 0)
1577                 return (cmp);
1578
1579         if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
1580                 seq1 = MMP_SEQ(ub1);
1581
1582         if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
1583                 seq2 = MMP_SEQ(ub2);
1584
1585         return (AVL_CMP(seq1, seq2));
1586 }
1587
1588 static int
1589 uberblock_verify(uberblock_t *ub)
1590 {
1591         if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
1592                 byteswap_uint64_array(ub, sizeof (uberblock_t));
1593         }
1594
1595         if (ub->ub_magic != UBERBLOCK_MAGIC ||
1596             !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
1597                 return (EINVAL);
1598
1599         return (0);
1600 }
1601
1602 static int
1603 vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
1604     size_t size)
1605 {
1606         blkptr_t bp;
1607         off_t off;
1608
1609         off = vdev_label_offset(vd->v_psize, l, offset);
1610
1611         BP_ZERO(&bp);
1612         BP_SET_LSIZE(&bp, size);
1613         BP_SET_PSIZE(&bp, size);
1614         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1615         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1616         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
1617         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1618
1619         return (vdev_read_phys(vd, &bp, buf, off, size));
1620 }
1621
1622 /*
1623  * We do need to be sure we write to correct location.
1624  * Our vdev label does consist of 4 fields:
1625  * pad1 (8k), reserved.
1626  * bootenv (8k), checksummed, previously reserved, may contian garbage.
1627  * vdev_phys (112k), checksummed
1628  * uberblock ring (128k), checksummed.
1629  *
1630  * Since bootenv area may contain garbage, we can not reliably read it, as
1631  * we can get checksum errors.
1632  * Next best thing is vdev_phys - it is just after bootenv. It still may
1633  * be corrupted, but in such case we will miss this one write.
1634  */
1635 static int
1636 vdev_label_write_validate(vdev_t *vd, int l, uint64_t offset)
1637 {
1638         uint64_t off, o_phys;
1639         void *buf;
1640         size_t size = VDEV_PHYS_SIZE;
1641         int rc;
1642
1643         o_phys = offsetof(vdev_label_t, vl_vdev_phys);
1644         off = vdev_label_offset(vd->v_psize, l, o_phys);
1645
1646         /* off should be 8K from bootenv */
1647         if (vdev_label_offset(vd->v_psize, l, offset) + VDEV_PAD_SIZE != off)
1648                 return (EINVAL);
1649
1650         buf = malloc(size);
1651         if (buf == NULL)
1652                 return (ENOMEM);
1653
1654         /* Read vdev_phys */
1655         rc = vdev_label_read(vd, l, buf, o_phys, size);
1656         free(buf);
1657         return (rc);
1658 }
1659
1660 static int
1661 vdev_label_write(vdev_t *vd, int l, vdev_boot_envblock_t *be, uint64_t offset)
1662 {
1663         zio_checksum_info_t *ci;
1664         zio_cksum_t cksum;
1665         off_t off;
1666         size_t size = VDEV_PAD_SIZE;
1667         int rc;
1668
1669         if (vd->v_phys_write == NULL)
1670                 return (ENOTSUP);
1671
1672         off = vdev_label_offset(vd->v_psize, l, offset);
1673
1674         rc = vdev_label_write_validate(vd, l, offset);
1675         if (rc != 0) {
1676                 return (rc);
1677         }
1678
1679         ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
1680         be->vbe_zbt.zec_magic = ZEC_MAGIC;
1681         zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
1682         ci->ci_func[0](be, size, NULL, &cksum);
1683         be->vbe_zbt.zec_cksum = cksum;
1684
1685         return (vdev_write_phys(vd, be, off, size));
1686 }
1687
1688 static int
1689 vdev_write_bootenv_impl(vdev_t *vdev, vdev_boot_envblock_t *be)
1690 {
1691         vdev_t *kid;
1692         int rv = 0, rc;
1693
1694         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1695                 if (kid->v_state != VDEV_STATE_HEALTHY)
1696                         continue;
1697                 rc = vdev_write_bootenv_impl(kid, be);
1698                 if (rv == 0)
1699                         rv = rc;
1700         }
1701
1702         /*
1703          * Non-leaf vdevs do not have v_phys_write.
1704          */
1705         if (vdev->v_phys_write == NULL)
1706                 return (rv);
1707
1708         for (int l = 0; l < VDEV_LABELS; l++) {
1709                 rc = vdev_label_write(vdev, l, be,
1710                     offsetof(vdev_label_t, vl_be));
1711                 if (rc != 0) {
1712                         printf("failed to write bootenv to %s label %d: %d\n",
1713                             vdev->v_name ? vdev->v_name : "unknown", l, rc);
1714                         rv = rc;
1715                 }
1716         }
1717         return (rv);
1718 }
1719
1720 int
1721 vdev_write_bootenv(vdev_t *vdev, nvlist_t *nvl)
1722 {
1723         vdev_boot_envblock_t *be;
1724         nvlist_t nv, *nvp;
1725         uint64_t version;
1726         int rv;
1727
1728         if (nvl->nv_size > sizeof(be->vbe_bootenv))
1729                 return (E2BIG);
1730
1731         version = VB_RAW;
1732         nvp = vdev_read_bootenv(vdev);
1733         if (nvp != NULL) {
1734                 nvlist_find(nvp, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL,
1735                     &version, NULL);
1736                 nvlist_destroy(nvp);
1737         }
1738
1739         be = calloc(1, sizeof(*be));
1740         if (be == NULL)
1741                 return (ENOMEM);
1742
1743         be->vbe_version = version;
1744         switch (version) {
1745         case VB_RAW:
1746                 /*
1747                  * If there is no envmap, we will just wipe bootenv.
1748                  */
1749                 nvlist_find(nvl, GRUB_ENVMAP, DATA_TYPE_STRING, NULL,
1750                     be->vbe_bootenv, NULL);
1751                 rv = 0;
1752                 break;
1753
1754         case VB_NVLIST:
1755                 nv.nv_header = nvl->nv_header;
1756                 nv.nv_asize = nvl->nv_asize;
1757                 nv.nv_size = nvl->nv_size;
1758
1759                 bcopy(&nv.nv_header, be->vbe_bootenv, sizeof(nv.nv_header));
1760                 nv.nv_data = be->vbe_bootenv + sizeof(nvs_header_t);
1761                 bcopy(nvl->nv_data, nv.nv_data, nv.nv_size);
1762                 rv = nvlist_export(&nv);
1763                 break;
1764
1765         default:
1766                 rv = EINVAL;
1767                 break;
1768         }
1769
1770         if (rv == 0) {
1771                 be->vbe_version = htobe64(be->vbe_version);
1772                 rv = vdev_write_bootenv_impl(vdev, be);
1773         }
1774         free(be);
1775         return (rv);
1776 }
1777
1778 /*
1779  * Read the bootenv area from pool label, return the nvlist from it.
1780  * We return from first successful read.
1781  */
1782 nvlist_t *
1783 vdev_read_bootenv(vdev_t *vdev)
1784 {
1785         vdev_t *kid;
1786         nvlist_t *benv;
1787         vdev_boot_envblock_t *be;
1788         char *command;
1789         bool ok;
1790         int rv;
1791
1792         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1793                 if (kid->v_state != VDEV_STATE_HEALTHY)
1794                         continue;
1795
1796                 benv = vdev_read_bootenv(kid);
1797                 if (benv != NULL)
1798                         return (benv);
1799         }
1800
1801         be = malloc(sizeof (*be));
1802         if (be == NULL)
1803                 return (NULL);
1804
1805         rv = 0;
1806         for (int l = 0; l < VDEV_LABELS; l++) {
1807                 rv = vdev_label_read(vdev, l, be,
1808                     offsetof(vdev_label_t, vl_be),
1809                     sizeof (*be));
1810                 if (rv == 0)
1811                         break;
1812         }
1813         if (rv != 0) {
1814                 free(be);
1815                 return (NULL);
1816         }
1817
1818         be->vbe_version = be64toh(be->vbe_version);
1819         switch (be->vbe_version) {
1820         case VB_RAW:
1821                 /*
1822                  * we have textual data in vbe_bootenv, create nvlist
1823                  * with key "envmap".
1824                  */
1825                 benv = nvlist_create(NV_UNIQUE_NAME);
1826                 if (benv != NULL) {
1827                         if (*be->vbe_bootenv == '\0') {
1828                                 nvlist_add_uint64(benv, BOOTENV_VERSION,
1829                                     VB_NVLIST);
1830                                 break;
1831                         }
1832                         nvlist_add_uint64(benv, BOOTENV_VERSION, VB_RAW);
1833                         be->vbe_bootenv[sizeof (be->vbe_bootenv) - 1] = '\0';
1834                         nvlist_add_string(benv, GRUB_ENVMAP, be->vbe_bootenv);
1835                 }
1836                 break;
1837
1838         case VB_NVLIST:
1839                 benv = nvlist_import(be->vbe_bootenv, sizeof(be->vbe_bootenv));
1840                 break;
1841
1842         default:
1843                 command = (char *)be;
1844                 ok = false;
1845
1846                 /* Check for legacy zfsbootcfg command string */
1847                 for (int i = 0; command[i] != '\0'; i++) {
1848                         if (iscntrl(command[i])) {
1849                                 ok = false;
1850                                 break;
1851                         } else {
1852                                 ok = true;
1853                         }
1854                 }
1855                 benv = nvlist_create(NV_UNIQUE_NAME);
1856                 if (benv != NULL) {
1857                         if (ok)
1858                                 nvlist_add_string(benv, FREEBSD_BOOTONCE,
1859                                     command);
1860                         else
1861                                 nvlist_add_uint64(benv, BOOTENV_VERSION,
1862                                     VB_NVLIST);
1863                 }
1864                 break;
1865         }
1866         free(be);
1867         return (benv);
1868 }
1869
1870 static uint64_t
1871 vdev_get_label_asize(nvlist_t *nvl)
1872 {
1873         nvlist_t *vdevs;
1874         uint64_t asize;
1875         const char *type;
1876         int len;
1877
1878         asize = 0;
1879         /* Get vdev tree */
1880         if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1881             NULL, &vdevs, NULL) != 0)
1882                 return (asize);
1883
1884         /*
1885          * Get vdev type. We will calculate asize for raidz, mirror and disk.
1886          * For raidz, the asize is raw size of all children.
1887          */
1888         if (nvlist_find(vdevs, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
1889             NULL, &type, &len) != 0)
1890                 goto done;
1891
1892         if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
1893             memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
1894             memcmp(type, VDEV_TYPE_RAIDZ, len) != 0)
1895                 goto done;
1896
1897         if (nvlist_find(vdevs, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64,
1898             NULL, &asize, NULL) != 0)
1899                 goto done;
1900
1901         if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
1902                 nvlist_t **kids;
1903                 int nkids;
1904
1905                 if (nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN,
1906                     DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL) != 0) {
1907                         asize = 0;
1908                         goto done;
1909                 }
1910
1911                 asize /= nkids;
1912                 for (int i = 0; i < nkids; i++)
1913                         nvlist_destroy(kids[i]);
1914                 free(kids);
1915         }
1916
1917         asize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
1918 done:
1919         nvlist_destroy(vdevs);
1920         return (asize);
1921 }
1922
1923 static nvlist_t *
1924 vdev_label_read_config(vdev_t *vd, uint64_t txg)
1925 {
1926         vdev_phys_t *label;
1927         uint64_t best_txg = 0;
1928         uint64_t label_txg = 0;
1929         uint64_t asize;
1930         nvlist_t *nvl = NULL, *tmp;
1931         int error;
1932
1933         label = malloc(sizeof (vdev_phys_t));
1934         if (label == NULL)
1935                 return (NULL);
1936
1937         for (int l = 0; l < VDEV_LABELS; l++) {
1938                 if (vdev_label_read(vd, l, label,
1939                     offsetof(vdev_label_t, vl_vdev_phys),
1940                     sizeof (vdev_phys_t)))
1941                         continue;
1942
1943                 tmp = nvlist_import(label->vp_nvlist,
1944                     sizeof(label->vp_nvlist));
1945                 if (tmp == NULL)
1946                         continue;
1947
1948                 error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG,
1949                     DATA_TYPE_UINT64, NULL, &label_txg, NULL);
1950                 if (error != 0 || label_txg == 0) {
1951                         nvlist_destroy(nvl);
1952                         nvl = tmp;
1953                         goto done;
1954                 }
1955
1956                 if (label_txg <= txg && label_txg > best_txg) {
1957                         best_txg = label_txg;
1958                         nvlist_destroy(nvl);
1959                         nvl = tmp;
1960                         tmp = NULL;
1961
1962                         /*
1963                          * Use asize from pool config. We need this
1964                          * because we can get bad value from BIOS.
1965                          */
1966                         asize = vdev_get_label_asize(nvl);
1967                         if (asize != 0) {
1968                                 vd->v_psize = asize;
1969                         }
1970                 }
1971                 nvlist_destroy(tmp);
1972         }
1973
1974         if (best_txg == 0) {
1975                 nvlist_destroy(nvl);
1976                 nvl = NULL;
1977         }
1978 done:
1979         free(label);
1980         return (nvl);
1981 }
1982
1983 static void
1984 vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
1985 {
1986         uberblock_t *buf;
1987
1988         buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
1989         if (buf == NULL)
1990                 return;
1991
1992         for (int l = 0; l < VDEV_LABELS; l++) {
1993                 for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
1994                         if (vdev_label_read(vd, l, buf,
1995                             VDEV_UBERBLOCK_OFFSET(vd, n),
1996                             VDEV_UBERBLOCK_SIZE(vd)))
1997                                 continue;
1998                         if (uberblock_verify(buf) != 0)
1999                                 continue;
2000
2001                         if (vdev_uberblock_compare(buf, ub) > 0)
2002                                 *ub = *buf;
2003                 }
2004         }
2005         free(buf);
2006 }
2007
2008 static int
2009 vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
2010     spa_t **spap)
2011 {
2012         vdev_t vtmp;
2013         spa_t *spa;
2014         vdev_t *vdev;
2015         nvlist_t *nvl;
2016         uint64_t val;
2017         uint64_t guid, vdev_children;
2018         uint64_t pool_txg, pool_guid;
2019         const char *pool_name;
2020         int rc, namelen;
2021
2022         /*
2023          * Load the vdev label and figure out which
2024          * uberblock is most current.
2025          */
2026         memset(&vtmp, 0, sizeof(vtmp));
2027         vtmp.v_phys_read = _read;
2028         vtmp.v_phys_write = _write;
2029         vtmp.v_priv = priv;
2030         vtmp.v_psize = P2ALIGN(ldi_get_size(priv),
2031             (uint64_t)sizeof (vdev_label_t));
2032
2033         /* Test for minimum device size. */
2034         if (vtmp.v_psize < SPA_MINDEVSIZE)
2035                 return (EIO);
2036
2037         nvl = vdev_label_read_config(&vtmp, UINT64_MAX);
2038         if (nvl == NULL)
2039                 return (EIO);
2040
2041         if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
2042             NULL, &val, NULL) != 0) {
2043                 nvlist_destroy(nvl);
2044                 return (EIO);
2045         }
2046
2047         if (!SPA_VERSION_IS_SUPPORTED(val)) {
2048                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
2049                     (unsigned)val, (unsigned)SPA_VERSION);
2050                 nvlist_destroy(nvl);
2051                 return (EIO);
2052         }
2053
2054         /* Check ZFS features for read */
2055         rc = nvlist_check_features_for_read(nvl);
2056         if (rc != 0) {
2057                 nvlist_destroy(nvl);
2058                 return (EIO);
2059         }
2060
2061         if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
2062             NULL, &val, NULL) != 0) {
2063                 nvlist_destroy(nvl);
2064                 return (EIO);
2065         }
2066
2067         if (val == POOL_STATE_DESTROYED) {
2068                 /* We don't boot only from destroyed pools. */
2069                 nvlist_destroy(nvl);
2070                 return (EIO);
2071         }
2072
2073         if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
2074             NULL, &pool_txg, NULL) != 0 ||
2075             nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
2076             NULL, &pool_guid, NULL) != 0 ||
2077             nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
2078             NULL, &pool_name, &namelen) != 0) {
2079                 /*
2080                  * Cache and spare devices end up here - just ignore
2081                  * them.
2082                  */
2083                 nvlist_destroy(nvl);
2084                 return (EIO);
2085         }
2086
2087         /*
2088          * Create the pool if this is the first time we've seen it.
2089          */
2090         spa = spa_find_by_guid(pool_guid);
2091         if (spa == NULL) {
2092                 char *name;
2093
2094                 nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN,
2095                     DATA_TYPE_UINT64, NULL, &vdev_children, NULL);
2096                 name = malloc(namelen + 1);
2097                 if (name == NULL) {
2098                         nvlist_destroy(nvl);
2099                         return (ENOMEM);
2100                 }
2101                 bcopy(pool_name, name, namelen);
2102                 name[namelen] = '\0';
2103                 spa = spa_create(pool_guid, name);
2104                 free(name);
2105                 if (spa == NULL) {
2106                         nvlist_destroy(nvl);
2107                         return (ENOMEM);
2108                 }
2109                 spa->spa_root_vdev->v_nchildren = vdev_children;
2110         }
2111         if (pool_txg > spa->spa_txg)
2112                 spa->spa_txg = pool_txg;
2113
2114         /*
2115          * Get the vdev tree and create our in-core copy of it.
2116          * If we already have a vdev with this guid, this must
2117          * be some kind of alias (overlapping slices, dangerously dedicated
2118          * disks etc).
2119          */
2120         if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
2121             NULL, &guid, NULL) != 0) {
2122                 nvlist_destroy(nvl);
2123                 return (EIO);
2124         }
2125         vdev = vdev_find(guid);
2126         /* Has this vdev already been inited? */
2127         if (vdev && vdev->v_phys_read) {
2128                 nvlist_destroy(nvl);
2129                 return (EIO);
2130         }
2131
2132         rc = vdev_init_from_label(spa, nvl);
2133         nvlist_destroy(nvl);
2134         if (rc != 0)
2135                 return (rc);
2136
2137         /*
2138          * We should already have created an incomplete vdev for this
2139          * vdev. Find it and initialise it with our read proc.
2140          */
2141         vdev = vdev_find(guid);
2142         if (vdev != NULL) {
2143                 vdev->v_phys_read = _read;
2144                 vdev->v_phys_write = _write;
2145                 vdev->v_priv = priv;
2146                 vdev->v_psize = vtmp.v_psize;
2147                 /*
2148                  * If no other state is set, mark vdev healthy.
2149                  */
2150                 if (vdev->v_state == VDEV_STATE_UNKNOWN)
2151                         vdev->v_state = VDEV_STATE_HEALTHY;
2152         } else {
2153                 printf("ZFS: inconsistent nvlist contents\n");
2154                 return (EIO);
2155         }
2156
2157         if (vdev->v_islog)
2158                 spa->spa_with_log = vdev->v_islog;
2159
2160         /*
2161          * Re-evaluate top-level vdev state.
2162          */
2163         vdev_set_state(vdev->v_top);
2164
2165         /*
2166          * Ok, we are happy with the pool so far. Lets find
2167          * the best uberblock and then we can actually access
2168          * the contents of the pool.
2169          */
2170         vdev_uberblock_load(vdev, spa->spa_uberblock);
2171
2172         if (spap != NULL)
2173                 *spap = spa;
2174         return (0);
2175 }
2176
/*
 * Return the base-2 logarithm of n when the bit pattern of n has exactly
 * one of its low 32 bits set, and -1 otherwise (including n == 0).
 *
 * The comparison is done on unsigned values: shifting a signed 1 into
 * bit 31 is undefined behaviour.
 */
static int
ilog2(int n)
{
        int v;

        for (v = 0; v < 32; v++)
                if ((unsigned int)n == (1U << v))
                        return (v);
        return (-1);
}
2187
2188 static int
2189 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
2190 {
2191         blkptr_t gbh_bp;
2192         zio_gbh_phys_t zio_gb;
2193         char *pbuf;
2194         int i;
2195
2196         /* Artificial BP for gang block header. */
2197         gbh_bp = *bp;
2198         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
2199         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
2200         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
2201         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
2202         for (i = 0; i < SPA_DVAS_PER_BP; i++)
2203                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
2204
2205         /* Read gang header block using the artificial BP. */
2206         if (zio_read(spa, &gbh_bp, &zio_gb))
2207                 return (EIO);
2208
2209         pbuf = buf;
2210         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
2211                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
2212
2213                 if (BP_IS_HOLE(gbp))
2214                         continue;
2215                 if (zio_read(spa, gbp, pbuf))
2216                         return (EIO);
2217                 pbuf += BP_GET_PSIZE(gbp);
2218         }
2219
2220         if (zio_checksum_verify(spa, bp, buf))
2221                 return (EIO);
2222         return (0);
2223 }
2224
2225 static int
2226 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
2227 {
2228         int cpfunc = BP_GET_COMPRESS(bp);
2229         uint64_t align, size;
2230         void *pbuf;
2231         int i, error;
2232
2233         /*
2234          * Process data embedded in block pointer
2235          */
2236         if (BP_IS_EMBEDDED(bp)) {
2237                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
2238
2239                 size = BPE_GET_PSIZE(bp);
2240                 ASSERT(size <= BPE_PAYLOAD_SIZE);
2241
2242                 if (cpfunc != ZIO_COMPRESS_OFF)
2243                         pbuf = malloc(size);
2244                 else
2245                         pbuf = buf;
2246
2247                 if (pbuf == NULL)
2248                         return (ENOMEM);
2249
2250                 decode_embedded_bp_compressed(bp, pbuf);
2251                 error = 0;
2252
2253                 if (cpfunc != ZIO_COMPRESS_OFF) {
2254                         error = zio_decompress_data(cpfunc, pbuf,
2255                             size, buf, BP_GET_LSIZE(bp));
2256                         free(pbuf);
2257                 }
2258                 if (error != 0)
2259                         printf("ZFS: i/o error - unable to decompress "
2260                             "block pointer data, error %d\n", error);
2261                 return (error);
2262         }
2263
2264         error = EIO;
2265
2266         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
2267                 const dva_t *dva = &bp->blk_dva[i];
2268                 vdev_t *vdev;
2269                 vdev_list_t *vlist;
2270                 uint64_t vdevid;
2271                 off_t offset;
2272
2273                 if (!dva->dva_word[0] && !dva->dva_word[1])
2274                         continue;
2275
2276                 vdevid = DVA_GET_VDEV(dva);
2277                 offset = DVA_GET_OFFSET(dva);
2278                 vlist = &spa->spa_root_vdev->v_children;
2279                 STAILQ_FOREACH(vdev, vlist, v_childlink) {
2280                         if (vdev->v_id == vdevid)
2281                                 break;
2282                 }
2283                 if (!vdev || !vdev->v_read)
2284                         continue;
2285
2286                 size = BP_GET_PSIZE(bp);
2287                 if (vdev->v_read == vdev_raidz_read) {
2288                         align = 1ULL << vdev->v_ashift;
2289                         if (P2PHASE(size, align) != 0)
2290                                 size = P2ROUNDUP(size, align);
2291                 }
2292                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
2293                         pbuf = malloc(size);
2294                 else
2295                         pbuf = buf;
2296
2297                 if (pbuf == NULL) {
2298                         error = ENOMEM;
2299                         break;
2300                 }
2301
2302                 if (DVA_GET_GANG(dva))
2303                         error = zio_read_gang(spa, bp, pbuf);
2304                 else
2305                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
2306                 if (error == 0) {
2307                         if (cpfunc != ZIO_COMPRESS_OFF)
2308                                 error = zio_decompress_data(cpfunc, pbuf,
2309                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
2310                         else if (size != BP_GET_PSIZE(bp))
2311                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
2312                 } else {
2313                         printf("zio_read error: %d\n", error);
2314                 }
2315                 if (buf != pbuf)
2316                         free(pbuf);
2317                 if (error == 0)
2318                         break;
2319         }
2320         if (error != 0)
2321                 printf("ZFS: i/o error - all block copies unavailable\n");
2322
2323         return (error);
2324 }
2325
2326 static int
2327 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset,
2328     void *buf, size_t buflen)
2329 {
2330         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
2331         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2332         int nlevels = dnode->dn_nlevels;
2333         int i, rc;
2334
2335         if (bsize > SPA_MAXBLOCKSIZE) {
2336                 printf("ZFS: I/O error - blocks larger than %llu are not "
2337                     "supported\n", SPA_MAXBLOCKSIZE);
2338                 return (EIO);
2339         }
2340
2341         /*
2342          * Handle odd block sizes, mirrors dmu_read_impl().  Data can't exist
2343          * past the first block, so we'll clip the read to the portion of the
2344          * buffer within bsize and zero out the remainder.
2345          */
2346         if (dnode->dn_maxblkid == 0) {
2347                 size_t newbuflen;
2348
2349                 newbuflen = offset > bsize ? 0 : MIN(buflen, bsize - offset);
2350                 bzero((char *)buf + newbuflen, buflen - newbuflen);
2351                 buflen = newbuflen;
2352         }
2353
2354         /*
2355          * Note: bsize may not be a power of two here so we need to do an
2356          * actual divide rather than a bitshift.
2357          */
2358         while (buflen > 0) {
2359                 uint64_t bn = offset / bsize;
2360                 int boff = offset % bsize;
2361                 int ibn;
2362                 const blkptr_t *indbp;
2363                 blkptr_t bp;
2364
2365                 if (bn > dnode->dn_maxblkid)
2366                         return (EIO);
2367
2368                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
2369                         goto cached;
2370
2371                 indbp = dnode->dn_blkptr;
2372                 for (i = 0; i < nlevels; i++) {
2373                         /*
2374                          * Copy the bp from the indirect array so that
2375                          * we can re-use the scratch buffer for multi-level
2376                          * objects.
2377                          */
2378                         ibn = bn >> ((nlevels - i - 1) * ibshift);
2379                         ibn &= ((1 << ibshift) - 1);
2380                         bp = indbp[ibn];
2381                         if (BP_IS_HOLE(&bp)) {
2382                                 memset(dnode_cache_buf, 0, bsize);
2383                                 break;
2384                         }
2385                         rc = zio_read(spa, &bp, dnode_cache_buf);
2386                         if (rc)
2387                                 return (rc);
2388                         indbp = (const blkptr_t *) dnode_cache_buf;
2389                 }
2390                 dnode_cache_obj = dnode;
2391                 dnode_cache_bn = bn;
2392         cached:
2393
2394                 /*
2395                  * The buffer contains our data block. Copy what we
2396                  * need from it and loop.
2397                  */
2398                 i = bsize - boff;
2399                 if (i > buflen) i = buflen;
2400                 memcpy(buf, &dnode_cache_buf[boff], i);
2401                 buf = ((char *)buf) + i;
2402                 offset += i;
2403                 buflen -= i;
2404         }
2405
2406         return (0);
2407 }
2408
2409 /*
2410  * Lookup a value in a microzap directory.
2411  */
2412 static int
2413 mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name,
2414     uint64_t *value)
2415 {
2416         const mzap_ent_phys_t *mze;
2417         int chunks, i;
2418
2419         /*
2420          * Microzap objects use exactly one block. Read the whole
2421          * thing.
2422          */
2423         chunks = size / MZAP_ENT_LEN - 1;
2424         for (i = 0; i < chunks; i++) {
2425                 mze = &mz->mz_chunk[i];
2426                 if (strcmp(mze->mze_name, name) == 0) {
2427                         *value = mze->mze_value;
2428                         return (0);
2429                 }
2430         }
2431
2432         return (ENOENT);
2433 }
2434
2435 /*
2436  * Compare a name with a zap leaf entry. Return non-zero if the name
2437  * matches.
2438  */
2439 static int
2440 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2441     const char *name)
2442 {
2443         size_t namelen;
2444         const zap_leaf_chunk_t *nc;
2445         const char *p;
2446
2447         namelen = zc->l_entry.le_name_numints;
2448
2449         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2450         p = name;
2451         while (namelen > 0) {
2452                 size_t len;
2453
2454                 len = namelen;
2455                 if (len > ZAP_LEAF_ARRAY_BYTES)
2456                         len = ZAP_LEAF_ARRAY_BYTES;
2457                 if (memcmp(p, nc->l_array.la_array, len))
2458                         return (0);
2459                 p += len;
2460                 namelen -= len;
2461                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2462         }
2463
2464         return (1);
2465 }
2466
2467 /*
2468  * Extract a uint64_t value from a zap leaf entry.
2469  */
2470 static uint64_t
2471 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
2472 {
2473         const zap_leaf_chunk_t *vc;
2474         int i;
2475         uint64_t value;
2476         const uint8_t *p;
2477
2478         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
2479         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
2480                 value = (value << 8) | p[i];
2481         }
2482
2483         return (value);
2484 }
2485
/*
 * Store value at addr as an unsigned integer of len bytes, truncating it
 * to that width. Only widths of 1, 2, 4 and 8 are handled; for any other
 * len nothing is written.
 */
static void
stv(int len, void *addr, uint64_t value)
{
        if (len == 1)
                *(uint8_t *)addr = value;
        else if (len == 2)
                *(uint16_t *)addr = value;
        else if (len == 4)
                *(uint32_t *)addr = value;
        else if (len == 8)
                *(uint64_t *)addr = value;
}
2504
2505 /*
2506  * Extract a array from a zap leaf entry.
2507  */
2508 static void
2509 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2510     uint64_t integer_size, uint64_t num_integers, void *buf)
2511 {
2512         uint64_t array_int_len = zc->l_entry.le_value_intlen;
2513         uint64_t value = 0;
2514         uint64_t *u64 = buf;
2515         char *p = buf;
2516         int len = MIN(zc->l_entry.le_value_numints, num_integers);
2517         int chunk = zc->l_entry.le_value_chunk;
2518         int byten = 0;
2519
2520         if (integer_size == 8 && len == 1) {
2521                 *u64 = fzap_leaf_value(zl, zc);
2522                 return;
2523         }
2524
2525         while (len > 0) {
2526                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
2527                 int i;
2528
2529                 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
2530                 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
2531                         value = (value << 8) | la->la_array[i];
2532                         byten++;
2533                         if (byten == array_int_len) {
2534                                 stv(integer_size, p, value);
2535                                 byten = 0;
2536                                 len--;
2537                                 if (len == 0)
2538                                         return;
2539                                 p += integer_size;
2540                         }
2541                 }
2542                 chunk = la->la_next;
2543         }
2544 }
2545
2546 static int
2547 fzap_check_size(uint64_t integer_size, uint64_t num_integers)
2548 {
2549
2550         switch (integer_size) {
2551         case 1:
2552         case 2:
2553         case 4:
2554         case 8:
2555                 break;
2556         default:
2557                 return (EINVAL);
2558         }
2559
2560         if (integer_size * num_integers > ZAP_MAXVALUELEN)
2561                 return (E2BIG);
2562
2563         return (0);
2564 }
2565
2566 static void
2567 zap_leaf_free(zap_leaf_t *leaf)
2568 {
2569         free(leaf->l_phys);
2570         free(leaf);
2571 }
2572
2573 static int
2574 zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp)
2575 {
2576         int bs = FZAP_BLOCK_SHIFT(zap);
2577         int err;
2578
2579         *lp = malloc(sizeof(**lp));
2580         if (*lp == NULL)
2581                 return (ENOMEM);
2582
2583         (*lp)->l_bs = bs;
2584         (*lp)->l_phys = malloc(1 << bs);
2585
2586         if ((*lp)->l_phys == NULL) {
2587                 free(*lp);
2588                 return (ENOMEM);
2589         }
2590         err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys,
2591             1 << bs);
2592         if (err != 0) {
2593                 zap_leaf_free(*lp);
2594         }
2595         return (err);
2596 }
2597
2598 static int
2599 zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx,
2600     uint64_t *valp)
2601 {
2602         int bs = FZAP_BLOCK_SHIFT(zap);
2603         uint64_t blk = idx >> (bs - 3);
2604         uint64_t off = idx & ((1 << (bs - 3)) - 1);
2605         uint64_t *buf;
2606         int rc;
2607
2608         buf = malloc(1 << zap->zap_block_shift);
2609         if (buf == NULL)
2610                 return (ENOMEM);
2611         rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs,
2612             buf, 1 << zap->zap_block_shift);
2613         if (rc == 0)
2614                 *valp = buf[off];
2615         free(buf);
2616         return (rc);
2617 }
2618
2619 static int
2620 zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp)
2621 {
2622         if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) {
2623                 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
2624                 return (0);
2625         } else {
2626                 return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl,
2627                     idx, valp));
2628         }
2629 }
2630
2631 #define ZAP_HASH_IDX(hash, n)   (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
2632 static int
2633 zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp)
2634 {
2635         uint64_t idx, blk;
2636         int err;
2637
2638         idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift);
2639         err = zap_idx_to_blk(zap, idx, &blk);
2640         if (err != 0)
2641                 return (err);
2642         return (zap_get_leaf_byblk(zap, blk, lp));
2643 }
2644
2645 #define CHAIN_END       0xffff  /* end of the chunk chain */
2646 #define LEAF_HASH(l, h) \
2647         ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
2648         ((h) >> \
2649         (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
2650 #define LEAF_HASH_ENTPTR(l, h)  (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
2651
2652 static int
2653 zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name,
2654     uint64_t integer_size, uint64_t num_integers, void *value)
2655 {
2656         int rc;
2657         uint16_t *chunkp;
2658         struct zap_leaf_entry *le;
2659
2660         /*
2661          * Make sure this chunk matches our hash.
2662          */
2663         if (zl->l_phys->l_hdr.lh_prefix_len > 0 &&
2664             zl->l_phys->l_hdr.lh_prefix !=
2665             hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len))
2666                 return (EIO);
2667
2668         rc = ENOENT;
2669         for (chunkp = LEAF_HASH_ENTPTR(zl, hash);
2670             *chunkp != CHAIN_END; chunkp = &le->le_next) {
2671                 zap_leaf_chunk_t *zc;
2672                 uint16_t chunk = *chunkp;
2673
2674                 le = ZAP_LEAF_ENTRY(zl, chunk);
2675                 if (le->le_hash != hash)
2676                         continue;
2677                 zc = &ZAP_LEAF_CHUNK(zl, chunk);
2678                 if (fzap_name_equal(zl, zc, name)) {
2679                         if (zc->l_entry.le_value_intlen > integer_size) {
2680                                 rc = EINVAL;
2681                         } else {
2682                                 fzap_leaf_array(zl, zc, integer_size,
2683                                     num_integers, value);
2684                                 rc = 0;
2685                         }
2686                         break;
2687                 }
2688         }
2689         return (rc);
2690 }
2691
2692 /*
2693  * Lookup a value in a fatzap directory.
2694  */
2695 static int
2696 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2697     const char *name, uint64_t integer_size, uint64_t num_integers,
2698     void *value)
2699 {
2700         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2701         fat_zap_t z;
2702         zap_leaf_t *zl;
2703         uint64_t hash;
2704         int rc;
2705
2706         if (zh->zap_magic != ZAP_MAGIC)
2707                 return (EIO);
2708
2709         if ((rc = fzap_check_size(integer_size, num_integers)) != 0) {
2710                 return (rc);
2711         }
2712
2713         z.zap_block_shift = ilog2(bsize);
2714         z.zap_phys = zh;
2715         z.zap_spa = spa;
2716         z.zap_dnode = dnode;
2717
2718         hash = zap_hash(zh->zap_salt, name);
2719         rc = zap_deref_leaf(&z, hash, &zl);
2720         if (rc != 0)
2721                 return (rc);
2722
2723         rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value);
2724
2725         zap_leaf_free(zl);
2726         return (rc);
2727 }
2728
2729 /*
2730  * Lookup a name in a zap object and return its value as a uint64_t.
2731  */
2732 static int
2733 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
2734     uint64_t integer_size, uint64_t num_integers, void *value)
2735 {
2736         int rc;
2737         zap_phys_t *zap;
2738         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2739
2740         zap = malloc(size);
2741         if (zap == NULL)
2742                 return (ENOMEM);
2743
2744         rc = dnode_read(spa, dnode, 0, zap, size);
2745         if (rc)
2746                 goto done;
2747
2748         switch (zap->zap_block_type) {
2749         case ZBT_MICRO:
2750                 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value);
2751                 break;
2752         case ZBT_HEADER:
2753                 rc = fzap_lookup(spa, dnode, zap, name, integer_size,
2754                     num_integers, value);
2755                 break;
2756         default:
2757                 printf("ZFS: invalid zap_type=%" PRIx64 "\n",
2758                     zap->zap_block_type);
2759                 rc = EIO;
2760         }
2761 done:
2762         free(zap);
2763         return (rc);
2764 }
2765
2766 /*
2767  * List a microzap directory.
2768  */
2769 static int
2770 mzap_list(const mzap_phys_t *mz, size_t size,
2771     int (*callback)(const char *, uint64_t))
2772 {
2773         const mzap_ent_phys_t *mze;
2774         int chunks, i, rc;
2775
2776         /*
2777          * Microzap objects use exactly one block. Read the whole
2778          * thing.
2779          */
2780         rc = 0;
2781         chunks = size / MZAP_ENT_LEN - 1;
2782         for (i = 0; i < chunks; i++) {
2783                 mze = &mz->mz_chunk[i];
2784                 if (mze->mze_name[0]) {
2785                         rc = callback(mze->mze_name, mze->mze_value);
2786                         if (rc != 0)
2787                                 break;
2788                 }
2789         }
2790
2791         return (rc);
2792 }
2793
2794 /*
2795  * List a fatzap directory.
2796  */
2797 static int
2798 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2799     int (*callback)(const char *, uint64_t))
2800 {
2801         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2802         fat_zap_t z;
2803         uint64_t i;
2804         int j, rc;
2805
2806         if (zh->zap_magic != ZAP_MAGIC)
2807                 return (EIO);
2808
2809         z.zap_block_shift = ilog2(bsize);
2810         z.zap_phys = zh;
2811
2812         /*
2813          * This assumes that the leaf blocks start at block 1. The
2814          * documentation isn't exactly clear on this.
2815          */
2816         zap_leaf_t zl;
2817         zl.l_bs = z.zap_block_shift;
2818         zl.l_phys = malloc(bsize);
2819         if (zl.l_phys == NULL)
2820                 return (ENOMEM);
2821
2822         for (i = 0; i < zh->zap_num_leafs; i++) {
2823                 off_t off = ((off_t)(i + 1)) << zl.l_bs;
2824                 char name[256], *p;
2825                 uint64_t value;
2826
2827                 if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) {
2828                         free(zl.l_phys);
2829                         return (EIO);
2830                 }
2831
2832                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
2833                         zap_leaf_chunk_t *zc, *nc;
2834                         int namelen;
2835
2836                         zc = &ZAP_LEAF_CHUNK(&zl, j);
2837                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
2838                                 continue;
2839                         namelen = zc->l_entry.le_name_numints;
2840                         if (namelen > sizeof(name))
2841                                 namelen = sizeof(name);
2842
2843                         /*
2844                          * Paste the name back together.
2845                          */
2846                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
2847                         p = name;
2848                         while (namelen > 0) {
2849                                 int len;
2850                                 len = namelen;
2851                                 if (len > ZAP_LEAF_ARRAY_BYTES)
2852                                         len = ZAP_LEAF_ARRAY_BYTES;
2853                                 memcpy(p, nc->l_array.la_array, len);
2854                                 p += len;
2855                                 namelen -= len;
2856                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
2857                         }
2858
2859                         /*
2860                          * Assume the first eight bytes of the value are
2861                          * a uint64_t.
2862                          */
2863                         value = fzap_leaf_value(&zl, zc);
2864
2865                         /* printf("%s 0x%jx\n", name, (uintmax_t)value); */
2866                         rc = callback((const char *)name, value);
2867                         if (rc != 0) {
2868                                 free(zl.l_phys);
2869                                 return (rc);
2870                         }
2871                 }
2872         }
2873
2874         free(zl.l_phys);
2875         return (0);
2876 }
2877
2878 static int zfs_printf(const char *name, uint64_t value __unused)
2879 {
2880
2881         printf("%s\n", name);
2882
2883         return (0);
2884 }
2885
2886 /*
2887  * List a zap directory.
2888  */
2889 static int
2890 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
2891 {
2892         zap_phys_t *zap;
2893         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2894         int rc;
2895
2896         zap = malloc(size);
2897         if (zap == NULL)
2898                 return (ENOMEM);
2899
2900         rc = dnode_read(spa, dnode, 0, zap, size);
2901         if (rc == 0) {
2902                 if (zap->zap_block_type == ZBT_MICRO)
2903                         rc = mzap_list((const mzap_phys_t *)zap, size,
2904                             zfs_printf);
2905                 else
2906                         rc = fzap_list(spa, dnode, zap, zfs_printf);
2907         }
2908         free(zap);
2909         return (rc);
2910 }
2911
2912 static int
2913 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum,
2914     dnode_phys_t *dnode)
2915 {
2916         off_t offset;
2917
2918         offset = objnum * sizeof(dnode_phys_t);
2919         return dnode_read(spa, &os->os_meta_dnode, offset,
2920                 dnode, sizeof(dnode_phys_t));
2921 }
2922
2923 /*
2924  * Lookup a name in a microzap directory.
2925  */
2926 static int
2927 mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value)
2928 {
2929         const mzap_ent_phys_t *mze;
2930         int chunks, i;
2931
2932         /*
2933          * Microzap objects use exactly one block. Read the whole
2934          * thing.
2935          */
2936         chunks = size / MZAP_ENT_LEN - 1;
2937         for (i = 0; i < chunks; i++) {
2938                 mze = &mz->mz_chunk[i];
2939                 if (value == mze->mze_value) {
2940                         strcpy(name, mze->mze_name);
2941                         return (0);
2942                 }
2943         }
2944
2945         return (ENOENT);
2946 }
2947
2948 static void
2949 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
2950 {
2951         size_t namelen;
2952         const zap_leaf_chunk_t *nc;
2953         char *p;
2954
2955         namelen = zc->l_entry.le_name_numints;
2956
2957         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2958         p = name;
2959         while (namelen > 0) {
2960                 size_t len;
2961                 len = namelen;
2962                 if (len > ZAP_LEAF_ARRAY_BYTES)
2963                         len = ZAP_LEAF_ARRAY_BYTES;
2964                 memcpy(p, nc->l_array.la_array, len);
2965                 p += len;
2966                 namelen -= len;
2967                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2968         }
2969
2970         *p = '\0';
2971 }
2972
2973 static int
2974 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2975     char *name, uint64_t value)
2976 {
2977         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2978         fat_zap_t z;
2979         uint64_t i;
2980         int j, rc;
2981
2982         if (zh->zap_magic != ZAP_MAGIC)
2983                 return (EIO);
2984
2985         z.zap_block_shift = ilog2(bsize);
2986         z.zap_phys = zh;
2987
2988         /*
2989          * This assumes that the leaf blocks start at block 1. The
2990          * documentation isn't exactly clear on this.
2991          */
2992         zap_leaf_t zl;
2993         zl.l_bs = z.zap_block_shift;
2994         zl.l_phys = malloc(bsize);
2995         if (zl.l_phys == NULL)
2996                 return (ENOMEM);
2997
2998         for (i = 0; i < zh->zap_num_leafs; i++) {
2999                 off_t off = ((off_t)(i + 1)) << zl.l_bs;
3000
3001                 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize);
3002                 if (rc != 0)
3003                         goto done;
3004
3005                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
3006                         zap_leaf_chunk_t *zc;
3007
3008                         zc = &ZAP_LEAF_CHUNK(&zl, j);
3009                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
3010                                 continue;
3011                         if (zc->l_entry.le_value_intlen != 8 ||
3012                             zc->l_entry.le_value_numints != 1)
3013                                 continue;
3014
3015                         if (fzap_leaf_value(&zl, zc) == value) {
3016                                 fzap_name_copy(&zl, zc, name);
3017                                 goto done;
3018                         }
3019                 }
3020         }
3021
3022         rc = ENOENT;
3023 done:
3024         free(zl.l_phys);
3025         return (rc);
3026 }
3027
3028 static int
3029 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
3030     uint64_t value)
3031 {
3032         zap_phys_t *zap;
3033         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
3034         int rc;
3035
3036         zap = malloc(size);
3037         if (zap == NULL)
3038                 return (ENOMEM);
3039
3040         rc = dnode_read(spa, dnode, 0, zap, size);
3041         if (rc == 0) {
3042                 if (zap->zap_block_type == ZBT_MICRO)
3043                         rc = mzap_rlookup((const mzap_phys_t *)zap, size,
3044                             name, value);
3045                 else
3046                         rc = fzap_rlookup(spa, dnode, zap, name, value);
3047         }
3048         free(zap);
3049         return (rc);
3050 }
3051
3052 static int
3053 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
3054 {
3055         char name[256];
3056         char component[256];
3057         uint64_t dir_obj, parent_obj, child_dir_zapobj;
3058         dnode_phys_t child_dir_zap, snapnames_zap, dataset, dir, parent;
3059         dsl_dir_phys_t *dd;
3060         dsl_dataset_phys_t *ds;
3061         char *p;
3062         int len;
3063         boolean_t issnap = B_FALSE;
3064
3065         p = &name[sizeof(name) - 1];
3066         *p = '\0';
3067
3068         if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
3069                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3070                 return (EIO);
3071         }
3072         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3073         dir_obj = ds->ds_dir_obj;
3074         if (ds->ds_snapnames_zapobj == 0)
3075                 issnap = B_TRUE;
3076
3077         for (;;) {
3078                 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0)
3079                         return (EIO);
3080                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3081
3082                 /* Actual loop condition. */
3083                 parent_obj = dd->dd_parent_obj;
3084                 if (parent_obj == 0)
3085                         break;
3086
3087                 if (objset_get_dnode(spa, spa->spa_mos, parent_obj,
3088                     &parent) != 0)
3089                         return (EIO);
3090                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
3091                 if (issnap == B_TRUE) {
3092                         /*
3093                          * The dataset we are looking up is a snapshot
3094                          * the dir_obj is the parent already, we don't want
3095                          * the grandparent just yet. Reset to the parent.
3096                          */
3097                         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3098                         /* Lookup the dataset to get the snapname ZAP */
3099                         if (objset_get_dnode(spa, spa->spa_mos,
3100                             dd->dd_head_dataset_obj, &dataset))
3101                                 return (EIO);
3102                         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3103                         if (objset_get_dnode(spa, spa->spa_mos,
3104                             ds->ds_snapnames_zapobj, &snapnames_zap) != 0)
3105                                 return (EIO);
3106                         /* Get the name of the snapshot */
3107                         if (zap_rlookup(spa, &snapnames_zap, component,
3108                             objnum) != 0)
3109                                 return (EIO);
3110                         len = strlen(component);
3111                         p -= len;
3112                         memcpy(p, component, len);
3113                         --p;
3114                         *p = '@';
3115                         issnap = B_FALSE;
3116                         continue;
3117                 }
3118
3119                 child_dir_zapobj = dd->dd_child_dir_zapobj;
3120                 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3121                     &child_dir_zap) != 0)
3122                         return (EIO);
3123                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
3124                         return (EIO);
3125
3126                 len = strlen(component);
3127                 p -= len;
3128                 memcpy(p, component, len);
3129                 --p;
3130                 *p = '/';
3131
3132                 /* Actual loop iteration. */
3133                 dir_obj = parent_obj;
3134         }
3135
3136         if (*p != '\0')
3137                 ++p;
3138         strcpy(result, p);
3139
3140         return (0);
3141 }
3142
3143 static int
3144 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
3145 {
3146         char element[256];
3147         uint64_t dir_obj, child_dir_zapobj;
3148         dnode_phys_t child_dir_zap, snapnames_zap, dir, dataset;
3149         dsl_dir_phys_t *dd;
3150         dsl_dataset_phys_t *ds;
3151         const char *p, *q;
3152         boolean_t issnap = B_FALSE;
3153
3154         if (objset_get_dnode(spa, spa->spa_mos,
3155             DMU_POOL_DIRECTORY_OBJECT, &dir))
3156                 return (EIO);
3157         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
3158             1, &dir_obj))
3159                 return (EIO);
3160
3161         p = name;
3162         for (;;) {
3163                 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir))
3164                         return (EIO);
3165                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3166
3167                 while (*p == '/')
3168                         p++;
3169                 /* Actual loop condition #1. */
3170                 if (*p == '\0')
3171                         break;
3172
3173                 q = strchr(p, '/');
3174                 if (q) {
3175                         memcpy(element, p, q - p);
3176                         element[q - p] = '\0';
3177                         p = q + 1;
3178                 } else {
3179                         strcpy(element, p);
3180                         p += strlen(p);
3181                 }
3182
3183                 if (issnap == B_TRUE) {
3184                         if (objset_get_dnode(spa, spa->spa_mos,
3185                             dd->dd_head_dataset_obj, &dataset))
3186                                 return (EIO);
3187                         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3188                         if (objset_get_dnode(spa, spa->spa_mos,
3189                             ds->ds_snapnames_zapobj, &snapnames_zap) != 0)
3190                                 return (EIO);
3191                         /* Actual loop condition #2. */
3192                         if (zap_lookup(spa, &snapnames_zap, element,
3193                             sizeof (dir_obj), 1, &dir_obj) != 0)
3194                                 return (ENOENT);
3195                         *objnum = dir_obj;
3196                         return (0);
3197                 } else if ((q = strchr(element, '@')) != NULL) {
3198                         issnap = B_TRUE;
3199                         element[q - element] = '\0';
3200                         p = q + 1;
3201                 }
3202                 child_dir_zapobj = dd->dd_child_dir_zapobj;
3203                 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3204                     &child_dir_zap) != 0)
3205                         return (EIO);
3206
3207                 /* Actual loop condition #2. */
3208                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
3209                     1, &dir_obj) != 0)
3210                         return (ENOENT);
3211         }
3212
3213         *objnum = dd->dd_head_dataset_obj;
3214         return (0);
3215 }
3216
3217 #ifndef BOOT2
3218 static int
3219 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
3220 {
3221         uint64_t dir_obj, child_dir_zapobj;
3222         dnode_phys_t child_dir_zap, dir, dataset;
3223         dsl_dataset_phys_t *ds;
3224         dsl_dir_phys_t *dd;
3225
3226         if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
3227                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3228                 return (EIO);
3229         }
3230         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3231         dir_obj = ds->ds_dir_obj;
3232
3233         if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) {
3234                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
3235                 return (EIO);
3236         }
3237         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3238
3239         child_dir_zapobj = dd->dd_child_dir_zapobj;
3240         if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3241             &child_dir_zap) != 0) {
3242                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
3243                 return (EIO);
3244         }
3245
3246         return (zap_list(spa, &child_dir_zap) != 0);
3247 }
3248
3249 int
3250 zfs_callback_dataset(const spa_t *spa, uint64_t objnum,
3251     int (*callback)(const char *, uint64_t))
3252 {
3253         uint64_t dir_obj, child_dir_zapobj;
3254         dnode_phys_t child_dir_zap, dir, dataset;
3255         dsl_dataset_phys_t *ds;
3256         dsl_dir_phys_t *dd;
3257         zap_phys_t *zap;
3258         size_t size;
3259         int err;
3260
3261         err = objset_get_dnode(spa, spa->spa_mos, objnum, &dataset);
3262         if (err != 0) {
3263                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3264                 return (err);
3265         }
3266         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3267         dir_obj = ds->ds_dir_obj;
3268
3269         err = objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir);
3270         if (err != 0) {
3271                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
3272                 return (err);
3273         }
3274         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3275
3276         child_dir_zapobj = dd->dd_child_dir_zapobj;
3277         err = objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
3278             &child_dir_zap);
3279         if (err != 0) {
3280                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
3281                 return (err);
3282         }
3283
3284         size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3285         zap = malloc(size);
3286         if (zap != NULL) {
3287                 err = dnode_read(spa, &child_dir_zap, 0, zap, size);
3288                 if (err != 0)
3289                         goto done;
3290
3291                 if (zap->zap_block_type == ZBT_MICRO)
3292                         err = mzap_list((const mzap_phys_t *)zap, size,
3293                             callback);
3294                 else
3295                         err = fzap_list(spa, &child_dir_zap, zap, callback);
3296         } else {
3297                 err = ENOMEM;
3298         }
3299 done:
3300         free(zap);
3301         return (err);
3302 }
3303 #endif
3304
3305 /*
3306  * Find the object set given the object number of its dataset object
3307  * and return its details in *objset
3308  */
3309 static int
3310 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
3311 {
3312         dnode_phys_t dataset;
3313         dsl_dataset_phys_t *ds;
3314
3315         if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
3316                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3317                 return (EIO);
3318         }
3319
3320         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3321         if (zio_read(spa, &ds->ds_bp, objset)) {
3322                 printf("ZFS: can't read object set for dataset %ju\n",
3323                     (uintmax_t)objnum);
3324                 return (EIO);
3325         }
3326
3327         return (0);
3328 }
3329
3330 /*
3331  * Find the object set pointed to by the BOOTFS property or the root
3332  * dataset if there is none and return its details in *objset
3333  */
3334 static int
3335 zfs_get_root(const spa_t *spa, uint64_t *objid)
3336 {
3337         dnode_phys_t dir, propdir;
3338         uint64_t props, bootfs, root;
3339
3340         *objid = 0;
3341
3342         /*
3343          * Start with the MOS directory object.
3344          */
3345         if (objset_get_dnode(spa, spa->spa_mos,
3346             DMU_POOL_DIRECTORY_OBJECT, &dir)) {
3347                 printf("ZFS: can't read MOS object directory\n");
3348                 return (EIO);
3349         }
3350
3351         /*
3352          * Lookup the pool_props and see if we can find a bootfs.
3353          */
3354         if (zap_lookup(spa, &dir, DMU_POOL_PROPS,
3355             sizeof(props), 1, &props) == 0 &&
3356             objset_get_dnode(spa, spa->spa_mos, props, &propdir) == 0 &&
3357             zap_lookup(spa, &propdir, "bootfs",
3358             sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) {
3359                 *objid = bootfs;
3360                 return (0);
3361         }
3362         /*
3363          * Lookup the root dataset directory
3364          */
3365         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET,
3366             sizeof(root), 1, &root) ||
3367             objset_get_dnode(spa, spa->spa_mos, root, &dir)) {
3368                 printf("ZFS: can't find root dsl_dir\n");
3369                 return (EIO);
3370         }
3371
3372         /*
3373          * Use the information from the dataset directory's bonus buffer
3374          * to find the dataset object and from that the object set itself.
3375          */
3376         dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3377         *objid = dd->dd_head_dataset_obj;
3378         return (0);
3379 }
3380
3381 static int
3382 zfs_mount_impl(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
3383 {
3384
3385         mount->spa = spa;
3386
3387         /*
3388          * Find the root object set if not explicitly provided
3389          */
3390         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
3391                 printf("ZFS: can't find root filesystem\n");
3392                 return (EIO);
3393         }
3394
3395         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
3396                 printf("ZFS: can't open root filesystem\n");
3397                 return (EIO);
3398         }
3399
3400         mount->rootobj = rootobj;
3401
3402         return (0);
3403 }
3404
3405 /*
3406  * callback function for feature name checks.
3407  */
3408 static int
3409 check_feature(const char *name, uint64_t value)
3410 {
3411         int i;
3412
3413         if (value == 0)
3414                 return (0);
3415         if (name[0] == '\0')
3416                 return (0);
3417
3418         for (i = 0; features_for_read[i] != NULL; i++) {
3419                 if (strcmp(name, features_for_read[i]) == 0)
3420                         return (0);
3421         }
3422         printf("ZFS: unsupported feature: %s\n", name);
3423         return (EIO);
3424 }
3425
3426 /*
3427  * Checks whether the MOS features that are active are supported.
3428  */
3429 static int
3430 check_mos_features(const spa_t *spa)
3431 {
3432         dnode_phys_t dir;
3433         zap_phys_t *zap;
3434         uint64_t objnum;
3435         size_t size;
3436         int rc;
3437
3438         if ((rc = objset_get_dnode(spa, spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
3439             &dir)) != 0)
3440                 return (rc);
3441         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
3442             sizeof (objnum), 1, &objnum)) != 0) {
3443                 /*
3444                  * It is older pool without features. As we have already
3445                  * tested the label, just return without raising the error.
3446                  */
3447                 return (0);
3448         }
3449
3450         if ((rc = objset_get_dnode(spa, spa->spa_mos, objnum, &dir)) != 0)
3451                 return (rc);
3452
3453         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
3454                 return (EIO);
3455
3456         size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3457         zap = malloc(size);
3458         if (zap == NULL)
3459                 return (ENOMEM);
3460
3461         if (dnode_read(spa, &dir, 0, zap, size)) {
3462                 free(zap);
3463                 return (EIO);
3464         }
3465
3466         if (zap->zap_block_type == ZBT_MICRO)
3467                 rc = mzap_list((const mzap_phys_t *)zap, size, check_feature);
3468         else
3469                 rc = fzap_list(spa, &dir, zap, check_feature);
3470
3471         free(zap);
3472         return (rc);
3473 }
3474
3475 static int
3476 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
3477 {
3478         dnode_phys_t dir;
3479         size_t size;
3480         int rc;
3481         char *nv;
3482
3483         *value = NULL;
3484         if ((rc = objset_get_dnode(spa, spa->spa_mos, obj, &dir)) != 0)
3485                 return (rc);
3486         if (dir.dn_type != DMU_OT_PACKED_NVLIST &&
3487             dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
3488                 return (EIO);
3489         }
3490
3491         if (dir.dn_bonuslen != sizeof (uint64_t))
3492                 return (EIO);
3493
3494         size = *(uint64_t *)DN_BONUS(&dir);
3495         nv = malloc(size);
3496         if (nv == NULL)
3497                 return (ENOMEM);
3498
3499         rc = dnode_read(spa, &dir, 0, nv, size);
3500         if (rc != 0) {
3501                 free(nv);
3502                 nv = NULL;
3503                 return (rc);
3504         }
3505         *value = nvlist_import(nv, size);
3506         free(nv);
3507         return (rc);
3508 }
3509
3510 static int
3511 zfs_spa_init(spa_t *spa)
3512 {
3513         struct uberblock checkpoint;
3514         dnode_phys_t dir;
3515         uint64_t config_object;
3516         nvlist_t *nvlist;
3517         int rc;
3518
3519         if (zio_read(spa, &spa->spa_uberblock->ub_rootbp, spa->spa_mos)) {
3520                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
3521                 return (EIO);
3522         }
3523         if (spa->spa_mos->os_type != DMU_OST_META) {
3524                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
3525                 return (EIO);
3526         }
3527
3528         if (objset_get_dnode(spa, &spa->spa_mos_master,
3529             DMU_POOL_DIRECTORY_OBJECT, &dir)) {
3530                 printf("ZFS: failed to read pool %s directory object\n",
3531                     spa->spa_name);
3532                 return (EIO);
3533         }
3534         /* this is allowed to fail, older pools do not have salt */
3535         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
3536             sizeof (spa->spa_cksum_salt.zcs_bytes),
3537             spa->spa_cksum_salt.zcs_bytes);
3538
3539         rc = check_mos_features(spa);
3540         if (rc != 0) {
3541                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
3542                 return (rc);
3543         }
3544
3545         rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG,
3546             sizeof (config_object), 1, &config_object);
3547         if (rc != 0) {
3548                 printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG);
3549                 return (EIO);
3550         }
3551         rc = load_nvlist(spa, config_object, &nvlist);
3552         if (rc != 0)
3553                 return (rc);
3554
3555         rc = zap_lookup(spa, &dir, DMU_POOL_ZPOOL_CHECKPOINT,
3556             sizeof(uint64_t), sizeof(checkpoint) / sizeof(uint64_t),
3557             &checkpoint);
3558         if (rc == 0 && checkpoint.ub_checkpoint_txg != 0) {
3559                 memcpy(&spa->spa_uberblock_checkpoint, &checkpoint,
3560                     sizeof(checkpoint));
3561                 if (zio_read(spa, &spa->spa_uberblock_checkpoint.ub_rootbp,
3562                     &spa->spa_mos_checkpoint)) {
3563                         printf("ZFS: can not read checkpoint data.\n");
3564                         return (EIO);
3565                 }
3566         }
3567
3568         /*
3569          * Update vdevs from MOS config. Note, we do skip encoding bytes
3570          * here. See also vdev_label_read_config().
3571          */
3572         rc = vdev_init_from_nvlist(spa, nvlist);
3573         nvlist_destroy(nvlist);
3574         return (rc);
3575 }
3576
3577 static int
3578 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
3579 {
3580
3581         if (dn->dn_bonustype != DMU_OT_SA) {
3582                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
3583
3584                 sb->st_mode = zp->zp_mode;
3585                 sb->st_uid = zp->zp_uid;
3586                 sb->st_gid = zp->zp_gid;
3587                 sb->st_size = zp->zp_size;
3588         } else {
3589                 sa_hdr_phys_t *sahdrp;
3590                 int hdrsize;
3591                 size_t size = 0;
3592                 void *buf = NULL;
3593
3594                 if (dn->dn_bonuslen != 0)
3595                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3596                 else {
3597                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
3598                                 blkptr_t *bp = DN_SPILL_BLKPTR(dn);
3599                                 int error;
3600
3601                                 size = BP_GET_LSIZE(bp);
3602                                 buf = malloc(size);
3603                                 if (buf == NULL)
3604                                         error = ENOMEM;
3605                                 else
3606                                         error = zio_read(spa, bp, buf);
3607
3608                                 if (error != 0) {
3609                                         free(buf);
3610                                         return (error);
3611                                 }
3612                                 sahdrp = buf;
3613                         } else {
3614                                 return (EIO);
3615                         }
3616                 }
3617                 hdrsize = SA_HDR_SIZE(sahdrp);
3618                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
3619                     SA_MODE_OFFSET);
3620                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
3621                     SA_UID_OFFSET);
3622                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
3623                     SA_GID_OFFSET);
3624                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
3625                     SA_SIZE_OFFSET);
3626                 free(buf);
3627         }
3628
3629         return (0);
3630 }
3631
3632 static int
3633 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
3634 {
3635         int rc = 0;
3636
3637         if (dn->dn_bonustype == DMU_OT_SA) {
3638                 sa_hdr_phys_t *sahdrp = NULL;
3639                 size_t size = 0;
3640                 void *buf = NULL;
3641                 int hdrsize;
3642                 char *p;
3643
3644                 if (dn->dn_bonuslen != 0) {
3645                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3646                 } else {
3647                         blkptr_t *bp;
3648
3649                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
3650                                 return (EIO);
3651                         bp = DN_SPILL_BLKPTR(dn);
3652
3653                         size = BP_GET_LSIZE(bp);
3654                         buf = malloc(size);
3655                         if (buf == NULL)
3656                                 rc = ENOMEM;
3657                         else
3658                                 rc = zio_read(spa, bp, buf);
3659                         if (rc != 0) {
3660                                 free(buf);
3661                                 return (rc);
3662                         }
3663                         sahdrp = buf;
3664                 }
3665                 hdrsize = SA_HDR_SIZE(sahdrp);
3666                 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
3667                 memcpy(path, p, psize);
3668                 free(buf);
3669                 return (0);
3670         }
3671         /*
3672          * Second test is purely to silence bogus compiler
3673          * warning about accessing past the end of dn_bonus.
3674          */
3675         if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
3676             sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
3677                 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
3678         } else {
3679                 rc = dnode_read(spa, dn, 0, path, psize);
3680         }
3681         return (rc);
3682 }
3683
/*
 * One element of the object-number stack kept by zfs_lookup() while it
 * walks a path.
 */
struct obj_list {
	uint64_t		objnum;	/* object number of this path component */
	STAILQ_ENTRY(obj_list)	entry;	/* linkage within the lookup stack */
};
3688
3689 /*
3690  * Lookup a file and return its dnode.
3691  */
3692 static int
3693 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
3694 {
3695         int rc;
3696         uint64_t objnum;
3697         const spa_t *spa;
3698         dnode_phys_t dn;
3699         const char *p, *q;
3700         char element[256];
3701         char path[1024];
3702         int symlinks_followed = 0;
3703         struct stat sb;
3704         struct obj_list *entry, *tentry;
3705         STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
3706
3707         spa = mount->spa;
3708         if (mount->objset.os_type != DMU_OST_ZFS) {
3709                 printf("ZFS: unexpected object set type %ju\n",
3710                     (uintmax_t)mount->objset.os_type);
3711                 return (EIO);
3712         }
3713
3714         if ((entry = malloc(sizeof(struct obj_list))) == NULL)
3715                 return (ENOMEM);
3716
3717         /*
3718          * Get the root directory dnode.
3719          */
3720         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
3721         if (rc) {
3722                 free(entry);
3723                 return (rc);
3724         }
3725
3726         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum);
3727         if (rc) {
3728                 free(entry);
3729                 return (rc);
3730         }
3731         entry->objnum = objnum;
3732         STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3733
3734         rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3735         if (rc != 0)
3736                 goto done;
3737
3738         p = upath;
3739         while (p && *p) {
3740                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3741                 if (rc != 0)
3742                         goto done;
3743
3744                 while (*p == '/')
3745                         p++;
3746                 if (*p == '\0')
3747                         break;
3748                 q = p;
3749                 while (*q != '\0' && *q != '/')
3750                         q++;
3751
3752                 /* skip dot */
3753                 if (p + 1 == q && p[0] == '.') {
3754                         p++;
3755                         continue;
3756                 }
3757                 /* double dot */
3758                 if (p + 2 == q && p[0] == '.' && p[1] == '.') {
3759                         p += 2;
3760                         if (STAILQ_FIRST(&on_cache) ==
3761                             STAILQ_LAST(&on_cache, obj_list, entry)) {
3762                                 rc = ENOENT;
3763                                 goto done;
3764                         }
3765                         entry = STAILQ_FIRST(&on_cache);
3766                         STAILQ_REMOVE_HEAD(&on_cache, entry);
3767                         free(entry);
3768                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
3769                         continue;
3770                 }
3771                 if (q - p + 1 > sizeof(element)) {
3772                         rc = ENAMETOOLONG;
3773                         goto done;
3774                 }
3775                 memcpy(element, p, q - p);
3776                 element[q - p] = 0;
3777                 p = q;
3778
3779                 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
3780                         goto done;
3781                 if (!S_ISDIR(sb.st_mode)) {
3782                         rc = ENOTDIR;
3783                         goto done;
3784                 }
3785
3786                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
3787                 if (rc)
3788                         goto done;
3789                 objnum = ZFS_DIRENT_OBJ(objnum);
3790
3791                 if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
3792                         rc = ENOMEM;
3793                         goto done;
3794                 }
3795                 entry->objnum = objnum;
3796                 STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3797                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
3798                 if (rc)
3799                         goto done;
3800
3801                 /*
3802                  * Check for symlink.
3803                  */
3804                 rc = zfs_dnode_stat(spa, &dn, &sb);
3805                 if (rc)
3806                         goto done;
3807                 if (S_ISLNK(sb.st_mode)) {
3808                         if (symlinks_followed > 10) {
3809                                 rc = EMLINK;
3810                                 goto done;
3811                         }
3812                         symlinks_followed++;
3813
3814                         /*
3815                          * Read the link value and copy the tail of our
3816                          * current path onto the end.
3817                          */
3818                         if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
3819                                 rc = ENAMETOOLONG;
3820                                 goto done;
3821                         }
3822                         strcpy(&path[sb.st_size], p);
3823
3824                         rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
3825                         if (rc != 0)
3826                                 goto done;
3827
3828                         /*
3829                          * Restart with the new path, starting either at
3830                          * the root or at the parent depending whether or
3831                          * not the link is relative.
3832                          */
3833                         p = path;
3834                         if (*p == '/') {
3835                                 while (STAILQ_FIRST(&on_cache) !=
3836                                     STAILQ_LAST(&on_cache, obj_list, entry)) {
3837                                         entry = STAILQ_FIRST(&on_cache);
3838                                         STAILQ_REMOVE_HEAD(&on_cache, entry);
3839                                         free(entry);
3840                                 }
3841                         } else {
3842                                 entry = STAILQ_FIRST(&on_cache);
3843                                 STAILQ_REMOVE_HEAD(&on_cache, entry);
3844                                 free(entry);
3845                         }
3846                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
3847                 }
3848         }
3849
3850         *dnode = dn;
3851 done:
3852         STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
3853                 free(entry);
3854         return (rc);
3855 }
3856
3857 /*
3858  * Return either a cached copy of the bootenv, or read each of the vdev children
3859  * looking for the bootenv. Cache what's found and return the results. Returns 0
3860  * when benvp is filled in, and some errno when not.
3861  */
3862 static int
3863 zfs_get_bootenv_spa(spa_t *spa, nvlist_t **benvp)
3864 {
3865         vdev_t *vd;
3866         nvlist_t *benv = NULL;
3867
3868         if (spa->spa_bootenv == NULL) {
3869                 STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children,
3870                     v_childlink) {
3871                         benv = vdev_read_bootenv(vd);
3872
3873                         if (benv != NULL)
3874                                 break;
3875                 }
3876                 spa->spa_bootenv = benv;
3877         }
3878         benv = spa->spa_bootenv;
3879
3880         if (benv == NULL)
3881                 return (ENOENT);
3882
3883         *benvp = benv;
3884         return (0);
3885 }
3886
3887 /*
3888  * Store nvlist to pool label bootenv area. Also updates cached pointer in spa.
3889  */
3890 static int
3891 zfs_set_bootenv_spa(spa_t *spa, nvlist_t *benv)
3892 {
3893         vdev_t *vd;
3894
3895         STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
3896                 vdev_write_bootenv(vd, benv);
3897         }
3898
3899         spa->spa_bootenv = benv;
3900         return (0);
3901 }
3902
3903 /*
3904  * Get bootonce value by key. The bootonce <key, value> pair is removed from the
3905  * bootenv nvlist and the remaining nvlist is committed back to disk. This process
3906  * the bootonce flag since we've reached the point in the boot that we've 'used'
3907  * the BE. For chained boot scenarios, we may reach this point multiple times (but
3908  * only remove it and return 0 the first time).
3909  */
3910 static int
3911 zfs_get_bootonce_spa(spa_t *spa, const char *key, char *buf, size_t size)
3912 {
3913         nvlist_t *benv;
3914         char *result = NULL;
3915         int result_size, rv;
3916
3917         if ((rv = zfs_get_bootenv_spa(spa, &benv)) != 0)
3918                 return (rv);
3919
3920         if ((rv = nvlist_find(benv, key, DATA_TYPE_STRING, NULL,
3921             &result, &result_size)) == 0) {
3922                 if (result_size == 0) {
3923                         /* ignore empty string */
3924                         rv = ENOENT;
3925                 } else if (buf != NULL) {
3926                         size = MIN((size_t)result_size + 1, size);
3927                         strlcpy(buf, result, size);
3928                 }
3929                 (void)nvlist_remove(benv, key, DATA_TYPE_STRING);
3930                 (void)zfs_set_bootenv_spa(spa, benv);
3931         }
3932
3933         return (rv);
3934 }