]> CyberLeo.Net >> Repos - FreeBSD/releng/8.2.git/blob - sys/boot/zfs/zfsimpl.c
Copy stable/8 to releng/8.2 in preparation for FreeBSD-8.2 release.
[FreeBSD/releng/8.2.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include "zfsimpl.h"
35 #include "zfssubr.c"
36
37 /*
38  * List of all vdevs, chained through v_alllink.
39  */
40 static vdev_list_t zfs_vdevs;
41
42 /*
43  * List of all pools, chained through spa_link.
44  */
45 static spa_list_t zfs_pools;
46
47 static uint64_t zfs_crc64_table[256];
48 static const dnode_phys_t *dnode_cache_obj = 0;
49 static uint64_t dnode_cache_bn;
50 static char *dnode_cache_buf;
51 static char *zap_scratch;
52 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
53
54 #define TEMP_SIZE       (1024 * 1024)
55
56 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
57
58 static void
59 zfs_init(void)
60 {
61         STAILQ_INIT(&zfs_vdevs);
62         STAILQ_INIT(&zfs_pools);
63
64         zfs_temp_buf = malloc(TEMP_SIZE);
65         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
66         zfs_temp_ptr = zfs_temp_buf;
67         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
68         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
69
70         zfs_init_crc();
71 }
72
73 static char *
74 zfs_alloc_temp(size_t sz)
75 {
76         char *p;
77
78         if (zfs_temp_ptr + sz > zfs_temp_end) {
79                 printf("ZFS: out of temporary buffer space\n");
80                 for (;;) ;
81         }
82         p = zfs_temp_ptr;
83         zfs_temp_ptr += sz;
84
85         return (p);
86 }
87
88 static void
89 zfs_reset_temp(void)
90 {
91
92         zfs_temp_ptr = zfs_temp_buf;
93 }
94
/*
 * Decode one big-endian 32-bit integer from the XDR stream at *xdr into *ip
 * and advance *xdr by four bytes. Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value in an unsigned type: shifting a promoted
	 * unsigned char >= 0x80 left by 24 in plain int overflows, which is
	 * undefined behaviour.
	 */
	v = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	    ((uint32_t)p[2] << 8) | (uint32_t)p[3];
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
105
106 static int
107 xdr_u_int(const unsigned char **xdr, u_int *ip)
108 {
109         *ip = ((*xdr)[0] << 24)
110                 | ((*xdr)[1] << 16)
111                 | ((*xdr)[2] << 8)
112                 | ((*xdr)[3] << 0);
113         (*xdr) += 4;
114         return (0);
115 }
116
117 static int
118 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
119 {
120         u_int hi, lo;
121
122         xdr_u_int(xdr, &hi);
123         xdr_u_int(xdr, &lo);
124         *lp = (((uint64_t) hi) << 32) | lo;
125         return (0);
126 }
127
128 static int
129 nvlist_find(const unsigned char *nvlist, const char *name, int type,
130             int* elementsp, void *valuep)
131 {
132         const unsigned char *p, *pair;
133         int junk;
134         int encoded_size, decoded_size;
135
136         p = nvlist;
137         xdr_int(&p, &junk);
138         xdr_int(&p, &junk);
139
140         pair = p;
141         xdr_int(&p, &encoded_size);
142         xdr_int(&p, &decoded_size);
143         while (encoded_size && decoded_size) {
144                 int namelen, pairtype, elements;
145                 const char *pairname;
146
147                 xdr_int(&p, &namelen);
148                 pairname = (const char*) p;
149                 p += roundup(namelen, 4);
150                 xdr_int(&p, &pairtype);
151
152                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
153                         xdr_int(&p, &elements);
154                         if (elementsp)
155                                 *elementsp = elements;
156                         if (type == DATA_TYPE_UINT64) {
157                                 xdr_uint64_t(&p, (uint64_t *) valuep);
158                                 return (0);
159                         } else if (type == DATA_TYPE_STRING) {
160                                 int len;
161                                 xdr_int(&p, &len);
162                                 (*(const char**) valuep) = (const char*) p;
163                                 return (0);
164                         } else if (type == DATA_TYPE_NVLIST
165                                    || type == DATA_TYPE_NVLIST_ARRAY) {
166                                 (*(const unsigned char**) valuep) =
167                                          (const unsigned char*) p;
168                                 return (0);
169                         } else {
170                                 return (EIO);
171                         }
172                 } else {
173                         /*
174                          * Not the pair we are looking for, skip to the next one.
175                          */
176                         p = pair + encoded_size;
177                 }
178
179                 pair = p;
180                 xdr_int(&p, &encoded_size);
181                 xdr_int(&p, &decoded_size);
182         }
183
184         return (EIO);
185 }
186
187 /*
188  * Return the next nvlist in an nvlist array.
189  */
/*
 * Given a pointer to one encoded nvlist, return a pointer just past it,
 * i.e. past the pair header whose sizes are zero. Used to walk the members
 * of an nvlist array.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cur, *start;
	int unused, enc_len, dec_len;

	/* Skip the two nvlist header words. */
	cur = nvlist;
	xdr_int(&cur, &unused);
	xdr_int(&cur, &unused);

	for (;;) {
		start = cur;
		xdr_int(&cur, &enc_len);
		xdr_int(&cur, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		/* Jump over the whole pair using its encoded size. */
		cur = start + enc_len;
	}

	return cur;
}
214
215 #ifdef TEST
216
217 static const unsigned char *
218 nvlist_print(const unsigned char *nvlist, unsigned int indent)
219 {
220         static const char* typenames[] = {
221                 "DATA_TYPE_UNKNOWN",
222                 "DATA_TYPE_BOOLEAN",
223                 "DATA_TYPE_BYTE",
224                 "DATA_TYPE_INT16",
225                 "DATA_TYPE_UINT16",
226                 "DATA_TYPE_INT32",
227                 "DATA_TYPE_UINT32",
228                 "DATA_TYPE_INT64",
229                 "DATA_TYPE_UINT64",
230                 "DATA_TYPE_STRING",
231                 "DATA_TYPE_BYTE_ARRAY",
232                 "DATA_TYPE_INT16_ARRAY",
233                 "DATA_TYPE_UINT16_ARRAY",
234                 "DATA_TYPE_INT32_ARRAY",
235                 "DATA_TYPE_UINT32_ARRAY",
236                 "DATA_TYPE_INT64_ARRAY",
237                 "DATA_TYPE_UINT64_ARRAY",
238                 "DATA_TYPE_STRING_ARRAY",
239                 "DATA_TYPE_HRTIME",
240                 "DATA_TYPE_NVLIST",
241                 "DATA_TYPE_NVLIST_ARRAY",
242                 "DATA_TYPE_BOOLEAN_VALUE",
243                 "DATA_TYPE_INT8",
244                 "DATA_TYPE_UINT8",
245                 "DATA_TYPE_BOOLEAN_ARRAY",
246                 "DATA_TYPE_INT8_ARRAY",
247                 "DATA_TYPE_UINT8_ARRAY"
248         };
249
250         unsigned int i, j;
251         const unsigned char *p, *pair;
252         int junk;
253         int encoded_size, decoded_size;
254
255         p = nvlist;
256         xdr_int(&p, &junk);
257         xdr_int(&p, &junk);
258
259         pair = p;
260         xdr_int(&p, &encoded_size);
261         xdr_int(&p, &decoded_size);
262         while (encoded_size && decoded_size) {
263                 int namelen, pairtype, elements;
264                 const char *pairname;
265
266                 xdr_int(&p, &namelen);
267                 pairname = (const char*) p;
268                 p += roundup(namelen, 4);
269                 xdr_int(&p, &pairtype);
270
271                 for (i = 0; i < indent; i++)
272                         printf(" ");
273                 printf("%s %s", typenames[pairtype], pairname);
274
275                 xdr_int(&p, &elements);
276                 switch (pairtype) {
277                 case DATA_TYPE_UINT64: {
278                         uint64_t val;
279                         xdr_uint64_t(&p, &val);
280                         printf(" = 0x%llx\n", val);
281                         break;
282                 }
283
284                 case DATA_TYPE_STRING: {
285                         int len;
286                         xdr_int(&p, &len);
287                         printf(" = \"%s\"\n", p);
288                         break;
289                 }
290
291                 case DATA_TYPE_NVLIST:
292                         printf("\n");
293                         nvlist_print(p, indent + 1);
294                         break;
295
296                 case DATA_TYPE_NVLIST_ARRAY:
297                         for (j = 0; j < elements; j++) {
298                                 printf("[%d]\n", j);
299                                 p = nvlist_print(p, indent + 1);
300                                 if (j != elements - 1) {
301                                         for (i = 0; i < indent; i++)
302                                                 printf(" ");
303                                         printf("%s %s", typenames[pairtype], pairname);
304                                 }
305                         }
306                         break;
307
308                 default:
309                         printf("\n");
310                 }
311
312                 p = pair + encoded_size;
313
314                 pair = p;
315                 xdr_int(&p, &encoded_size);
316                 xdr_int(&p, &decoded_size);
317         }
318
319         return p;
320 }
321
322 #endif
323
324 static int
325 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
326     off_t offset, size_t size)
327 {
328         size_t psize;
329         int rc;
330
331         if (!vdev->v_phys_read)
332                 return (EIO);
333
334         if (bp) {
335                 psize = BP_GET_PSIZE(bp);
336         } else {
337                 psize = size;
338         }
339
340         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
341         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
342         if (rc)
343                 return (rc);
344         if (bp && zio_checksum_error(bp, buf))
345                 return (EIO);
346
347         return (0);
348 }
349
350 static int
351 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
352     off_t offset, size_t bytes)
353 {
354
355         return (vdev_read_phys(vdev, bp, buf,
356                 offset + VDEV_LABEL_START_SIZE, bytes));
357 }
358
359
360 static int
361 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
362     off_t offset, size_t bytes)
363 {
364         vdev_t *kid;
365         int rc;
366
367         rc = EIO;
368         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
369                 if (kid->v_state != VDEV_STATE_HEALTHY)
370                         continue;
371                 rc = kid->v_read(kid, bp, buf, offset, bytes);
372                 if (!rc)
373                         return (0);
374         }
375
376         return (rc);
377 }
378
379 static int
380 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
381     off_t offset, size_t bytes)
382 {
383         vdev_t *kid;
384
385         /*
386          * Here we should have two kids:
387          * First one which is the one we are replacing and we can trust
388          * only this one to have valid data, but it might not be present.
389          * Second one is that one we are replacing with. It is most likely
390          * healthy, but we can't trust it has needed data, so we won't use it.
391          */
392         kid = STAILQ_FIRST(&vdev->v_children);
393         if (kid == NULL)
394                 return (EIO);
395         if (kid->v_state != VDEV_STATE_HEALTHY)
396                 return (EIO);
397         return (kid->v_read(kid, bp, buf, offset, bytes));
398 }
399
400 static vdev_t *
401 vdev_find(uint64_t guid)
402 {
403         vdev_t *vdev;
404
405         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
406                 if (vdev->v_guid == guid)
407                         return (vdev);
408
409         return (0);
410 }
411
412 static vdev_t *
413 vdev_create(uint64_t guid, vdev_read_t *read)
414 {
415         vdev_t *vdev;
416
417         vdev = malloc(sizeof(vdev_t));
418         memset(vdev, 0, sizeof(vdev_t));
419         STAILQ_INIT(&vdev->v_children);
420         vdev->v_guid = guid;
421         vdev->v_state = VDEV_STATE_OFFLINE;
422         vdev->v_read = read;
423         vdev->v_phys_read = 0;
424         vdev->v_read_priv = 0;
425         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
426
427         return (vdev);
428 }
429
430 static int
431 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
432 {
433         int rc;
434         uint64_t guid, id, ashift, nparity;
435         const char *type;
436         const char *path;
437         vdev_t *vdev, *kid;
438         const unsigned char *kids;
439         int nkids, i, is_new;
440         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
441
442         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
443                         DATA_TYPE_UINT64, 0, &guid)
444             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
445                            DATA_TYPE_UINT64, 0, &id)
446             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
447                            DATA_TYPE_STRING, 0, &type)) {
448                 printf("ZFS: can't find vdev details\n");
449                 return (ENOENT);
450         }
451
452         if (strcmp(type, VDEV_TYPE_MIRROR)
453             && strcmp(type, VDEV_TYPE_DISK)
454             && strcmp(type, VDEV_TYPE_RAIDZ)
455             && strcmp(type, VDEV_TYPE_REPLACING)) {
456                 printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
457                 return (EIO);
458         }
459
460         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
461
462         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
463                         &is_offline);
464         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
465                         &is_removed);
466         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
467                         &is_faulted);
468         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
469                         &is_degraded);
470         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
471                         &isnt_present);
472
473         vdev = vdev_find(guid);
474         if (!vdev) {
475                 is_new = 1;
476
477                 if (!strcmp(type, VDEV_TYPE_MIRROR))
478                         vdev = vdev_create(guid, vdev_mirror_read);
479                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
480                         vdev = vdev_create(guid, vdev_raidz_read);
481                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
482                         vdev = vdev_create(guid, vdev_replacing_read);
483                 else
484                         vdev = vdev_create(guid, vdev_disk_read);
485
486                 vdev->v_id = id;
487                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
488                         DATA_TYPE_UINT64, 0, &ashift) == 0)
489                         vdev->v_ashift = ashift;
490                 else
491                         vdev->v_ashift = 0;
492                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
493                         DATA_TYPE_UINT64, 0, &nparity) == 0)
494                         vdev->v_nparity = nparity;
495                 else
496                         vdev->v_nparity = 0;
497                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
498                                 DATA_TYPE_STRING, 0, &path) == 0) {
499                         if (strncmp(path, "/dev/", 5) == 0)
500                                 path += 5;
501                         vdev->v_name = strdup(path);
502                 } else {
503                         if (!strcmp(type, "raidz")) {
504                                 if (vdev->v_nparity == 1)
505                                         vdev->v_name = "raidz1";
506                                 else
507                                         vdev->v_name = "raidz2";
508                         } else {
509                                 vdev->v_name = strdup(type);
510                         }
511                 }
512         } else {
513                 is_new = 0;
514         }
515
516         if (is_new || is_newer) {
517                 /*
518                  * This is either new vdev or we've already seen this vdev,
519                  * but from an older vdev label, so let's refresh its state
520                  * from the newer label.
521                  */
522                 if (is_offline)
523                         vdev->v_state = VDEV_STATE_OFFLINE;
524                 else if (is_removed)
525                         vdev->v_state = VDEV_STATE_REMOVED;
526                 else if (is_faulted)
527                         vdev->v_state = VDEV_STATE_FAULTED;
528                 else if (is_degraded)
529                         vdev->v_state = VDEV_STATE_DEGRADED;
530                 else if (isnt_present)
531                         vdev->v_state = VDEV_STATE_CANT_OPEN;
532                 else
533                         vdev->v_state = VDEV_STATE_HEALTHY;
534         }
535
536         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
537                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
538         /*
539          * Its ok if we don't have any kids.
540          */
541         if (rc == 0) {
542                 vdev->v_nchildren = nkids;
543                 for (i = 0; i < nkids; i++) {
544                         rc = vdev_init_from_nvlist(kids, &kid, is_newer);
545                         if (rc)
546                                 return (rc);
547                         if (is_new)
548                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
549                                                    v_childlink);
550                         kids = nvlist_next(kids);
551                 }
552         } else {
553                 vdev->v_nchildren = 0;
554         }
555
556         if (vdevp)
557                 *vdevp = vdev;
558         return (0);
559 }
560
561 static void
562 vdev_set_state(vdev_t *vdev)
563 {
564         vdev_t *kid;
565         int good_kids;
566         int bad_kids;
567
568         /*
569          * A mirror or raidz is healthy if all its kids are healthy. A
570          * mirror is degraded if any of its kids is healthy; a raidz
571          * is degraded if at most nparity kids are offline.
572          */
573         if (STAILQ_FIRST(&vdev->v_children)) {
574                 good_kids = 0;
575                 bad_kids = 0;
576                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
577                         if (kid->v_state == VDEV_STATE_HEALTHY)
578                                 good_kids++;
579                         else
580                                 bad_kids++;
581                 }
582                 if (bad_kids == 0) {
583                         vdev->v_state = VDEV_STATE_HEALTHY;
584                 } else {
585                         if (vdev->v_read == vdev_mirror_read) {
586                                 if (good_kids) {
587                                         vdev->v_state = VDEV_STATE_DEGRADED;
588                                 } else {
589                                         vdev->v_state = VDEV_STATE_OFFLINE;
590                                 }
591                         } else if (vdev->v_read == vdev_raidz_read) {
592                                 if (bad_kids > vdev->v_nparity) {
593                                         vdev->v_state = VDEV_STATE_OFFLINE;
594                                 } else {
595                                         vdev->v_state = VDEV_STATE_DEGRADED;
596                                 }
597                         }
598                 }
599         }
600 }
601
602 static spa_t *
603 spa_find_by_guid(uint64_t guid)
604 {
605         spa_t *spa;
606
607         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
608                 if (spa->spa_guid == guid)
609                         return (spa);
610
611         return (0);
612 }
613
614 #ifdef BOOT2
615
616 static spa_t *
617 spa_find_by_name(const char *name)
618 {
619         spa_t *spa;
620
621         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
622                 if (!strcmp(spa->spa_name, name))
623                         return (spa);
624
625         return (0);
626 }
627
628 #endif
629
630 static spa_t *
631 spa_create(uint64_t guid)
632 {
633         spa_t *spa;
634
635         spa = malloc(sizeof(spa_t));
636         memset(spa, 0, sizeof(spa_t));
637         STAILQ_INIT(&spa->spa_vdevs);
638         spa->spa_guid = guid;
639         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
640
641         return (spa);
642 }
643
644 static const char *
645 state_name(vdev_state_t state)
646 {
647         static const char* names[] = {
648                 "UNKNOWN",
649                 "CLOSED",
650                 "OFFLINE",
651                 "REMOVED",
652                 "CANT_OPEN",
653                 "FAULTED",
654                 "DEGRADED",
655                 "ONLINE"
656         };
657         return names[state];
658 }
659
#ifdef BOOT2

/* Under BOOT2, pager_printf is simply another name for printf. */
#define pager_printf printf

#else

/*
 * printf-style front end to pager_output(): the message is formatted with
 * vsprintf() into a local 80-byte buffer, which is then passed to the pager.
 *
 * NOTE(review): vsprintf() does not bound its output, so the formatted text
 * (terminator included) has to fit in those 80 bytes - confirm that every
 * caller guarantees this.
 */
static void
pager_printf(const char *fmt, ...)
{
	va_list ap;
	char text[80];

	va_start(ap, fmt);
	vsprintf(text, fmt, ap);
	va_end(ap);

	pager_output(text);
}

#endif
679
680 #define STATUS_FORMAT   "        %s %s\n"
681
682 static void
683 print_state(int indent, const char *name, vdev_state_t state)
684 {
685         int i;
686         char buf[512];
687
688         buf[0] = 0;
689         for (i = 0; i < indent; i++)
690                 strcat(buf, "  ");
691         strcat(buf, name);
692         pager_printf(STATUS_FORMAT, buf, state_name(state));
693         
694 }
695
696 static void
697 vdev_status(vdev_t *vdev, int indent)
698 {
699         vdev_t *kid;
700         print_state(indent, vdev->v_name, vdev->v_state);
701
702         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
703                 vdev_status(kid, indent + 1);
704         }
705 }
706
707 static void
708 spa_status(spa_t *spa)
709 {
710         vdev_t *vdev;
711         int good_kids, bad_kids, degraded_kids;
712         vdev_state_t state;
713
714         pager_printf("  pool: %s\n", spa->spa_name);
715         pager_printf("config:\n\n");
716         pager_printf(STATUS_FORMAT, "NAME", "STATE");
717
718         good_kids = 0;
719         degraded_kids = 0;
720         bad_kids = 0;
721         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
722                 if (vdev->v_state == VDEV_STATE_HEALTHY)
723                         good_kids++;
724                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
725                         degraded_kids++;
726                 else
727                         bad_kids++;
728         }
729
730         state = VDEV_STATE_CLOSED;
731         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
732                 state = VDEV_STATE_HEALTHY;
733         else if ((good_kids + degraded_kids) > 0)
734                 state = VDEV_STATE_DEGRADED;
735
736         print_state(0, spa->spa_name, state);
737         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
738                 vdev_status(vdev, 1);
739         }
740 }
741
742 static void
743 spa_all_status(void)
744 {
745         spa_t *spa;
746         int first = 1;
747
748         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
749                 if (!first)
750                         pager_printf("\n");
751                 first = 0;
752                 spa_status(spa);
753         }
754 }
755
756 static int
757 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
758 {
759         vdev_t vtmp;
760         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
761         spa_t *spa;
762         vdev_t *vdev, *top_vdev, *pool_vdev;
763         off_t off;
764         blkptr_t bp;
765         const unsigned char *nvlist;
766         uint64_t val;
767         uint64_t guid;
768         uint64_t pool_txg, pool_guid;
769         uint64_t is_log;
770         const char *pool_name;
771         const unsigned char *vdevs;
772         int i, rc, is_newer;
773         char upbuf[1024];
774         const struct uberblock *up;
775
776         /*
777          * Load the vdev label and figure out which
778          * uberblock is most current.
779          */
780         memset(&vtmp, 0, sizeof(vtmp));
781         vtmp.v_phys_read = read;
782         vtmp.v_read_priv = read_priv;
783         off = offsetof(vdev_label_t, vl_vdev_phys);
784         BP_ZERO(&bp);
785         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
786         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
787         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
788         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
789         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
790         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
791                 return (EIO);
792
793         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
794                 return (EIO);
795         }
796
797         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
798
799         if (nvlist_find(nvlist,
800                         ZPOOL_CONFIG_VERSION,
801                         DATA_TYPE_UINT64, 0, &val)) {
802                 return (EIO);
803         }
804
805         if (val > SPA_VERSION) {
806                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
807                     (unsigned) val, (unsigned) SPA_VERSION);
808                 return (EIO);
809         }
810
811         if (nvlist_find(nvlist,
812                         ZPOOL_CONFIG_POOL_STATE,
813                         DATA_TYPE_UINT64, 0, &val)) {
814                 return (EIO);
815         }
816
817 #ifndef TEST
818         if (val != POOL_STATE_ACTIVE) {
819                 /*
820                  * Don't print a message here. If we happen to reboot
821                  * while where is an exported pool around, we don't
822                  * need a cascade of confusing messages during boot.
823                  */
824                 /*printf("ZFS: pool is not active\n");*/
825                 return (EIO);
826         }
827 #endif
828
829         if (nvlist_find(nvlist,
830                         ZPOOL_CONFIG_POOL_TXG,
831                         DATA_TYPE_UINT64, 0, &pool_txg)
832             || nvlist_find(nvlist,
833                            ZPOOL_CONFIG_POOL_GUID,
834                            DATA_TYPE_UINT64, 0, &pool_guid)
835             || nvlist_find(nvlist,
836                            ZPOOL_CONFIG_POOL_NAME,
837                            DATA_TYPE_STRING, 0, &pool_name)) {
838                 /*
839                  * Cache and spare devices end up here - just ignore
840                  * them.
841                  */
842                 /*printf("ZFS: can't find pool details\n");*/
843                 return (EIO);
844         }
845
846         is_log = 0;
847         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
848             &is_log);
849         if (is_log)
850                 return (EIO);
851
852         /*
853          * Create the pool if this is the first time we've seen it.
854          */
855         spa = spa_find_by_guid(pool_guid);
856         if (!spa) {
857                 spa = spa_create(pool_guid);
858                 spa->spa_name = strdup(pool_name);
859         }
860         if (pool_txg > spa->spa_txg) {
861                 spa->spa_txg = pool_txg;
862                 is_newer = 1;
863         } else
864                 is_newer = 0;
865
866         /*
867          * Get the vdev tree and create our in-core copy of it.
868          * If we already have a vdev with this guid, this must
869          * be some kind of alias (overlapping slices, dangerously dedicated
870          * disks etc).
871          */
872         if (nvlist_find(nvlist,
873                         ZPOOL_CONFIG_GUID,
874                         DATA_TYPE_UINT64, 0, &guid)) {
875                 return (EIO);
876         }
877         vdev = vdev_find(guid);
878         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
879                 return (EIO);
880
881         if (nvlist_find(nvlist,
882                         ZPOOL_CONFIG_VDEV_TREE,
883                         DATA_TYPE_NVLIST, 0, &vdevs)) {
884                 return (EIO);
885         }
886
887         rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
888         if (rc)
889                 return (rc);
890
891         /*
892          * Add the toplevel vdev to the pool if its not already there.
893          */
894         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
895                 if (top_vdev == pool_vdev)
896                         break;
897         if (!pool_vdev && top_vdev)
898                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
899
900         /*
901          * We should already have created an incomplete vdev for this
902          * vdev. Find it and initialise it with our read proc.
903          */
904         vdev = vdev_find(guid);
905         if (vdev) {
906                 vdev->v_phys_read = read;
907                 vdev->v_read_priv = read_priv;
908         } else {
909                 printf("ZFS: inconsistent nvlist contents\n");
910                 return (EIO);
911         }
912
913         /*
914          * Re-evaluate top-level vdev state.
915          */
916         vdev_set_state(top_vdev);
917
918         /*
919          * Ok, we are happy with the pool so far. Lets find
920          * the best uberblock and then we can actually access
921          * the contents of the pool.
922          */
923         for (i = 0;
924              i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
925              i++) {
926                 off = offsetof(vdev_label_t, vl_uberblock);
927                 off += i << UBERBLOCK_SHIFT;
928                 BP_ZERO(&bp);
929                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
930                 BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
931                 BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
932                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
933                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
934                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
935                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
936                         continue;
937
938                 up = (const struct uberblock *) upbuf;
939                 if (up->ub_magic != UBERBLOCK_MAGIC)
940                         continue;
941                 if (up->ub_txg < spa->spa_txg)
942                         continue;
943                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
944                         spa->spa_uberblock = *up;
945                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
946                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
947                                 spa->spa_uberblock = *up;
948                 }
949         }
950
951         if (spap)
952                 *spap = spa;
953         return (0);
954 }
955
/*
 * Return the base-2 logarithm of n when n is an exact power of two,
 * or -1 when it is not (this includes zero).
 *
 * The comparison is done on unsigned values: probing bit 31 with a signed
 * `1 << 31` shifts into the sign bit, which is undefined behaviour in C.
 */
static int
ilog2(int n)
{
        unsigned int bits = (unsigned int)n;
        int v;

        for (v = 0; v < 32; v++) {
                if (bits == (1U << v))
                        return (v);
        }
        return (-1);
}
966
967 static int
968 zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
969 {
970         zio_gbh_phys_t zio_gb;
971         vdev_t *vdev;
972         int vdevid;
973         off_t offset;
974         int i;
975
976         vdevid = DVA_GET_VDEV(dva);
977         offset = DVA_GET_OFFSET(dva);
978         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
979                 if (vdev->v_id == vdevid)
980                         break;
981         if (!vdev || !vdev->v_read)
982                 return (EIO);
983         if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
984                 return (EIO);
985
986         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
987                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
988
989                 if (BP_IS_HOLE(gbp))
990                         continue;
991                 if (zio_read(spa, gbp, buf))
992                         return (EIO);
993                 buf = (char*)buf + BP_GET_PSIZE(gbp);
994         }
995  
996         return (0);
997 }
998
999 static int
1000 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
1001 {
1002         int cpfunc = BP_GET_COMPRESS(bp);
1003         size_t lsize = BP_GET_LSIZE(bp);
1004         size_t psize = BP_GET_PSIZE(bp);
1005         void *pbuf;
1006         int i;
1007
1008         zfs_reset_temp();
1009         if (cpfunc != ZIO_COMPRESS_OFF)
1010                 pbuf = zfs_alloc_temp(psize);
1011         else
1012                 pbuf = buf;
1013
1014         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1015                 const dva_t *dva = &bp->blk_dva[i];
1016                 vdev_t *vdev;
1017                 int vdevid;
1018                 off_t offset;
1019
1020                 if (!dva->dva_word[0] && !dva->dva_word[1])
1021                         continue;
1022
1023                 if (DVA_GET_GANG(dva)) {
1024                         if (zio_read_gang(spa, bp, dva, buf))
1025                                 continue;
1026                 } else {
1027                         vdevid = DVA_GET_VDEV(dva);
1028                         offset = DVA_GET_OFFSET(dva);
1029                         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
1030                                 if (vdev->v_id == vdevid)
1031                                         break;
1032                         if (!vdev || !vdev->v_read) {
1033                                 continue;
1034                         }
1035                         if (vdev->v_read(vdev, bp, pbuf, offset, psize))
1036                                 continue;
1037
1038                         if (cpfunc != ZIO_COMPRESS_OFF) {
1039                                 if (zio_decompress_data(cpfunc, pbuf, psize,
1040                                     buf, lsize))
1041                                         return (EIO);
1042                         }
1043                 }
1044
1045                 return (0);
1046         }
1047         printf("ZFS: i/o error - all block copies unavailable\n");
1048
1049         return (EIO);
1050 }
1051
1052 static int
1053 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1054 {
1055         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1056         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1057         int nlevels = dnode->dn_nlevels;
1058         int i, rc;
1059
1060         /*
1061          * Note: bsize may not be a power of two here so we need to do an
1062          * actual divide rather than a bitshift.
1063          */
1064         while (buflen > 0) {
1065                 uint64_t bn = offset / bsize;
1066                 int boff = offset % bsize;
1067                 int ibn;
1068                 const blkptr_t *indbp;
1069                 blkptr_t bp;
1070
1071                 if (bn > dnode->dn_maxblkid)
1072                         return (EIO);
1073
1074                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1075                         goto cached;
1076
1077                 indbp = dnode->dn_blkptr;
1078                 for (i = 0; i < nlevels; i++) {
1079                         /*
1080                          * Copy the bp from the indirect array so that
1081                          * we can re-use the scratch buffer for multi-level
1082                          * objects.
1083                          */
1084                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1085                         ibn &= ((1 << ibshift) - 1);
1086                         bp = indbp[ibn];
1087                         rc = zio_read(spa, &bp, dnode_cache_buf);
1088                         if (rc)
1089                                 return (rc);
1090                         indbp = (const blkptr_t *) dnode_cache_buf;
1091                 }
1092                 dnode_cache_obj = dnode;
1093                 dnode_cache_bn = bn;
1094         cached:
1095
1096                 /*
1097                  * The buffer contains our data block. Copy what we
1098                  * need from it and loop.
1099                  */ 
1100                 i = bsize - boff;
1101                 if (i > buflen) i = buflen;
1102                 memcpy(buf, &dnode_cache_buf[boff], i);
1103                 buf = ((char*) buf) + i;
1104                 offset += i;
1105                 buflen -= i;
1106         }
1107
1108         return (0);
1109 }
1110
1111 /*
1112  * Lookup a value in a microzap directory. Assumes that the zap
1113  * scratch buffer contains the directory contents.
1114  */
1115 static int
1116 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1117 {
1118         const mzap_phys_t *mz;
1119         const mzap_ent_phys_t *mze;
1120         size_t size;
1121         int chunks, i;
1122
1123         /*
1124          * Microzap objects use exactly one block. Read the whole
1125          * thing.
1126          */
1127         size = dnode->dn_datablkszsec * 512;
1128
1129         mz = (const mzap_phys_t *) zap_scratch;
1130         chunks = size / MZAP_ENT_LEN - 1;
1131
1132         for (i = 0; i < chunks; i++) {
1133                 mze = &mz->mz_chunk[i];
1134                 if (!strcmp(mze->mze_name, name)) {
1135                         *value = mze->mze_value;
1136                         return (0);
1137                 }
1138         }
1139
1140         return (ENOENT);
1141 }
1142
1143 /*
1144  * Compare a name with a zap leaf entry. Return non-zero if the name
1145  * matches.
1146  */
1147 static int
1148 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1149 {
1150         size_t namelen;
1151         const zap_leaf_chunk_t *nc;
1152         const char *p;
1153
1154         namelen = zc->l_entry.le_name_length;
1155                         
1156         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1157         p = name;
1158         while (namelen > 0) {
1159                 size_t len;
1160                 len = namelen;
1161                 if (len > ZAP_LEAF_ARRAY_BYTES)
1162                         len = ZAP_LEAF_ARRAY_BYTES;
1163                 if (memcmp(p, nc->l_array.la_array, len))
1164                         return (0);
1165                 p += len;
1166                 namelen -= len;
1167                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1168         }
1169
1170         return 1;
1171 }
1172
1173 /*
1174  * Extract a uint64_t value from a zap leaf entry.
1175  */
1176 static uint64_t
1177 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1178 {
1179         const zap_leaf_chunk_t *vc;
1180         int i;
1181         uint64_t value;
1182         const uint8_t *p;
1183
1184         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1185         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1186                 value = (value << 8) | p[i];
1187         }
1188
1189         return value;
1190 }
1191
1192 /*
1193  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1194  * buffer contains the directory header.
1195  */
1196 static int
1197 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1198 {
1199         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1200         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1201         fat_zap_t z;
1202         uint64_t *ptrtbl;
1203         uint64_t hash;
1204         int rc;
1205
1206         if (zh.zap_magic != ZAP_MAGIC)
1207                 return (EIO);
1208
1209         z.zap_block_shift = ilog2(bsize);
1210         z.zap_phys = (zap_phys_t *) zap_scratch;
1211
1212         /*
1213          * Figure out where the pointer table is and read it in if necessary.
1214          */
1215         if (zh.zap_ptrtbl.zt_blk) {
1216                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1217                                zap_scratch, bsize);
1218                 if (rc)
1219                         return (rc);
1220                 ptrtbl = (uint64_t *) zap_scratch;
1221         } else {
1222                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1223         }
1224
1225         hash = zap_hash(zh.zap_salt, name);
1226
1227         zap_leaf_t zl;
1228         zl.l_bs = z.zap_block_shift;
1229
1230         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1231         zap_leaf_chunk_t *zc;
1232
1233         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1234         if (rc)
1235                 return (rc);
1236
1237         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1238
1239         /*
1240          * Make sure this chunk matches our hash.
1241          */
1242         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1243             && zl.l_phys->l_hdr.lh_prefix
1244             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1245                 return (ENOENT);
1246
1247         /*
1248          * Hash within the chunk to find our entry.
1249          */
1250         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1251         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1252         h = zl.l_phys->l_hash[h];
1253         if (h == 0xffff)
1254                 return (ENOENT);
1255         zc = &ZAP_LEAF_CHUNK(&zl, h);
1256         while (zc->l_entry.le_hash != hash) {
1257                 if (zc->l_entry.le_next == 0xffff) {
1258                         zc = 0;
1259                         break;
1260                 }
1261                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1262         }
1263         if (fzap_name_equal(&zl, zc, name)) {
1264                 *value = fzap_leaf_value(&zl, zc);
1265                 return (0);
1266         }
1267
1268         return (ENOENT);
1269 }
1270
1271 /*
1272  * Lookup a name in a zap object and return its value as a uint64_t.
1273  */
1274 static int
1275 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1276 {
1277         int rc;
1278         uint64_t zap_type;
1279         size_t size = dnode->dn_datablkszsec * 512;
1280
1281         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1282         if (rc)
1283                 return (rc);
1284
1285         zap_type = *(uint64_t *) zap_scratch;
1286         if (zap_type == ZBT_MICRO)
1287                 return mzap_lookup(spa, dnode, name, value);
1288         else
1289                 return fzap_lookup(spa, dnode, name, value);
1290 }
1291
1292 #ifdef BOOT2
1293
1294 /*
1295  * List a microzap directory. Assumes that the zap scratch buffer contains
1296  * the directory contents.
1297  */
1298 static int
1299 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
1300 {
1301         const mzap_phys_t *mz;
1302         const mzap_ent_phys_t *mze;
1303         size_t size;
1304         int chunks, i;
1305
1306         /*
1307          * Microzap objects use exactly one block. Read the whole
1308          * thing.
1309          */
1310         size = dnode->dn_datablkszsec * 512;
1311         mz = (const mzap_phys_t *) zap_scratch;
1312         chunks = size / MZAP_ENT_LEN - 1;
1313
1314         for (i = 0; i < chunks; i++) {
1315                 mze = &mz->mz_chunk[i];
1316                 if (mze->mze_name[0])
1317                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
1318                         printf("%s\n", mze->mze_name);
1319         }
1320
1321         return (0);
1322 }
1323
1324 /*
1325  * List a fatzap directory. Assumes that the zap scratch buffer contains
1326  * the directory header.
1327  */
1328 static int
1329 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
1330 {
1331         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1332         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1333         fat_zap_t z;
1334         int i, j;
1335
1336         if (zh.zap_magic != ZAP_MAGIC)
1337                 return (EIO);
1338
1339         z.zap_block_shift = ilog2(bsize);
1340         z.zap_phys = (zap_phys_t *) zap_scratch;
1341
1342         /*
1343          * This assumes that the leaf blocks start at block 1. The
1344          * documentation isn't exactly clear on this.
1345          */
1346         zap_leaf_t zl;
1347         zl.l_bs = z.zap_block_shift;
1348         for (i = 0; i < zh.zap_num_leafs; i++) {
1349                 off_t off = (i + 1) << zl.l_bs;
1350                 char name[256], *p;
1351                 uint64_t value;
1352
1353                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1354                         return (EIO);
1355
1356                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1357
1358                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1359                         zap_leaf_chunk_t *zc, *nc;
1360                         int namelen;
1361
1362                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1363                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1364                                 continue;
1365                         namelen = zc->l_entry.le_name_length;
1366                         if (namelen > sizeof(name))
1367                                 namelen = sizeof(name);
1368                         
1369                         /*
1370                          * Paste the name back together.
1371                          */
1372                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1373                         p = name;
1374                         while (namelen > 0) {
1375                                 int len;
1376                                 len = namelen;
1377                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1378                                         len = ZAP_LEAF_ARRAY_BYTES;
1379                                 memcpy(p, nc->l_array.la_array, len);
1380                                 p += len;
1381                                 namelen -= len;
1382                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1383                         }
1384
1385                         /*
1386                          * Assume the first eight bytes of the value are
1387                          * a uint64_t.
1388                          */
1389                         value = fzap_leaf_value(&zl, zc);
1390
1391                         printf("%s 0x%llx\n", name, value);
1392                 }
1393         }
1394
1395         return (0);
1396 }
1397
1398 /*
1399  * List a zap directory.
1400  */
1401 static int
1402 zap_list(spa_t *spa, const dnode_phys_t *dnode)
1403 {
1404         uint64_t zap_type;
1405         size_t size = dnode->dn_datablkszsec * 512;
1406
1407         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1408                 return (EIO);
1409
1410         zap_type = *(uint64_t *) zap_scratch;
1411         if (zap_type == ZBT_MICRO)
1412                 return mzap_list(spa, dnode);
1413         else
1414                 return fzap_list(spa, dnode);
1415 }
1416
1417 #endif
1418
1419 static int
1420 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1421 {
1422         off_t offset;
1423
1424         offset = objnum * sizeof(dnode_phys_t);
1425         return dnode_read(spa, &os->os_meta_dnode, offset,
1426                 dnode, sizeof(dnode_phys_t));
1427 }
1428
1429 /*
1430  * Find the object set given the object number of its dataset object
1431  * and return its details in *objset
1432  */
1433 static int
1434 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1435 {
1436         dnode_phys_t dataset;
1437         dsl_dataset_phys_t *ds;
1438
1439         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1440                 printf("ZFS: can't find dataset %llu\n", objnum);
1441                 return (EIO);
1442         }
1443
1444         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1445         if (zio_read(spa, &ds->ds_bp, objset)) {
1446                 printf("ZFS: can't read object set for dataset %llu\n", objnum);
1447                 return (EIO);
1448         }
1449
1450         return (0);
1451 }
1452
1453 /*
1454  * Find the object set pointed to by the BOOTFS property or the root
1455  * dataset if there is none and return its details in *objset
1456  */
1457 static int
1458 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
1459 {
1460         dnode_phys_t dir, propdir;
1461         uint64_t props, bootfs, root;
1462
1463         /*
1464          * Start with the MOS directory object.
1465          */
1466         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1467                 printf("ZFS: can't read MOS object directory\n");
1468                 return (EIO);
1469         }
1470
1471         /*
1472          * Lookup the pool_props and see if we can find a bootfs.
1473          */
1474         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1475              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1476              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1477              && bootfs != 0)
1478                 return zfs_mount_dataset(spa, bootfs, objset);
1479
1480         /*
1481          * Lookup the root dataset directory
1482          */
1483         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1484             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1485                 printf("ZFS: can't find root dsl_dir\n");
1486                 return (EIO);
1487         }
1488
1489         /*
1490          * Use the information from the dataset directory's bonus buffer
1491          * to find the dataset object and from that the object set itself.
1492          */
1493         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1494         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
1495 }
1496
1497 static int
1498 zfs_mount_pool(spa_t *spa)
1499 {
1500         /*
1501          * Find the MOS and work our way in from there.
1502          */
1503         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1504                 printf("ZFS: can't read MOS\n");
1505                 return (EIO);
1506         }
1507
1508         /*
1509          * Find the root object set
1510          */
1511         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
1512                 printf("Can't find root filesystem - giving up\n");
1513                 return (EIO);
1514         }
1515
1516         return (0);
1517 }
1518
1519 /*
1520  * Lookup a file and return its dnode.
1521  */
1522 static int
1523 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
1524 {
1525         int rc;
1526         uint64_t objnum, rootnum, parentnum;
1527         dnode_phys_t dn;
1528         const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
1529         const char *p, *q;
1530         char element[256];
1531         char path[1024];
1532         int symlinks_followed = 0;
1533
1534         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
1535                 printf("ZFS: unexpected object set type %llu\n",
1536                        spa->spa_root_objset.os_type);
1537                 return (EIO);
1538         }
1539
1540         /*
1541          * Get the root directory dnode.
1542          */
1543         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
1544         if (rc)
1545                 return (rc);
1546
1547         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
1548         if (rc)
1549                 return (rc);
1550
1551         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
1552         if (rc)
1553                 return (rc);
1554
1555         objnum = rootnum;
1556         p = upath;
1557         while (p && *p) {
1558                 while (*p == '/')
1559                         p++;
1560                 if (!*p)
1561                         break;
1562                 q = strchr(p, '/');
1563                 if (q) {
1564                         memcpy(element, p, q - p);
1565                         element[q - p] = 0;
1566                         p = q;
1567                 } else {
1568                         strcpy(element, p);
1569                         p = 0;
1570                 }
1571
1572                 if ((zp->zp_mode >> 12) != 0x4) {
1573                         return (ENOTDIR);
1574                 }
1575
1576                 parentnum = objnum;
1577                 rc = zap_lookup(spa, &dn, element, &objnum);
1578                 if (rc)
1579                         return (rc);
1580                 objnum = ZFS_DIRENT_OBJ(objnum);
1581
1582                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1583                 if (rc)
1584                         return (rc);
1585
1586                 /*
1587                  * Check for symlink.
1588                  */
1589                 if ((zp->zp_mode >> 12) == 0xa) {
1590                         if (symlinks_followed > 10)
1591                                 return (EMLINK);
1592                         symlinks_followed++;
1593
1594                         /*
1595                          * Read the link value and copy the tail of our
1596                          * current path onto the end.
1597                          */
1598                         if (p)
1599                                 strcpy(&path[zp->zp_size], p);
1600                         else
1601                                 path[zp->zp_size] = 0;
1602                         if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
1603                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
1604                                         zp->zp_size);
1605                         } else {
1606                                 rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
1607                                 if (rc)
1608                                         return (rc);
1609                         }
1610
1611                         /*
1612                          * Restart with the new path, starting either at
1613                          * the root or at the parent depending whether or
1614                          * not the link is relative.
1615                          */
1616                         p = path;
1617                         if (*p == '/')
1618                                 objnum = rootnum;
1619                         else
1620                                 objnum = parentnum;
1621                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1622                 }
1623         }
1624
1625         *dnode = dn;
1626         return (0);
1627 }