]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/boot/zfs/zfsimpl.c
MFC r226549,r226550,r226551,r226552,r226553,r226554,r226568,r226569,r226611,
[FreeBSD/releng/9.0.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/stat.h>
35
36 #include "zfsimpl.h"
37 #include "zfssubr.c"
38
39 /*
40  * List of all vdevs, chained through v_alllink.
41  */
42 static vdev_list_t zfs_vdevs;
43
44 /*
45  * List of all pools, chained through spa_link.
46  */
47 static spa_list_t zfs_pools;
48
49 static uint64_t zfs_crc64_table[256];
50 static const dnode_phys_t *dnode_cache_obj = 0;
51 static uint64_t dnode_cache_bn;
52 static char *dnode_cache_buf;
53 static char *zap_scratch;
54 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
55
56 #define TEMP_SIZE       (1024 * 1024)
57
58 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
59
60 static void
61 zfs_init(void)
62 {
63         STAILQ_INIT(&zfs_vdevs);
64         STAILQ_INIT(&zfs_pools);
65
66         zfs_temp_buf = malloc(TEMP_SIZE);
67         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
68         zfs_temp_ptr = zfs_temp_buf;
69         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
70         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
71
72         zfs_init_crc();
73 }
74
75 static void *
76 zfs_alloc(size_t size)
77 {
78         char *ptr;
79
80         if (zfs_temp_ptr + size > zfs_temp_end) {
81                 printf("ZFS: out of temporary buffer space\n");
82                 for (;;) ;
83         }
84         ptr = zfs_temp_ptr;
85         zfs_temp_ptr += size;
86
87         return (ptr);
88 }
89
90 static void
91 zfs_free(void *ptr, size_t size)
92 {
93
94         zfs_temp_ptr -= size;
95         if (zfs_temp_ptr != ptr) {
96                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
97                 for (;;) ;
98         }
99 }
100
/*
 * Decode one big-endian 32-bit signed integer from an XDR stream.
 *
 * xdr - in/out cursor into the stream; advanced by 4 bytes.
 * ip  - receives the decoded value.
 *
 * Always returns 0.  No bounds checking is done; the caller must ensure
 * at least 4 bytes are available at *xdr.
 *
 * The bytes are assembled in an unsigned 32-bit value: shifting a
 * promoted (signed int) byte with its top bit set left by 24 would
 * overflow a signed int, which is undefined behaviour.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *src = *xdr;
	uint32_t word;

	word = ((uint32_t)src[0] << 24)
		| ((uint32_t)src[1] << 16)
		| ((uint32_t)src[2] << 8)
		| ((uint32_t)src[3] << 0);
	/* Reinterpret the 32-bit pattern as a signed value. */
	*ip = (int)word;
	(*xdr) += 4;
	return (0);
}
111
112 static int
113 xdr_u_int(const unsigned char **xdr, u_int *ip)
114 {
115         *ip = ((*xdr)[0] << 24)
116                 | ((*xdr)[1] << 16)
117                 | ((*xdr)[2] << 8)
118                 | ((*xdr)[3] << 0);
119         (*xdr) += 4;
120         return (0);
121 }
122
123 static int
124 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
125 {
126         u_int hi, lo;
127
128         xdr_u_int(xdr, &hi);
129         xdr_u_int(xdr, &lo);
130         *lp = (((uint64_t) hi) << 32) | lo;
131         return (0);
132 }
133
134 static int
135 nvlist_find(const unsigned char *nvlist, const char *name, int type,
136             int* elementsp, void *valuep)
137 {
138         const unsigned char *p, *pair;
139         int junk;
140         int encoded_size, decoded_size;
141
142         p = nvlist;
143         xdr_int(&p, &junk);
144         xdr_int(&p, &junk);
145
146         pair = p;
147         xdr_int(&p, &encoded_size);
148         xdr_int(&p, &decoded_size);
149         while (encoded_size && decoded_size) {
150                 int namelen, pairtype, elements;
151                 const char *pairname;
152
153                 xdr_int(&p, &namelen);
154                 pairname = (const char*) p;
155                 p += roundup(namelen, 4);
156                 xdr_int(&p, &pairtype);
157
158                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
159                         xdr_int(&p, &elements);
160                         if (elementsp)
161                                 *elementsp = elements;
162                         if (type == DATA_TYPE_UINT64) {
163                                 xdr_uint64_t(&p, (uint64_t *) valuep);
164                                 return (0);
165                         } else if (type == DATA_TYPE_STRING) {
166                                 int len;
167                                 xdr_int(&p, &len);
168                                 (*(const char**) valuep) = (const char*) p;
169                                 return (0);
170                         } else if (type == DATA_TYPE_NVLIST
171                                    || type == DATA_TYPE_NVLIST_ARRAY) {
172                                 (*(const unsigned char**) valuep) =
173                                          (const unsigned char*) p;
174                                 return (0);
175                         } else {
176                                 return (EIO);
177                         }
178                 } else {
179                         /*
180                          * Not the pair we are looking for, skip to the next one.
181                          */
182                         p = pair + encoded_size;
183                 }
184
185                 pair = p;
186                 xdr_int(&p, &encoded_size);
187                 xdr_int(&p, &decoded_size);
188         }
189
190         return (EIO);
191 }
192
/*
 * Step over one encoded nvlist and return the address just past it,
 * which for an nvlist array is where the next element begins.
 *
 * The two header words are skipped, then pairs are hopped over using
 * their encoded size until a pair header with a zero encoded or decoded
 * size is read; the returned pointer is positioned after that header.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *entry;
	int ignored;
	int enc_len, dec_len;

	cursor = nvlist;
	xdr_int(&cursor, &ignored);
	xdr_int(&cursor, &ignored);

	for (;;) {
		entry = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		cursor = entry + enc_len;
	}

	return (cursor);
}
220
221 #ifdef TEST
222
223 static const unsigned char *
224 nvlist_print(const unsigned char *nvlist, unsigned int indent)
225 {
226         static const char* typenames[] = {
227                 "DATA_TYPE_UNKNOWN",
228                 "DATA_TYPE_BOOLEAN",
229                 "DATA_TYPE_BYTE",
230                 "DATA_TYPE_INT16",
231                 "DATA_TYPE_UINT16",
232                 "DATA_TYPE_INT32",
233                 "DATA_TYPE_UINT32",
234                 "DATA_TYPE_INT64",
235                 "DATA_TYPE_UINT64",
236                 "DATA_TYPE_STRING",
237                 "DATA_TYPE_BYTE_ARRAY",
238                 "DATA_TYPE_INT16_ARRAY",
239                 "DATA_TYPE_UINT16_ARRAY",
240                 "DATA_TYPE_INT32_ARRAY",
241                 "DATA_TYPE_UINT32_ARRAY",
242                 "DATA_TYPE_INT64_ARRAY",
243                 "DATA_TYPE_UINT64_ARRAY",
244                 "DATA_TYPE_STRING_ARRAY",
245                 "DATA_TYPE_HRTIME",
246                 "DATA_TYPE_NVLIST",
247                 "DATA_TYPE_NVLIST_ARRAY",
248                 "DATA_TYPE_BOOLEAN_VALUE",
249                 "DATA_TYPE_INT8",
250                 "DATA_TYPE_UINT8",
251                 "DATA_TYPE_BOOLEAN_ARRAY",
252                 "DATA_TYPE_INT8_ARRAY",
253                 "DATA_TYPE_UINT8_ARRAY"
254         };
255
256         unsigned int i, j;
257         const unsigned char *p, *pair;
258         int junk;
259         int encoded_size, decoded_size;
260
261         p = nvlist;
262         xdr_int(&p, &junk);
263         xdr_int(&p, &junk);
264
265         pair = p;
266         xdr_int(&p, &encoded_size);
267         xdr_int(&p, &decoded_size);
268         while (encoded_size && decoded_size) {
269                 int namelen, pairtype, elements;
270                 const char *pairname;
271
272                 xdr_int(&p, &namelen);
273                 pairname = (const char*) p;
274                 p += roundup(namelen, 4);
275                 xdr_int(&p, &pairtype);
276
277                 for (i = 0; i < indent; i++)
278                         printf(" ");
279                 printf("%s %s", typenames[pairtype], pairname);
280
281                 xdr_int(&p, &elements);
282                 switch (pairtype) {
283                 case DATA_TYPE_UINT64: {
284                         uint64_t val;
285                         xdr_uint64_t(&p, &val);
286                         printf(" = 0x%llx\n", val);
287                         break;
288                 }
289
290                 case DATA_TYPE_STRING: {
291                         int len;
292                         xdr_int(&p, &len);
293                         printf(" = \"%s\"\n", p);
294                         break;
295                 }
296
297                 case DATA_TYPE_NVLIST:
298                         printf("\n");
299                         nvlist_print(p, indent + 1);
300                         break;
301
302                 case DATA_TYPE_NVLIST_ARRAY:
303                         for (j = 0; j < elements; j++) {
304                                 printf("[%d]\n", j);
305                                 p = nvlist_print(p, indent + 1);
306                                 if (j != elements - 1) {
307                                         for (i = 0; i < indent; i++)
308                                                 printf(" ");
309                                         printf("%s %s", typenames[pairtype], pairname);
310                                 }
311                         }
312                         break;
313
314                 default:
315                         printf("\n");
316                 }
317
318                 p = pair + encoded_size;
319
320                 pair = p;
321                 xdr_int(&p, &encoded_size);
322                 xdr_int(&p, &decoded_size);
323         }
324
325         return p;
326 }
327
328 #endif
329
330 static int
331 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
332     off_t offset, size_t size)
333 {
334         size_t psize;
335         int rc;
336
337         if (!vdev->v_phys_read)
338                 return (EIO);
339
340         if (bp) {
341                 psize = BP_GET_PSIZE(bp);
342         } else {
343                 psize = size;
344         }
345
346         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
347         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
348         if (rc)
349                 return (rc);
350         if (bp && zio_checksum_verify(bp, buf))
351                 return (EIO);
352
353         return (0);
354 }
355
356 static int
357 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
358     off_t offset, size_t bytes)
359 {
360
361         return (vdev_read_phys(vdev, bp, buf,
362                 offset + VDEV_LABEL_START_SIZE, bytes));
363 }
364
365
366 static int
367 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
368     off_t offset, size_t bytes)
369 {
370         vdev_t *kid;
371         int rc;
372
373         rc = EIO;
374         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
375                 if (kid->v_state != VDEV_STATE_HEALTHY)
376                         continue;
377                 rc = kid->v_read(kid, bp, buf, offset, bytes);
378                 if (!rc)
379                         return (0);
380         }
381
382         return (rc);
383 }
384
385 static int
386 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
387     off_t offset, size_t bytes)
388 {
389         vdev_t *kid;
390
391         /*
392          * Here we should have two kids:
393          * First one which is the one we are replacing and we can trust
394          * only this one to have valid data, but it might not be present.
395          * Second one is that one we are replacing with. It is most likely
396          * healthy, but we can't trust it has needed data, so we won't use it.
397          */
398         kid = STAILQ_FIRST(&vdev->v_children);
399         if (kid == NULL)
400                 return (EIO);
401         if (kid->v_state != VDEV_STATE_HEALTHY)
402                 return (EIO);
403         return (kid->v_read(kid, bp, buf, offset, bytes));
404 }
405
406 static vdev_t *
407 vdev_find(uint64_t guid)
408 {
409         vdev_t *vdev;
410
411         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
412                 if (vdev->v_guid == guid)
413                         return (vdev);
414
415         return (0);
416 }
417
418 static vdev_t *
419 vdev_create(uint64_t guid, vdev_read_t *read)
420 {
421         vdev_t *vdev;
422
423         vdev = malloc(sizeof(vdev_t));
424         memset(vdev, 0, sizeof(vdev_t));
425         STAILQ_INIT(&vdev->v_children);
426         vdev->v_guid = guid;
427         vdev->v_state = VDEV_STATE_OFFLINE;
428         vdev->v_read = read;
429         vdev->v_phys_read = 0;
430         vdev->v_read_priv = 0;
431         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
432
433         return (vdev);
434 }
435
436 static int
437 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
438     vdev_t **vdevp, int is_newer)
439 {
440         int rc;
441         uint64_t guid, id, ashift, nparity;
442         const char *type;
443         const char *path;
444         vdev_t *vdev, *kid;
445         const unsigned char *kids;
446         int nkids, i, is_new;
447         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
448
449         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
450                         DATA_TYPE_UINT64, 0, &guid)
451             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
452                            DATA_TYPE_UINT64, 0, &id)
453             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
454                            DATA_TYPE_STRING, 0, &type)) {
455                 printf("ZFS: can't find vdev details\n");
456                 return (ENOENT);
457         }
458
459         if (strcmp(type, VDEV_TYPE_MIRROR)
460             && strcmp(type, VDEV_TYPE_DISK)
461             && strcmp(type, VDEV_TYPE_RAIDZ)
462             && strcmp(type, VDEV_TYPE_REPLACING)) {
463                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
464                 return (EIO);
465         }
466
467         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
468
469         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
470                         &is_offline);
471         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
472                         &is_removed);
473         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
474                         &is_faulted);
475         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
476                         &is_degraded);
477         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
478                         &isnt_present);
479
480         vdev = vdev_find(guid);
481         if (!vdev) {
482                 is_new = 1;
483
484                 if (!strcmp(type, VDEV_TYPE_MIRROR))
485                         vdev = vdev_create(guid, vdev_mirror_read);
486                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
487                         vdev = vdev_create(guid, vdev_raidz_read);
488                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
489                         vdev = vdev_create(guid, vdev_replacing_read);
490                 else
491                         vdev = vdev_create(guid, vdev_disk_read);
492
493                 vdev->v_id = id;
494                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
495                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
496                         DATA_TYPE_UINT64, 0, &ashift) == 0)
497                         vdev->v_ashift = ashift;
498                 else
499                         vdev->v_ashift = 0;
500                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
501                         DATA_TYPE_UINT64, 0, &nparity) == 0)
502                         vdev->v_nparity = nparity;
503                 else
504                         vdev->v_nparity = 0;
505                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
506                                 DATA_TYPE_STRING, 0, &path) == 0) {
507                         if (strncmp(path, "/dev/", 5) == 0)
508                                 path += 5;
509                         vdev->v_name = strdup(path);
510                 } else {
511                         if (!strcmp(type, "raidz")) {
512                                 if (vdev->v_nparity == 1)
513                                         vdev->v_name = "raidz1";
514                                 else if (vdev->v_nparity == 2)
515                                         vdev->v_name = "raidz2";
516                                 else if (vdev->v_nparity == 3)
517                                         vdev->v_name = "raidz3";
518                                 else {
519                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
520                                         return (EIO);
521                                 }
522                         } else {
523                                 vdev->v_name = strdup(type);
524                         }
525                 }
526         } else {
527                 is_new = 0;
528         }
529
530         if (is_new || is_newer) {
531                 /*
532                  * This is either new vdev or we've already seen this vdev,
533                  * but from an older vdev label, so let's refresh its state
534                  * from the newer label.
535                  */
536                 if (is_offline)
537                         vdev->v_state = VDEV_STATE_OFFLINE;
538                 else if (is_removed)
539                         vdev->v_state = VDEV_STATE_REMOVED;
540                 else if (is_faulted)
541                         vdev->v_state = VDEV_STATE_FAULTED;
542                 else if (is_degraded)
543                         vdev->v_state = VDEV_STATE_DEGRADED;
544                 else if (isnt_present)
545                         vdev->v_state = VDEV_STATE_CANT_OPEN;
546         }
547
548         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
549                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
550         /*
551          * Its ok if we don't have any kids.
552          */
553         if (rc == 0) {
554                 vdev->v_nchildren = nkids;
555                 for (i = 0; i < nkids; i++) {
556                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
557                         if (rc)
558                                 return (rc);
559                         if (is_new)
560                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
561                                                    v_childlink);
562                         kids = nvlist_next(kids);
563                 }
564         } else {
565                 vdev->v_nchildren = 0;
566         }
567
568         if (vdevp)
569                 *vdevp = vdev;
570         return (0);
571 }
572
573 static void
574 vdev_set_state(vdev_t *vdev)
575 {
576         vdev_t *kid;
577         int good_kids;
578         int bad_kids;
579
580         /*
581          * A mirror or raidz is healthy if all its kids are healthy. A
582          * mirror is degraded if any of its kids is healthy; a raidz
583          * is degraded if at most nparity kids are offline.
584          */
585         if (STAILQ_FIRST(&vdev->v_children)) {
586                 good_kids = 0;
587                 bad_kids = 0;
588                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
589                         if (kid->v_state == VDEV_STATE_HEALTHY)
590                                 good_kids++;
591                         else
592                                 bad_kids++;
593                 }
594                 if (bad_kids == 0) {
595                         vdev->v_state = VDEV_STATE_HEALTHY;
596                 } else {
597                         if (vdev->v_read == vdev_mirror_read) {
598                                 if (good_kids) {
599                                         vdev->v_state = VDEV_STATE_DEGRADED;
600                                 } else {
601                                         vdev->v_state = VDEV_STATE_OFFLINE;
602                                 }
603                         } else if (vdev->v_read == vdev_raidz_read) {
604                                 if (bad_kids > vdev->v_nparity) {
605                                         vdev->v_state = VDEV_STATE_OFFLINE;
606                                 } else {
607                                         vdev->v_state = VDEV_STATE_DEGRADED;
608                                 }
609                         }
610                 }
611         }
612 }
613
614 static spa_t *
615 spa_find_by_guid(uint64_t guid)
616 {
617         spa_t *spa;
618
619         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
620                 if (spa->spa_guid == guid)
621                         return (spa);
622
623         return (0);
624 }
625
626 #ifdef BOOT2
627
628 static spa_t *
629 spa_find_by_name(const char *name)
630 {
631         spa_t *spa;
632
633         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
634                 if (!strcmp(spa->spa_name, name))
635                         return (spa);
636
637         return (0);
638 }
639
640 #endif
641
642 static spa_t *
643 spa_create(uint64_t guid)
644 {
645         spa_t *spa;
646
647         spa = malloc(sizeof(spa_t));
648         memset(spa, 0, sizeof(spa_t));
649         STAILQ_INIT(&spa->spa_vdevs);
650         spa->spa_guid = guid;
651         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
652
653         return (spa);
654 }
655
656 static const char *
657 state_name(vdev_state_t state)
658 {
659         static const char* names[] = {
660                 "UNKNOWN",
661                 "CLOSED",
662                 "OFFLINE",
663                 "REMOVED",
664                 "CANT_OPEN",
665                 "FAULTED",
666                 "DEGRADED",
667                 "ONLINE"
668         };
669         return names[state];
670 }
671
672 #ifdef BOOT2
673
674 #define pager_printf printf
675
676 #else
677
/*
 * printf()-style front end to the pager: the message is formatted into
 * a fixed-size line buffer and handed to pager_output().
 *
 * Formatting is bounded by the buffer size, so a message longer than
 * the buffer is truncated instead of being written past its end;
 * print_state() can pass strings far longer than 80 bytes.
 * NOTE(review): relies on vsnprintf() being provided by the boot
 * library this file is linked against -- confirm for this branch.
 */
static void
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	pager_output(line);
}
689
690 #endif
691
692 #define STATUS_FORMAT   "        %s %s\n"
693
694 static void
695 print_state(int indent, const char *name, vdev_state_t state)
696 {
697         int i;
698         char buf[512];
699
700         buf[0] = 0;
701         for (i = 0; i < indent; i++)
702                 strcat(buf, "  ");
703         strcat(buf, name);
704         pager_printf(STATUS_FORMAT, buf, state_name(state));
705         
706 }
707
708 static void
709 vdev_status(vdev_t *vdev, int indent)
710 {
711         vdev_t *kid;
712         print_state(indent, vdev->v_name, vdev->v_state);
713
714         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
715                 vdev_status(kid, indent + 1);
716         }
717 }
718
719 static void
720 spa_status(spa_t *spa)
721 {
722         vdev_t *vdev;
723         int good_kids, bad_kids, degraded_kids;
724         vdev_state_t state;
725
726         pager_printf("  pool: %s\n", spa->spa_name);
727         pager_printf("config:\n\n");
728         pager_printf(STATUS_FORMAT, "NAME", "STATE");
729
730         good_kids = 0;
731         degraded_kids = 0;
732         bad_kids = 0;
733         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
734                 if (vdev->v_state == VDEV_STATE_HEALTHY)
735                         good_kids++;
736                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
737                         degraded_kids++;
738                 else
739                         bad_kids++;
740         }
741
742         state = VDEV_STATE_CLOSED;
743         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
744                 state = VDEV_STATE_HEALTHY;
745         else if ((good_kids + degraded_kids) > 0)
746                 state = VDEV_STATE_DEGRADED;
747
748         print_state(0, spa->spa_name, state);
749         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
750                 vdev_status(vdev, 1);
751         }
752 }
753
754 static void
755 spa_all_status(void)
756 {
757         spa_t *spa;
758         int first = 1;
759
760         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
761                 if (!first)
762                         pager_printf("\n");
763                 first = 0;
764                 spa_status(spa);
765         }
766 }
767
768 static int
769 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
770 {
771         vdev_t vtmp;
772         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
773         spa_t *spa;
774         vdev_t *vdev, *top_vdev, *pool_vdev;
775         off_t off;
776         blkptr_t bp;
777         const unsigned char *nvlist;
778         uint64_t val;
779         uint64_t guid;
780         uint64_t pool_txg, pool_guid;
781         uint64_t is_log;
782         const char *pool_name;
783         const unsigned char *vdevs;
784         int i, rc, is_newer;
785         char *upbuf;
786         const struct uberblock *up;
787
788         /*
789          * Load the vdev label and figure out which
790          * uberblock is most current.
791          */
792         memset(&vtmp, 0, sizeof(vtmp));
793         vtmp.v_phys_read = read;
794         vtmp.v_read_priv = read_priv;
795         off = offsetof(vdev_label_t, vl_vdev_phys);
796         BP_ZERO(&bp);
797         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
798         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
799         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
800         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
801         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
802         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
803         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
804                 return (EIO);
805
806         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
807                 return (EIO);
808         }
809
810         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
811
812         if (nvlist_find(nvlist,
813                         ZPOOL_CONFIG_VERSION,
814                         DATA_TYPE_UINT64, 0, &val)) {
815                 return (EIO);
816         }
817
818         if (val > SPA_VERSION) {
819                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
820                     (unsigned) val, (unsigned) SPA_VERSION);
821                 return (EIO);
822         }
823
824         if (nvlist_find(nvlist,
825                         ZPOOL_CONFIG_POOL_STATE,
826                         DATA_TYPE_UINT64, 0, &val)) {
827                 return (EIO);
828         }
829
830         if (val == POOL_STATE_DESTROYED) {
831                 /* We don't boot only from destroyed pools. */
832                 return (EIO);
833         }
834
835         if (nvlist_find(nvlist,
836                         ZPOOL_CONFIG_POOL_TXG,
837                         DATA_TYPE_UINT64, 0, &pool_txg)
838             || nvlist_find(nvlist,
839                            ZPOOL_CONFIG_POOL_GUID,
840                            DATA_TYPE_UINT64, 0, &pool_guid)
841             || nvlist_find(nvlist,
842                            ZPOOL_CONFIG_POOL_NAME,
843                            DATA_TYPE_STRING, 0, &pool_name)) {
844                 /*
845                  * Cache and spare devices end up here - just ignore
846                  * them.
847                  */
848                 /*printf("ZFS: can't find pool details\n");*/
849                 return (EIO);
850         }
851
852         is_log = 0;
853         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
854             &is_log);
855         if (is_log)
856                 return (EIO);
857
858         /*
859          * Create the pool if this is the first time we've seen it.
860          */
861         spa = spa_find_by_guid(pool_guid);
862         if (!spa) {
863                 spa = spa_create(pool_guid);
864                 spa->spa_name = strdup(pool_name);
865         }
866         if (pool_txg > spa->spa_txg) {
867                 spa->spa_txg = pool_txg;
868                 is_newer = 1;
869         } else
870                 is_newer = 0;
871
872         /*
873          * Get the vdev tree and create our in-core copy of it.
874          * If we already have a vdev with this guid, this must
875          * be some kind of alias (overlapping slices, dangerously dedicated
876          * disks etc).
877          */
878         if (nvlist_find(nvlist,
879                         ZPOOL_CONFIG_GUID,
880                         DATA_TYPE_UINT64, 0, &guid)) {
881                 return (EIO);
882         }
883         vdev = vdev_find(guid);
884         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
885                 return (EIO);
886
887         if (nvlist_find(nvlist,
888                         ZPOOL_CONFIG_VDEV_TREE,
889                         DATA_TYPE_NVLIST, 0, &vdevs)) {
890                 return (EIO);
891         }
892
893         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
894         if (rc)
895                 return (rc);
896
897         /*
898          * Add the toplevel vdev to the pool if its not already there.
899          */
900         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
901                 if (top_vdev == pool_vdev)
902                         break;
903         if (!pool_vdev && top_vdev)
904                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
905
906         /*
907          * We should already have created an incomplete vdev for this
908          * vdev. Find it and initialise it with our read proc.
909          */
910         vdev = vdev_find(guid);
911         if (vdev) {
912                 vdev->v_phys_read = read;
913                 vdev->v_read_priv = read_priv;
914                 vdev->v_state = VDEV_STATE_HEALTHY;
915         } else {
916                 printf("ZFS: inconsistent nvlist contents\n");
917                 return (EIO);
918         }
919
920         /*
921          * Re-evaluate top-level vdev state.
922          */
923         vdev_set_state(top_vdev);
924
925         /*
926          * Ok, we are happy with the pool so far. Lets find
927          * the best uberblock and then we can actually access
928          * the contents of the pool.
929          */
930         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
931         up = (const struct uberblock *)upbuf;
932         for (i = 0;
933              i < VDEV_UBERBLOCK_COUNT(vdev);
934              i++) {
935                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
936                 BP_ZERO(&bp);
937                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
938                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
939                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
940                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
941                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
942                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
943
944                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
945                         continue;
946
947                 if (up->ub_magic != UBERBLOCK_MAGIC)
948                         continue;
949                 if (up->ub_txg < spa->spa_txg)
950                         continue;
951                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
952                         spa->spa_uberblock = *up;
953                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
954                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
955                                 spa->spa_uberblock = *up;
956                 }
957         }
958         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
959
960         if (spap)
961                 *spap = spa;
962         return (0);
963 }
964
/*
 * Return log2(n) when n has exactly one of its low 32 bits set,
 * and -1 otherwise (zero and values with several bits set).
 *
 * The comparison is carried out in unsigned arithmetic: the original
 * "1 << v" shifted a signed int into its sign bit for v == 31, which is
 * undefined behaviour.  Results are unchanged for every input.
 */
static int
ilog2(int n)
{
	unsigned int bits = (unsigned int)n;
	int v;

	for (v = 0; v < 32; v++) {
		if (bits == (1U << v))
			return (v);
	}
	return (-1);
}
975
976 static int
977 zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
978 {
979         blkptr_t gbh_bp;
980         zio_gbh_phys_t zio_gb;
981         char *pbuf;
982         int i;
983
984         /* Artificial BP for gang block header. */
985         gbh_bp = *bp;
986         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
987         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
988         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
989         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
990         for (i = 0; i < SPA_DVAS_PER_BP; i++)
991                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
992
993         /* Read gang header block using the artificial BP. */
994         if (zio_read(spa, &gbh_bp, &zio_gb))
995                 return (EIO);
996
997         pbuf = buf;
998         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
999                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1000
1001                 if (BP_IS_HOLE(gbp))
1002                         continue;
1003                 if (zio_read(spa, gbp, pbuf))
1004                         return (EIO);
1005                 pbuf += BP_GET_PSIZE(gbp);
1006         }
1007
1008         if (zio_checksum_verify(bp, buf))
1009                 return (EIO);
1010         return (0);
1011 }
1012
1013 static int
1014 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
1015 {
1016         int cpfunc = BP_GET_COMPRESS(bp);
1017         uint64_t align, size;
1018         void *pbuf;
1019         int i, error;
1020
1021         error = EIO;
1022
1023         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1024                 const dva_t *dva = &bp->blk_dva[i];
1025                 vdev_t *vdev;
1026                 int vdevid;
1027                 off_t offset;
1028
1029                 if (!dva->dva_word[0] && !dva->dva_word[1])
1030                         continue;
1031
1032                 vdevid = DVA_GET_VDEV(dva);
1033                 offset = DVA_GET_OFFSET(dva);
1034                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1035                         if (vdev->v_id == vdevid)
1036                                 break;
1037                 }
1038                 if (!vdev || !vdev->v_read)
1039                         continue;
1040
1041                 size = BP_GET_PSIZE(bp);
1042                 if (vdev->v_read == vdev_raidz_read) {
1043                         align = 1ULL << vdev->v_top->v_ashift;
1044                         if (P2PHASE(size, align) != 0)
1045                                 size = P2ROUNDUP(size, align);
1046                 }
1047                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1048                         pbuf = zfs_alloc(size);
1049                 else
1050                         pbuf = buf;
1051
1052                 if (DVA_GET_GANG(dva))
1053                         error = zio_read_gang(spa, bp, pbuf);
1054                 else
1055                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1056                 if (error == 0) {
1057                         if (cpfunc != ZIO_COMPRESS_OFF)
1058                                 error = zio_decompress_data(cpfunc, pbuf,
1059                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1060                         else if (size != BP_GET_PSIZE(bp))
1061                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1062                 }
1063                 if (buf != pbuf)
1064                         zfs_free(pbuf, size);
1065                 if (error == 0)
1066                         break;
1067         }
1068         if (error != 0)
1069                 printf("ZFS: i/o error - all block copies unavailable\n");
1070         return (error);
1071 }
1072
1073 static int
1074 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1075 {
1076         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1077         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1078         int nlevels = dnode->dn_nlevels;
1079         int i, rc;
1080
1081         /*
1082          * Note: bsize may not be a power of two here so we need to do an
1083          * actual divide rather than a bitshift.
1084          */
1085         while (buflen > 0) {
1086                 uint64_t bn = offset / bsize;
1087                 int boff = offset % bsize;
1088                 int ibn;
1089                 const blkptr_t *indbp;
1090                 blkptr_t bp;
1091
1092                 if (bn > dnode->dn_maxblkid)
1093                         return (EIO);
1094
1095                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1096                         goto cached;
1097
1098                 indbp = dnode->dn_blkptr;
1099                 for (i = 0; i < nlevels; i++) {
1100                         /*
1101                          * Copy the bp from the indirect array so that
1102                          * we can re-use the scratch buffer for multi-level
1103                          * objects.
1104                          */
1105                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1106                         ibn &= ((1 << ibshift) - 1);
1107                         bp = indbp[ibn];
1108                         rc = zio_read(spa, &bp, dnode_cache_buf);
1109                         if (rc)
1110                                 return (rc);
1111                         indbp = (const blkptr_t *) dnode_cache_buf;
1112                 }
1113                 dnode_cache_obj = dnode;
1114                 dnode_cache_bn = bn;
1115         cached:
1116
1117                 /*
1118                  * The buffer contains our data block. Copy what we
1119                  * need from it and loop.
1120                  */ 
1121                 i = bsize - boff;
1122                 if (i > buflen) i = buflen;
1123                 memcpy(buf, &dnode_cache_buf[boff], i);
1124                 buf = ((char*) buf) + i;
1125                 offset += i;
1126                 buflen -= i;
1127         }
1128
1129         return (0);
1130 }
1131
1132 /*
1133  * Lookup a value in a microzap directory. Assumes that the zap
1134  * scratch buffer contains the directory contents.
1135  */
1136 static int
1137 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1138 {
1139         const mzap_phys_t *mz;
1140         const mzap_ent_phys_t *mze;
1141         size_t size;
1142         int chunks, i;
1143
1144         /*
1145          * Microzap objects use exactly one block. Read the whole
1146          * thing.
1147          */
1148         size = dnode->dn_datablkszsec * 512;
1149
1150         mz = (const mzap_phys_t *) zap_scratch;
1151         chunks = size / MZAP_ENT_LEN - 1;
1152
1153         for (i = 0; i < chunks; i++) {
1154                 mze = &mz->mz_chunk[i];
1155                 if (!strcmp(mze->mze_name, name)) {
1156                         *value = mze->mze_value;
1157                         return (0);
1158                 }
1159         }
1160
1161         return (ENOENT);
1162 }
1163
1164 /*
1165  * Compare a name with a zap leaf entry. Return non-zero if the name
1166  * matches.
1167  */
1168 static int
1169 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1170 {
1171         size_t namelen;
1172         const zap_leaf_chunk_t *nc;
1173         const char *p;
1174
1175         namelen = zc->l_entry.le_name_length;
1176                         
1177         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1178         p = name;
1179         while (namelen > 0) {
1180                 size_t len;
1181                 len = namelen;
1182                 if (len > ZAP_LEAF_ARRAY_BYTES)
1183                         len = ZAP_LEAF_ARRAY_BYTES;
1184                 if (memcmp(p, nc->l_array.la_array, len))
1185                         return (0);
1186                 p += len;
1187                 namelen -= len;
1188                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1189         }
1190
1191         return 1;
1192 }
1193
1194 /*
1195  * Extract a uint64_t value from a zap leaf entry.
1196  */
1197 static uint64_t
1198 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1199 {
1200         const zap_leaf_chunk_t *vc;
1201         int i;
1202         uint64_t value;
1203         const uint8_t *p;
1204
1205         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1206         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1207                 value = (value << 8) | p[i];
1208         }
1209
1210         return value;
1211 }
1212
1213 /*
1214  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1215  * buffer contains the directory header.
1216  */
1217 static int
1218 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1219 {
1220         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1221         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1222         fat_zap_t z;
1223         uint64_t *ptrtbl;
1224         uint64_t hash;
1225         int rc;
1226
1227         if (zh.zap_magic != ZAP_MAGIC)
1228                 return (EIO);
1229
1230         z.zap_block_shift = ilog2(bsize);
1231         z.zap_phys = (zap_phys_t *) zap_scratch;
1232
1233         /*
1234          * Figure out where the pointer table is and read it in if necessary.
1235          */
1236         if (zh.zap_ptrtbl.zt_blk) {
1237                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1238                                zap_scratch, bsize);
1239                 if (rc)
1240                         return (rc);
1241                 ptrtbl = (uint64_t *) zap_scratch;
1242         } else {
1243                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1244         }
1245
1246         hash = zap_hash(zh.zap_salt, name);
1247
1248         zap_leaf_t zl;
1249         zl.l_bs = z.zap_block_shift;
1250
1251         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1252         zap_leaf_chunk_t *zc;
1253
1254         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1255         if (rc)
1256                 return (rc);
1257
1258         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1259
1260         /*
1261          * Make sure this chunk matches our hash.
1262          */
1263         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1264             && zl.l_phys->l_hdr.lh_prefix
1265             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1266                 return (ENOENT);
1267
1268         /*
1269          * Hash within the chunk to find our entry.
1270          */
1271         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1272         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1273         h = zl.l_phys->l_hash[h];
1274         if (h == 0xffff)
1275                 return (ENOENT);
1276         zc = &ZAP_LEAF_CHUNK(&zl, h);
1277         while (zc->l_entry.le_hash != hash) {
1278                 if (zc->l_entry.le_next == 0xffff) {
1279                         zc = 0;
1280                         break;
1281                 }
1282                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1283         }
1284         if (fzap_name_equal(&zl, zc, name)) {
1285                 *value = fzap_leaf_value(&zl, zc);
1286                 return (0);
1287         }
1288
1289         return (ENOENT);
1290 }
1291
1292 /*
1293  * Lookup a name in a zap object and return its value as a uint64_t.
1294  */
1295 static int
1296 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1297 {
1298         int rc;
1299         uint64_t zap_type;
1300         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1301
1302         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1303         if (rc)
1304                 return (rc);
1305
1306         zap_type = *(uint64_t *) zap_scratch;
1307         if (zap_type == ZBT_MICRO)
1308                 return mzap_lookup(spa, dnode, name, value);
1309         else if (zap_type == ZBT_HEADER)
1310                 return fzap_lookup(spa, dnode, name, value);
1311         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1312         return (EIO);
1313 }
1314
1315 #ifdef BOOT2
1316
1317 /*
1318  * List a microzap directory. Assumes that the zap scratch buffer contains
1319  * the directory contents.
1320  */
1321 static int
1322 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
1323 {
1324         const mzap_phys_t *mz;
1325         const mzap_ent_phys_t *mze;
1326         size_t size;
1327         int chunks, i;
1328
1329         /*
1330          * Microzap objects use exactly one block. Read the whole
1331          * thing.
1332          */
1333         size = dnode->dn_datablkszsec * 512;
1334         mz = (const mzap_phys_t *) zap_scratch;
1335         chunks = size / MZAP_ENT_LEN - 1;
1336
1337         for (i = 0; i < chunks; i++) {
1338                 mze = &mz->mz_chunk[i];
1339                 if (mze->mze_name[0])
1340                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
1341                         printf("%s\n", mze->mze_name);
1342         }
1343
1344         return (0);
1345 }
1346
1347 /*
1348  * List a fatzap directory. Assumes that the zap scratch buffer contains
1349  * the directory header.
1350  */
1351 static int
1352 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
1353 {
1354         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1355         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1356         fat_zap_t z;
1357         int i, j;
1358
1359         if (zh.zap_magic != ZAP_MAGIC)
1360                 return (EIO);
1361
1362         z.zap_block_shift = ilog2(bsize);
1363         z.zap_phys = (zap_phys_t *) zap_scratch;
1364
1365         /*
1366          * This assumes that the leaf blocks start at block 1. The
1367          * documentation isn't exactly clear on this.
1368          */
1369         zap_leaf_t zl;
1370         zl.l_bs = z.zap_block_shift;
1371         for (i = 0; i < zh.zap_num_leafs; i++) {
1372                 off_t off = (i + 1) << zl.l_bs;
1373                 char name[256], *p;
1374                 uint64_t value;
1375
1376                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1377                         return (EIO);
1378
1379                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1380
1381                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1382                         zap_leaf_chunk_t *zc, *nc;
1383                         int namelen;
1384
1385                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1386                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1387                                 continue;
1388                         namelen = zc->l_entry.le_name_length;
1389                         if (namelen > sizeof(name))
1390                                 namelen = sizeof(name);
1391                         
1392                         /*
1393                          * Paste the name back together.
1394                          */
1395                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1396                         p = name;
1397                         while (namelen > 0) {
1398                                 int len;
1399                                 len = namelen;
1400                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1401                                         len = ZAP_LEAF_ARRAY_BYTES;
1402                                 memcpy(p, nc->l_array.la_array, len);
1403                                 p += len;
1404                                 namelen -= len;
1405                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1406                         }
1407
1408                         /*
1409                          * Assume the first eight bytes of the value are
1410                          * a uint64_t.
1411                          */
1412                         value = fzap_leaf_value(&zl, zc);
1413
1414                         printf("%s 0x%llx\n", name, value);
1415                 }
1416         }
1417
1418         return (0);
1419 }
1420
1421 /*
1422  * List a zap directory.
1423  */
1424 static int
1425 zap_list(spa_t *spa, const dnode_phys_t *dnode)
1426 {
1427         uint64_t zap_type;
1428         size_t size = dnode->dn_datablkszsec * 512;
1429
1430         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1431                 return (EIO);
1432
1433         zap_type = *(uint64_t *) zap_scratch;
1434         if (zap_type == ZBT_MICRO)
1435                 return mzap_list(spa, dnode);
1436         else
1437                 return fzap_list(spa, dnode);
1438 }
1439
1440 #endif
1441
1442 static int
1443 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1444 {
1445         off_t offset;
1446
1447         offset = objnum * sizeof(dnode_phys_t);
1448         return dnode_read(spa, &os->os_meta_dnode, offset,
1449                 dnode, sizeof(dnode_phys_t));
1450 }
1451
1452 /*
1453  * Find the object set given the object number of its dataset object
1454  * and return its details in *objset
1455  */
1456 static int
1457 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1458 {
1459         dnode_phys_t dataset;
1460         dsl_dataset_phys_t *ds;
1461
1462         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1463                 printf("ZFS: can't find dataset %llu\n", objnum);
1464                 return (EIO);
1465         }
1466
1467         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1468         if (zio_read(spa, &ds->ds_bp, objset)) {
1469                 printf("ZFS: can't read object set for dataset %llu\n", objnum);
1470                 return (EIO);
1471         }
1472
1473         return (0);
1474 }
1475
1476 /*
1477  * Find the object set pointed to by the BOOTFS property or the root
1478  * dataset if there is none and return its details in *objset
1479  */
1480 static int
1481 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
1482 {
1483         dnode_phys_t dir, propdir;
1484         uint64_t props, bootfs, root;
1485
1486         /*
1487          * Start with the MOS directory object.
1488          */
1489         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1490                 printf("ZFS: can't read MOS object directory\n");
1491                 return (EIO);
1492         }
1493
1494         /*
1495          * Lookup the pool_props and see if we can find a bootfs.
1496          */
1497         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1498              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1499              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1500              && bootfs != 0)
1501                 return zfs_mount_dataset(spa, bootfs, objset);
1502
1503         /*
1504          * Lookup the root dataset directory
1505          */
1506         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1507             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1508                 printf("ZFS: can't find root dsl_dir\n");
1509                 return (EIO);
1510         }
1511
1512         /*
1513          * Use the information from the dataset directory's bonus buffer
1514          * to find the dataset object and from that the object set itself.
1515          */
1516         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1517         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
1518 }
1519
1520 static int
1521 zfs_mount_pool(spa_t *spa)
1522 {
1523
1524         /*
1525          * Find the MOS and work our way in from there.
1526          */
1527         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1528                 printf("ZFS: can't read MOS\n");
1529                 return (EIO);
1530         }
1531
1532         /*
1533          * Find the root object set
1534          */
1535         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
1536                 printf("Can't find root filesystem - giving up\n");
1537                 return (EIO);
1538         }
1539
1540         return (0);
1541 }
1542
1543 static int
1544 zfs_dnode_stat(spa_t *spa, dnode_phys_t *dn, struct stat *sb)
1545 {
1546
1547         if (dn->dn_bonustype != DMU_OT_SA) {
1548                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
1549
1550                 sb->st_mode = zp->zp_mode;
1551                 sb->st_uid = zp->zp_uid;
1552                 sb->st_gid = zp->zp_gid;
1553                 sb->st_size = zp->zp_size;
1554         } else {
1555                 sa_hdr_phys_t *sahdrp;
1556                 int hdrsize;
1557                 size_t size = 0;
1558                 void *buf = NULL;
1559
1560                 if (dn->dn_bonuslen != 0)
1561                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
1562                 else {
1563                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
1564                                 blkptr_t *bp = &dn->dn_spill;
1565                                 int error;
1566
1567                                 size = BP_GET_LSIZE(bp);
1568                                 buf = zfs_alloc(size);
1569                                 error = zio_read(spa, bp, buf);
1570                                 if (error != 0) {
1571                                         zfs_free(buf, size);
1572                                         return (error);
1573                                 }
1574                                 sahdrp = buf;
1575                         } else {
1576                                 return (EIO);
1577                         }
1578                 }
1579                 hdrsize = SA_HDR_SIZE(sahdrp);
1580                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
1581                     SA_MODE_OFFSET);
1582                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
1583                     SA_UID_OFFSET);
1584                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
1585                     SA_GID_OFFSET);
1586                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
1587                     SA_SIZE_OFFSET);
1588                 if (buf != NULL)
1589                         zfs_free(buf, size);
1590         }
1591
1592         return (0);
1593 }
1594
1595 /*
1596  * Lookup a file and return its dnode.
1597  */
1598 static int
1599 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
1600 {
1601         int rc;
1602         uint64_t objnum, rootnum, parentnum;
1603         dnode_phys_t dn;
1604         const char *p, *q;
1605         char element[256];
1606         char path[1024];
1607         int symlinks_followed = 0;
1608         struct stat sb;
1609
1610         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
1611                 printf("ZFS: unexpected object set type %llu\n",
1612                        spa->spa_root_objset.os_type);
1613                 return (EIO);
1614         }
1615
1616         /*
1617          * Get the root directory dnode.
1618          */
1619         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
1620         if (rc)
1621                 return (rc);
1622
1623         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
1624         if (rc)
1625                 return (rc);
1626
1627         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
1628         if (rc)
1629                 return (rc);
1630
1631         objnum = rootnum;
1632         p = upath;
1633         while (p && *p) {
1634                 while (*p == '/')
1635                         p++;
1636                 if (!*p)
1637                         break;
1638                 q = strchr(p, '/');
1639                 if (q) {
1640                         memcpy(element, p, q - p);
1641                         element[q - p] = 0;
1642                         p = q;
1643                 } else {
1644                         strcpy(element, p);
1645                         p = 0;
1646                 }
1647
1648                 rc = zfs_dnode_stat(spa, &dn, &sb);
1649                 if (rc)
1650                         return (rc);
1651                 if (!S_ISDIR(sb.st_mode))
1652                         return (ENOTDIR);
1653
1654                 parentnum = objnum;
1655                 rc = zap_lookup(spa, &dn, element, &objnum);
1656                 if (rc)
1657                         return (rc);
1658                 objnum = ZFS_DIRENT_OBJ(objnum);
1659
1660                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1661                 if (rc)
1662                         return (rc);
1663
1664                 /*
1665                  * Check for symlink.
1666                  */
1667                 rc = zfs_dnode_stat(spa, &dn, &sb);
1668                 if (rc)
1669                         return (rc);
1670                 if (S_ISLNK(sb.st_mode)) {
1671                         if (symlinks_followed > 10)
1672                                 return (EMLINK);
1673                         symlinks_followed++;
1674
1675                         /*
1676                          * Read the link value and copy the tail of our
1677                          * current path onto the end.
1678                          */
1679                         if (p)
1680                                 strcpy(&path[sb.st_size], p);
1681                         else
1682                                 path[sb.st_size] = 0;
1683                         if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
1684                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
1685                                         sb.st_size);
1686                         } else {
1687                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
1688                                 if (rc)
1689                                         return (rc);
1690                         }
1691
1692                         /*
1693                          * Restart with the new path, starting either at
1694                          * the root or at the parent depending whether or
1695                          * not the link is relative.
1696                          */
1697                         p = path;
1698                         if (*p == '/')
1699                                 objnum = rootnum;
1700                         else
1701                                 objnum = parentnum;
1702                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1703                 }
1704         }
1705
1706         *dnode = dn;
1707         return (0);
1708 }