]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/boot/zfs/zfsimpl.c
MFV r308954:
[FreeBSD/FreeBSD.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/stat.h>
35 #include <sys/stdint.h>
36
37 #include "zfsimpl.h"
38 #include "zfssubr.c"
39
40
41 struct zfsmount {
42         const spa_t     *spa;
43         objset_phys_t   objset;
44         uint64_t        rootobj;
45 };
46
47 /*
48  * List of all vdevs, chained through v_alllink.
49  */
50 static vdev_list_t zfs_vdevs;
51
/*
 * Names of the pool features this reader accepts in a label's
 * "features for read" list.  The table is NULL-terminated and is
 * consulted by nvlist_check_features_for_read().
 */
static const char *features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	NULL
};
65
66 /*
67  * List of all pools, chained through spa_link.
68  */
69 static spa_list_t zfs_pools;
70
71 static uint64_t zfs_crc64_table[256];
72 static const dnode_phys_t *dnode_cache_obj = 0;
73 static uint64_t dnode_cache_bn;
74 static char *dnode_cache_buf;
75 static char *zap_scratch;
76 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
77
78 #define TEMP_SIZE       (1024 * 1024)
79
80 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
81 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
82 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
83 static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
84     const char *name, uint64_t integer_size, uint64_t num_integers,
85     void *value);
86
87 static void
88 zfs_init(void)
89 {
90         STAILQ_INIT(&zfs_vdevs);
91         STAILQ_INIT(&zfs_pools);
92
93         zfs_temp_buf = malloc(TEMP_SIZE);
94         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
95         zfs_temp_ptr = zfs_temp_buf;
96         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
97         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
98
99         zfs_init_crc();
100 }
101
102 static void *
103 zfs_alloc(size_t size)
104 {
105         char *ptr;
106
107         if (zfs_temp_ptr + size > zfs_temp_end) {
108                 printf("ZFS: out of temporary buffer space\n");
109                 for (;;) ;
110         }
111         ptr = zfs_temp_ptr;
112         zfs_temp_ptr += size;
113
114         return (ptr);
115 }
116
117 static void
118 zfs_free(void *ptr, size_t size)
119 {
120
121         zfs_temp_ptr -= size;
122         if (zfs_temp_ptr != ptr) {
123                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
124                 for (;;) ;
125         }
126 }
127
/*
 * Decode one big-endian 32-bit signed integer from the XDR stream.
 *
 * xdr: in/out cursor; advanced by 4 bytes.
 * ip:  receives the decoded value.
 *
 * Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value in an unsigned type: shifting a promoted
	 * (signed int) byte >= 0x80 left by 24 overflows int, which is
	 * undefined behaviour.
	 */
	v = ((uint32_t)p[0] << 24)
		| ((uint32_t)p[1] << 16)
		| ((uint32_t)p[2] << 8)
		| ((uint32_t)p[3] << 0);
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
138
/*
 * Decode one big-endian 32-bit unsigned integer from the XDR stream.
 *
 * xdr: in/out cursor; advanced by 4 bytes.
 * ip:  receives the decoded value.
 *
 * Always returns 0.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Widen each byte to an unsigned type before shifting; a byte
	 * >= 0x80 shifted left by 24 as a (promoted) int is undefined.
	 */
	v = ((uint32_t)p[0] << 24)
		| ((uint32_t)p[1] << 16)
		| ((uint32_t)p[2] << 8)
		| ((uint32_t)p[3] << 0);
	*ip = v;
	*xdr = p + 4;
	return (0);
}
149
150 static int
151 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
152 {
153         u_int hi, lo;
154
155         xdr_u_int(xdr, &hi);
156         xdr_u_int(xdr, &lo);
157         *lp = (((uint64_t) hi) << 32) | lo;
158         return (0);
159 }
160
161 static int
162 nvlist_find(const unsigned char *nvlist, const char *name, int type,
163             int* elementsp, void *valuep)
164 {
165         const unsigned char *p, *pair;
166         int junk;
167         int encoded_size, decoded_size;
168
169         p = nvlist;
170         xdr_int(&p, &junk);
171         xdr_int(&p, &junk);
172
173         pair = p;
174         xdr_int(&p, &encoded_size);
175         xdr_int(&p, &decoded_size);
176         while (encoded_size && decoded_size) {
177                 int namelen, pairtype, elements;
178                 const char *pairname;
179
180                 xdr_int(&p, &namelen);
181                 pairname = (const char*) p;
182                 p += roundup(namelen, 4);
183                 xdr_int(&p, &pairtype);
184
185                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
186                         xdr_int(&p, &elements);
187                         if (elementsp)
188                                 *elementsp = elements;
189                         if (type == DATA_TYPE_UINT64) {
190                                 xdr_uint64_t(&p, (uint64_t *) valuep);
191                                 return (0);
192                         } else if (type == DATA_TYPE_STRING) {
193                                 int len;
194                                 xdr_int(&p, &len);
195                                 (*(const char**) valuep) = (const char*) p;
196                                 return (0);
197                         } else if (type == DATA_TYPE_NVLIST
198                                    || type == DATA_TYPE_NVLIST_ARRAY) {
199                                 (*(const unsigned char**) valuep) =
200                                          (const unsigned char*) p;
201                                 return (0);
202                         } else {
203                                 return (EIO);
204                         }
205                 } else {
206                         /*
207                          * Not the pair we are looking for, skip to the next one.
208                          */
209                         p = pair + encoded_size;
210                 }
211
212                 pair = p;
213                 xdr_int(&p, &encoded_size);
214                 xdr_int(&p, &decoded_size);
215         }
216
217         return (EIO);
218 }
219
220 static int
221 nvlist_check_features_for_read(const unsigned char *nvlist)
222 {
223         const unsigned char *p, *pair;
224         int junk;
225         int encoded_size, decoded_size;
226         int rc;
227
228         rc = 0;
229
230         p = nvlist;
231         xdr_int(&p, &junk);
232         xdr_int(&p, &junk);
233
234         pair = p;
235         xdr_int(&p, &encoded_size);
236         xdr_int(&p, &decoded_size);
237         while (encoded_size && decoded_size) {
238                 int namelen, pairtype;
239                 const char *pairname;
240                 int i, found;
241
242                 found = 0;
243
244                 xdr_int(&p, &namelen);
245                 pairname = (const char*) p;
246                 p += roundup(namelen, 4);
247                 xdr_int(&p, &pairtype);
248
249                 for (i = 0; features_for_read[i] != NULL; i++) {
250                         if (!memcmp(pairname, features_for_read[i], namelen)) {
251                                 found = 1;
252                                 break;
253                         }
254                 }
255
256                 if (!found) {
257                         printf("ZFS: unsupported feature: %s\n", pairname);
258                         rc = EIO;
259                 }
260
261                 p = pair + encoded_size;
262
263                 pair = p;
264                 xdr_int(&p, &encoded_size);
265                 xdr_int(&p, &decoded_size);
266         }
267
268         return (rc);
269 }
270
/*
 * Step over one encoded nvlist and return the address just past its
 * terminating pair header, i.e. where the next nvlist of an nvlist
 * array begins.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int skipped;
	int enc_size, dec_size;

	/* Two leading words precede the first pair. */
	cursor = nvlist;
	xdr_int(&cursor, &skipped);
	xdr_int(&cursor, &skipped);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_size);
		xdr_int(&cursor, &dec_size);
		/* A zero size in either word marks the end of the list. */
		if (enc_size == 0 || dec_size == 0)
			break;
		cursor = pair_start + enc_size;
	}

	return cursor;
}
298
299 #ifdef TEST
300
301 static const unsigned char *
302 nvlist_print(const unsigned char *nvlist, unsigned int indent)
303 {
304         static const char* typenames[] = {
305                 "DATA_TYPE_UNKNOWN",
306                 "DATA_TYPE_BOOLEAN",
307                 "DATA_TYPE_BYTE",
308                 "DATA_TYPE_INT16",
309                 "DATA_TYPE_UINT16",
310                 "DATA_TYPE_INT32",
311                 "DATA_TYPE_UINT32",
312                 "DATA_TYPE_INT64",
313                 "DATA_TYPE_UINT64",
314                 "DATA_TYPE_STRING",
315                 "DATA_TYPE_BYTE_ARRAY",
316                 "DATA_TYPE_INT16_ARRAY",
317                 "DATA_TYPE_UINT16_ARRAY",
318                 "DATA_TYPE_INT32_ARRAY",
319                 "DATA_TYPE_UINT32_ARRAY",
320                 "DATA_TYPE_INT64_ARRAY",
321                 "DATA_TYPE_UINT64_ARRAY",
322                 "DATA_TYPE_STRING_ARRAY",
323                 "DATA_TYPE_HRTIME",
324                 "DATA_TYPE_NVLIST",
325                 "DATA_TYPE_NVLIST_ARRAY",
326                 "DATA_TYPE_BOOLEAN_VALUE",
327                 "DATA_TYPE_INT8",
328                 "DATA_TYPE_UINT8",
329                 "DATA_TYPE_BOOLEAN_ARRAY",
330                 "DATA_TYPE_INT8_ARRAY",
331                 "DATA_TYPE_UINT8_ARRAY"
332         };
333
334         unsigned int i, j;
335         const unsigned char *p, *pair;
336         int junk;
337         int encoded_size, decoded_size;
338
339         p = nvlist;
340         xdr_int(&p, &junk);
341         xdr_int(&p, &junk);
342
343         pair = p;
344         xdr_int(&p, &encoded_size);
345         xdr_int(&p, &decoded_size);
346         while (encoded_size && decoded_size) {
347                 int namelen, pairtype, elements;
348                 const char *pairname;
349
350                 xdr_int(&p, &namelen);
351                 pairname = (const char*) p;
352                 p += roundup(namelen, 4);
353                 xdr_int(&p, &pairtype);
354
355                 for (i = 0; i < indent; i++)
356                         printf(" ");
357                 printf("%s %s", typenames[pairtype], pairname);
358
359                 xdr_int(&p, &elements);
360                 switch (pairtype) {
361                 case DATA_TYPE_UINT64: {
362                         uint64_t val;
363                         xdr_uint64_t(&p, &val);
364                         printf(" = 0x%jx\n", (uintmax_t)val);
365                         break;
366                 }
367
368                 case DATA_TYPE_STRING: {
369                         int len;
370                         xdr_int(&p, &len);
371                         printf(" = \"%s\"\n", p);
372                         break;
373                 }
374
375                 case DATA_TYPE_NVLIST:
376                         printf("\n");
377                         nvlist_print(p, indent + 1);
378                         break;
379
380                 case DATA_TYPE_NVLIST_ARRAY:
381                         for (j = 0; j < elements; j++) {
382                                 printf("[%d]\n", j);
383                                 p = nvlist_print(p, indent + 1);
384                                 if (j != elements - 1) {
385                                         for (i = 0; i < indent; i++)
386                                                 printf(" ");
387                                         printf("%s %s", typenames[pairtype], pairname);
388                                 }
389                         }
390                         break;
391
392                 default:
393                         printf("\n");
394                 }
395
396                 p = pair + encoded_size;
397
398                 pair = p;
399                 xdr_int(&p, &encoded_size);
400                 xdr_int(&p, &decoded_size);
401         }
402
403         return p;
404 }
405
406 #endif
407
408 static int
409 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
410     off_t offset, size_t size)
411 {
412         size_t psize;
413         int rc;
414
415         if (!vdev->v_phys_read)
416                 return (EIO);
417
418         if (bp) {
419                 psize = BP_GET_PSIZE(bp);
420         } else {
421                 psize = size;
422         }
423
424         /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
425         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
426         if (rc)
427                 return (rc);
428         if (bp && zio_checksum_verify(vdev->spa, bp, buf))
429                 return (EIO);
430
431         return (0);
432 }
433
434 static int
435 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
436     off_t offset, size_t bytes)
437 {
438
439         return (vdev_read_phys(vdev, bp, buf,
440                 offset + VDEV_LABEL_START_SIZE, bytes));
441 }
442
443
444 static int
445 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
446     off_t offset, size_t bytes)
447 {
448         vdev_t *kid;
449         int rc;
450
451         rc = EIO;
452         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
453                 if (kid->v_state != VDEV_STATE_HEALTHY)
454                         continue;
455                 rc = kid->v_read(kid, bp, buf, offset, bytes);
456                 if (!rc)
457                         return (0);
458         }
459
460         return (rc);
461 }
462
463 static int
464 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
465     off_t offset, size_t bytes)
466 {
467         vdev_t *kid;
468
469         /*
470          * Here we should have two kids:
471          * First one which is the one we are replacing and we can trust
472          * only this one to have valid data, but it might not be present.
473          * Second one is that one we are replacing with. It is most likely
474          * healthy, but we can't trust it has needed data, so we won't use it.
475          */
476         kid = STAILQ_FIRST(&vdev->v_children);
477         if (kid == NULL)
478                 return (EIO);
479         if (kid->v_state != VDEV_STATE_HEALTHY)
480                 return (EIO);
481         return (kid->v_read(kid, bp, buf, offset, bytes));
482 }
483
484 static vdev_t *
485 vdev_find(uint64_t guid)
486 {
487         vdev_t *vdev;
488
489         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
490                 if (vdev->v_guid == guid)
491                         return (vdev);
492
493         return (0);
494 }
495
496 static vdev_t *
497 vdev_create(uint64_t guid, vdev_read_t *read)
498 {
499         vdev_t *vdev;
500
501         vdev = malloc(sizeof(vdev_t));
502         memset(vdev, 0, sizeof(vdev_t));
503         STAILQ_INIT(&vdev->v_children);
504         vdev->v_guid = guid;
505         vdev->v_state = VDEV_STATE_OFFLINE;
506         vdev->v_read = read;
507         vdev->v_phys_read = 0;
508         vdev->v_read_priv = 0;
509         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
510
511         return (vdev);
512 }
513
514 static int
515 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
516     vdev_t **vdevp, int is_newer)
517 {
518         int rc;
519         uint64_t guid, id, ashift, nparity;
520         const char *type;
521         const char *path;
522         vdev_t *vdev, *kid;
523         const unsigned char *kids;
524         int nkids, i, is_new;
525         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
526
527         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
528                         DATA_TYPE_UINT64, 0, &guid)
529             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
530                            DATA_TYPE_UINT64, 0, &id)
531             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
532                            DATA_TYPE_STRING, 0, &type)) {
533                 printf("ZFS: can't find vdev details\n");
534                 return (ENOENT);
535         }
536
537         if (strcmp(type, VDEV_TYPE_MIRROR)
538             && strcmp(type, VDEV_TYPE_DISK)
539 #ifdef ZFS_TEST
540             && strcmp(type, VDEV_TYPE_FILE)
541 #endif
542             && strcmp(type, VDEV_TYPE_RAIDZ)
543             && strcmp(type, VDEV_TYPE_REPLACING)) {
544                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
545                 return (EIO);
546         }
547
548         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
549
550         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
551                         &is_offline);
552         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
553                         &is_removed);
554         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
555                         &is_faulted);
556         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
557                         &is_degraded);
558         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
559                         &isnt_present);
560
561         vdev = vdev_find(guid);
562         if (!vdev) {
563                 is_new = 1;
564
565                 if (!strcmp(type, VDEV_TYPE_MIRROR))
566                         vdev = vdev_create(guid, vdev_mirror_read);
567                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
568                         vdev = vdev_create(guid, vdev_raidz_read);
569                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
570                         vdev = vdev_create(guid, vdev_replacing_read);
571                 else
572                         vdev = vdev_create(guid, vdev_disk_read);
573
574                 vdev->v_id = id;
575                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
576                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
577                         DATA_TYPE_UINT64, 0, &ashift) == 0)
578                         vdev->v_ashift = ashift;
579                 else
580                         vdev->v_ashift = 0;
581                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
582                         DATA_TYPE_UINT64, 0, &nparity) == 0)
583                         vdev->v_nparity = nparity;
584                 else
585                         vdev->v_nparity = 0;
586                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
587                                 DATA_TYPE_STRING, 0, &path) == 0) {
588                         if (strncmp(path, "/dev/", 5) == 0)
589                                 path += 5;
590                         vdev->v_name = strdup(path);
591                 } else {
592                         if (!strcmp(type, "raidz")) {
593                                 if (vdev->v_nparity == 1)
594                                         vdev->v_name = "raidz1";
595                                 else if (vdev->v_nparity == 2)
596                                         vdev->v_name = "raidz2";
597                                 else if (vdev->v_nparity == 3)
598                                         vdev->v_name = "raidz3";
599                                 else {
600                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
601                                         return (EIO);
602                                 }
603                         } else {
604                                 vdev->v_name = strdup(type);
605                         }
606                 }
607         } else {
608                 is_new = 0;
609         }
610
611         if (is_new || is_newer) {
612                 /*
613                  * This is either new vdev or we've already seen this vdev,
614                  * but from an older vdev label, so let's refresh its state
615                  * from the newer label.
616                  */
617                 if (is_offline)
618                         vdev->v_state = VDEV_STATE_OFFLINE;
619                 else if (is_removed)
620                         vdev->v_state = VDEV_STATE_REMOVED;
621                 else if (is_faulted)
622                         vdev->v_state = VDEV_STATE_FAULTED;
623                 else if (is_degraded)
624                         vdev->v_state = VDEV_STATE_DEGRADED;
625                 else if (isnt_present)
626                         vdev->v_state = VDEV_STATE_CANT_OPEN;
627         }
628
629         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
630                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
631         /*
632          * Its ok if we don't have any kids.
633          */
634         if (rc == 0) {
635                 vdev->v_nchildren = nkids;
636                 for (i = 0; i < nkids; i++) {
637                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
638                         if (rc)
639                                 return (rc);
640                         if (is_new)
641                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
642                                                    v_childlink);
643                         kids = nvlist_next(kids);
644                 }
645         } else {
646                 vdev->v_nchildren = 0;
647         }
648
649         if (vdevp)
650                 *vdevp = vdev;
651         return (0);
652 }
653
654 static void
655 vdev_set_state(vdev_t *vdev)
656 {
657         vdev_t *kid;
658         int good_kids;
659         int bad_kids;
660
661         /*
662          * A mirror or raidz is healthy if all its kids are healthy. A
663          * mirror is degraded if any of its kids is healthy; a raidz
664          * is degraded if at most nparity kids are offline.
665          */
666         if (STAILQ_FIRST(&vdev->v_children)) {
667                 good_kids = 0;
668                 bad_kids = 0;
669                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
670                         if (kid->v_state == VDEV_STATE_HEALTHY)
671                                 good_kids++;
672                         else
673                                 bad_kids++;
674                 }
675                 if (bad_kids == 0) {
676                         vdev->v_state = VDEV_STATE_HEALTHY;
677                 } else {
678                         if (vdev->v_read == vdev_mirror_read) {
679                                 if (good_kids) {
680                                         vdev->v_state = VDEV_STATE_DEGRADED;
681                                 } else {
682                                         vdev->v_state = VDEV_STATE_OFFLINE;
683                                 }
684                         } else if (vdev->v_read == vdev_raidz_read) {
685                                 if (bad_kids > vdev->v_nparity) {
686                                         vdev->v_state = VDEV_STATE_OFFLINE;
687                                 } else {
688                                         vdev->v_state = VDEV_STATE_DEGRADED;
689                                 }
690                         }
691                 }
692         }
693 }
694
695 static spa_t *
696 spa_find_by_guid(uint64_t guid)
697 {
698         spa_t *spa;
699
700         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
701                 if (spa->spa_guid == guid)
702                         return (spa);
703
704         return (0);
705 }
706
707 static spa_t *
708 spa_find_by_name(const char *name)
709 {
710         spa_t *spa;
711
712         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
713                 if (!strcmp(spa->spa_name, name))
714                         return (spa);
715
716         return (0);
717 }
718
719 #ifdef BOOT2
720 static spa_t *
721 spa_get_primary(void)
722 {
723
724         return (STAILQ_FIRST(&zfs_pools));
725 }
726
727 static vdev_t *
728 spa_get_primary_vdev(const spa_t *spa)
729 {
730         vdev_t *vdev;
731         vdev_t *kid;
732
733         if (spa == NULL)
734                 spa = spa_get_primary();
735         if (spa == NULL)
736                 return (NULL);
737         vdev = STAILQ_FIRST(&spa->spa_vdevs);
738         if (vdev == NULL)
739                 return (NULL);
740         for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
741              kid = STAILQ_FIRST(&vdev->v_children))
742                 vdev = kid;
743         return (vdev);
744 }
745 #endif
746
747 static spa_t *
748 spa_create(uint64_t guid)
749 {
750         spa_t *spa;
751
752         spa = malloc(sizeof(spa_t));
753         memset(spa, 0, sizeof(spa_t));
754         STAILQ_INIT(&spa->spa_vdevs);
755         spa->spa_guid = guid;
756         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
757
758         return (spa);
759 }
760
761 static const char *
762 state_name(vdev_state_t state)
763 {
764         static const char* names[] = {
765                 "UNKNOWN",
766                 "CLOSED",
767                 "OFFLINE",
768                 "REMOVED",
769                 "CANT_OPEN",
770                 "FAULTED",
771                 "DEGRADED",
772                 "ONLINE"
773         };
774         return names[state];
775 }
776
777 #ifdef BOOT2
778
779 #define pager_printf printf
780
781 #else
782
/*
 * printf()-style front end for pager_output(): format into a local
 * line buffer and pass the result to the pager.
 *
 * Output longer than the buffer is truncated.  The previous vsprintf()
 * call could write past the 80-byte buffer, because callers such as
 * print_state() pass strings of up to 511 bytes.
 * NOTE(review): this needs vsnprintf() from the stand-alone library;
 * confirm it is available in this tree.
 *
 * Returns whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	return (pager_output(line));
}
794
795 #endif
796
797 #define STATUS_FORMAT   "        %s %s\n"
798
799 static int
800 print_state(int indent, const char *name, vdev_state_t state)
801 {
802         int i;
803         char buf[512];
804
805         buf[0] = 0;
806         for (i = 0; i < indent; i++)
807                 strcat(buf, "  ");
808         strcat(buf, name);
809         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
810         
811 }
812
813 static int
814 vdev_status(vdev_t *vdev, int indent)
815 {
816         vdev_t *kid;
817         int ret;
818         ret = print_state(indent, vdev->v_name, vdev->v_state);
819         if (ret != 0)
820                 return (ret);
821
822         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
823                 ret = vdev_status(kid, indent + 1);
824                 if (ret != 0)
825                         return (ret);
826         }
827         return (ret);
828 }
829
830 static int
831 spa_status(spa_t *spa)
832 {
833         static char bootfs[ZFS_MAXNAMELEN];
834         uint64_t rootid;
835         vdev_t *vdev;
836         int good_kids, bad_kids, degraded_kids, ret;
837         vdev_state_t state;
838
839         ret = pager_printf("  pool: %s\n", spa->spa_name);
840         if (ret != 0)
841                 return (ret);
842
843         if (zfs_get_root(spa, &rootid) == 0 &&
844             zfs_rlookup(spa, rootid, bootfs) == 0) {
845                 if (bootfs[0] == '\0')
846                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
847                 else
848                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
849                             bootfs);
850                 if (ret != 0)
851                         return (ret);
852         }
853         ret = pager_printf("config:\n\n");
854         if (ret != 0)
855                 return (ret);
856         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
857         if (ret != 0)
858                 return (ret);
859
860         good_kids = 0;
861         degraded_kids = 0;
862         bad_kids = 0;
863         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
864                 if (vdev->v_state == VDEV_STATE_HEALTHY)
865                         good_kids++;
866                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
867                         degraded_kids++;
868                 else
869                         bad_kids++;
870         }
871
872         state = VDEV_STATE_CLOSED;
873         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
874                 state = VDEV_STATE_HEALTHY;
875         else if ((good_kids + degraded_kids) > 0)
876                 state = VDEV_STATE_DEGRADED;
877
878         ret = print_state(0, spa->spa_name, state);
879         if (ret != 0)
880                 return (ret);
881         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
882                 ret = vdev_status(vdev, 1);
883                 if (ret != 0)
884                         return (ret);
885         }
886         return (ret);
887 }
888
889 static int
890 spa_all_status(void)
891 {
892         spa_t *spa;
893         int first = 1, ret = 0;
894
895         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
896                 if (!first) {
897                         ret = pager_printf("\n");
898                         if (ret != 0)
899                                 return (ret);
900                 }
901                 first = 0;
902                 ret = spa_status(spa);
903                 if (ret != 0)
904                         return (ret);
905         }
906         return (ret);
907 }
908
909 static int
910 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
911 {
912         vdev_t vtmp;
913         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
914         spa_t *spa;
915         vdev_t *vdev, *top_vdev, *pool_vdev;
916         off_t off;
917         blkptr_t bp;
918         const unsigned char *nvlist;
919         uint64_t val;
920         uint64_t guid;
921         uint64_t pool_txg, pool_guid;
922         uint64_t is_log;
923         const char *pool_name;
924         const unsigned char *vdevs;
925         const unsigned char *features;
926         int i, rc, is_newer;
927         char *upbuf;
928         const struct uberblock *up;
929
930         /*
931          * Load the vdev label and figure out which
932          * uberblock is most current.
933          */
934         memset(&vtmp, 0, sizeof(vtmp));
935         vtmp.v_phys_read = read;
936         vtmp.v_read_priv = read_priv;
937         off = offsetof(vdev_label_t, vl_vdev_phys);
938         BP_ZERO(&bp);
939         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
940         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
941         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
942         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
943         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
944         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
945         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
946                 return (EIO);
947
948         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
949                 return (EIO);
950         }
951
952         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
953
954         if (nvlist_find(nvlist,
955                         ZPOOL_CONFIG_VERSION,
956                         DATA_TYPE_UINT64, 0, &val)) {
957                 return (EIO);
958         }
959
960         if (!SPA_VERSION_IS_SUPPORTED(val)) {
961                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
962                     (unsigned) val, (unsigned) SPA_VERSION);
963                 return (EIO);
964         }
965
966         /* Check ZFS features for read */
967         if (nvlist_find(nvlist,
968                         ZPOOL_CONFIG_FEATURES_FOR_READ,
969                         DATA_TYPE_NVLIST, 0, &features) == 0
970             && nvlist_check_features_for_read(features) != 0)
971                 return (EIO);
972
973         if (nvlist_find(nvlist,
974                         ZPOOL_CONFIG_POOL_STATE,
975                         DATA_TYPE_UINT64, 0, &val)) {
976                 return (EIO);
977         }
978
979         if (val == POOL_STATE_DESTROYED) {
980                 /* We don't boot only from destroyed pools. */
981                 return (EIO);
982         }
983
984         if (nvlist_find(nvlist,
985                         ZPOOL_CONFIG_POOL_TXG,
986                         DATA_TYPE_UINT64, 0, &pool_txg)
987             || nvlist_find(nvlist,
988                            ZPOOL_CONFIG_POOL_GUID,
989                            DATA_TYPE_UINT64, 0, &pool_guid)
990             || nvlist_find(nvlist,
991                            ZPOOL_CONFIG_POOL_NAME,
992                            DATA_TYPE_STRING, 0, &pool_name)) {
993                 /*
994                  * Cache and spare devices end up here - just ignore
995                  * them.
996                  */
997                 /*printf("ZFS: can't find pool details\n");*/
998                 return (EIO);
999         }
1000
1001         is_log = 0;
1002         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
1003             &is_log);
1004         if (is_log)
1005                 return (EIO);
1006
1007         /*
1008          * Create the pool if this is the first time we've seen it.
1009          */
1010         spa = spa_find_by_guid(pool_guid);
1011         if (!spa) {
1012                 spa = spa_create(pool_guid);
1013                 spa->spa_name = strdup(pool_name);
1014         }
1015         if (pool_txg > spa->spa_txg) {
1016                 spa->spa_txg = pool_txg;
1017                 is_newer = 1;
1018         } else
1019                 is_newer = 0;
1020
1021         /*
1022          * Get the vdev tree and create our in-core copy of it.
1023          * If we already have a vdev with this guid, this must
1024          * be some kind of alias (overlapping slices, dangerously dedicated
1025          * disks etc).
1026          */
1027         if (nvlist_find(nvlist,
1028                         ZPOOL_CONFIG_GUID,
1029                         DATA_TYPE_UINT64, 0, &guid)) {
1030                 return (EIO);
1031         }
1032         vdev = vdev_find(guid);
1033         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
1034                 return (EIO);
1035
1036         if (nvlist_find(nvlist,
1037                         ZPOOL_CONFIG_VDEV_TREE,
1038                         DATA_TYPE_NVLIST, 0, &vdevs)) {
1039                 return (EIO);
1040         }
1041
1042         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1043         if (rc)
1044                 return (rc);
1045
1046         /*
1047          * Add the toplevel vdev to the pool if its not already there.
1048          */
1049         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1050                 if (top_vdev == pool_vdev)
1051                         break;
1052         if (!pool_vdev && top_vdev) {
1053                 top_vdev->spa = spa;
1054                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1055         }
1056
1057         /*
1058          * We should already have created an incomplete vdev for this
1059          * vdev. Find it and initialise it with our read proc.
1060          */
1061         vdev = vdev_find(guid);
1062         if (vdev) {
1063                 vdev->v_phys_read = read;
1064                 vdev->v_read_priv = read_priv;
1065                 vdev->v_state = VDEV_STATE_HEALTHY;
1066         } else {
1067                 printf("ZFS: inconsistent nvlist contents\n");
1068                 return (EIO);
1069         }
1070
1071         /*
1072          * Re-evaluate top-level vdev state.
1073          */
1074         vdev_set_state(top_vdev);
1075
1076         /*
1077          * Ok, we are happy with the pool so far. Lets find
1078          * the best uberblock and then we can actually access
1079          * the contents of the pool.
1080          */
1081         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1082         up = (const struct uberblock *)upbuf;
1083         for (i = 0;
1084              i < VDEV_UBERBLOCK_COUNT(vdev);
1085              i++) {
1086                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
1087                 BP_ZERO(&bp);
1088                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
1089                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1090                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1091                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1092                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1093                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1094
1095                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1096                         continue;
1097
1098                 if (up->ub_magic != UBERBLOCK_MAGIC)
1099                         continue;
1100                 if (up->ub_txg < spa->spa_txg)
1101                         continue;
1102                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
1103                         spa->spa_uberblock = *up;
1104                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
1105                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
1106                                 spa->spa_uberblock = *up;
1107                 }
1108         }
1109         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1110
1111         vdev->spa = spa;
1112         if (spap)
1113                 *spap = spa;
1114         return (0);
1115 }
1116
/*
 * Return the base-2 logarithm of n when n is an exact positive power of
 * two, or -1 otherwise (including zero and negative values).
 *
 * The probe bit is unsigned so that testing the top bit position never
 * shifts a 1 into the sign bit of an int, which is undefined behaviour.
 */
static int
ilog2(int n)
{
	unsigned int bit;
	int v;

	if (n <= 0)
		return (-1);

	/* bit wraps to 0 once it has been shifted past the top position. */
	for (v = 0, bit = 1; bit != 0; v++, bit <<= 1) {
		if ((unsigned int)n == bit)
			return (v);
	}
	return (-1);
}
1127
1128 static int
1129 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1130 {
1131         blkptr_t gbh_bp;
1132         zio_gbh_phys_t zio_gb;
1133         char *pbuf;
1134         int i;
1135
1136         /* Artificial BP for gang block header. */
1137         gbh_bp = *bp;
1138         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1139         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1140         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1141         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1142         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1143                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1144
1145         /* Read gang header block using the artificial BP. */
1146         if (zio_read(spa, &gbh_bp, &zio_gb))
1147                 return (EIO);
1148
1149         pbuf = buf;
1150         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1151                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1152
1153                 if (BP_IS_HOLE(gbp))
1154                         continue;
1155                 if (zio_read(spa, gbp, pbuf))
1156                         return (EIO);
1157                 pbuf += BP_GET_PSIZE(gbp);
1158         }
1159
1160         if (zio_checksum_verify(spa, bp, buf))
1161                 return (EIO);
1162         return (0);
1163 }
1164
1165 static int
1166 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1167 {
1168         int cpfunc = BP_GET_COMPRESS(bp);
1169         uint64_t align, size;
1170         void *pbuf;
1171         int i, error;
1172
1173         /*
1174          * Process data embedded in block pointer
1175          */
1176         if (BP_IS_EMBEDDED(bp)) {
1177                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1178
1179                 size = BPE_GET_PSIZE(bp);
1180                 ASSERT(size <= BPE_PAYLOAD_SIZE);
1181
1182                 if (cpfunc != ZIO_COMPRESS_OFF)
1183                         pbuf = zfs_alloc(size);
1184                 else
1185                         pbuf = buf;
1186
1187                 decode_embedded_bp_compressed(bp, pbuf);
1188                 error = 0;
1189
1190                 if (cpfunc != ZIO_COMPRESS_OFF) {
1191                         error = zio_decompress_data(cpfunc, pbuf,
1192                             size, buf, BP_GET_LSIZE(bp));
1193                         zfs_free(pbuf, size);
1194                 }
1195                 if (error != 0)
1196                         printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1197                             error);
1198                 return (error);
1199         }
1200
1201         error = EIO;
1202
1203         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1204                 const dva_t *dva = &bp->blk_dva[i];
1205                 vdev_t *vdev;
1206                 int vdevid;
1207                 off_t offset;
1208
1209                 if (!dva->dva_word[0] && !dva->dva_word[1])
1210                         continue;
1211
1212                 vdevid = DVA_GET_VDEV(dva);
1213                 offset = DVA_GET_OFFSET(dva);
1214                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1215                         if (vdev->v_id == vdevid)
1216                                 break;
1217                 }
1218                 if (!vdev || !vdev->v_read)
1219                         continue;
1220
1221                 size = BP_GET_PSIZE(bp);
1222                 if (vdev->v_read == vdev_raidz_read) {
1223                         align = 1ULL << vdev->v_top->v_ashift;
1224                         if (P2PHASE(size, align) != 0)
1225                                 size = P2ROUNDUP(size, align);
1226                 }
1227                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1228                         pbuf = zfs_alloc(size);
1229                 else
1230                         pbuf = buf;
1231
1232                 if (DVA_GET_GANG(dva))
1233                         error = zio_read_gang(spa, bp, pbuf);
1234                 else
1235                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1236                 if (error == 0) {
1237                         if (cpfunc != ZIO_COMPRESS_OFF)
1238                                 error = zio_decompress_data(cpfunc, pbuf,
1239                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1240                         else if (size != BP_GET_PSIZE(bp))
1241                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1242                 }
1243                 if (buf != pbuf)
1244                         zfs_free(pbuf, size);
1245                 if (error == 0)
1246                         break;
1247         }
1248         if (error != 0)
1249                 printf("ZFS: i/o error - all block copies unavailable\n");
1250         return (error);
1251 }
1252
1253 static int
1254 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1255 {
1256         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1257         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1258         int nlevels = dnode->dn_nlevels;
1259         int i, rc;
1260
1261         if (bsize > SPA_MAXBLOCKSIZE) {
1262                 printf("ZFS: I/O error - blocks larger than %llu are not "
1263                     "supported\n", SPA_MAXBLOCKSIZE);
1264                 return (EIO);
1265         }
1266
1267         /*
1268          * Note: bsize may not be a power of two here so we need to do an
1269          * actual divide rather than a bitshift.
1270          */
1271         while (buflen > 0) {
1272                 uint64_t bn = offset / bsize;
1273                 int boff = offset % bsize;
1274                 int ibn;
1275                 const blkptr_t *indbp;
1276                 blkptr_t bp;
1277
1278                 if (bn > dnode->dn_maxblkid)
1279                         return (EIO);
1280
1281                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1282                         goto cached;
1283
1284                 indbp = dnode->dn_blkptr;
1285                 for (i = 0; i < nlevels; i++) {
1286                         /*
1287                          * Copy the bp from the indirect array so that
1288                          * we can re-use the scratch buffer for multi-level
1289                          * objects.
1290                          */
1291                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1292                         ibn &= ((1 << ibshift) - 1);
1293                         bp = indbp[ibn];
1294                         if (BP_IS_HOLE(&bp)) {
1295                                 memset(dnode_cache_buf, 0, bsize);
1296                                 break;
1297                         }
1298                         rc = zio_read(spa, &bp, dnode_cache_buf);
1299                         if (rc)
1300                                 return (rc);
1301                         indbp = (const blkptr_t *) dnode_cache_buf;
1302                 }
1303                 dnode_cache_obj = dnode;
1304                 dnode_cache_bn = bn;
1305         cached:
1306
1307                 /*
1308                  * The buffer contains our data block. Copy what we
1309                  * need from it and loop.
1310                  */ 
1311                 i = bsize - boff;
1312                 if (i > buflen) i = buflen;
1313                 memcpy(buf, &dnode_cache_buf[boff], i);
1314                 buf = ((char*) buf) + i;
1315                 offset += i;
1316                 buflen -= i;
1317         }
1318
1319         return (0);
1320 }
1321
1322 /*
1323  * Lookup a value in a microzap directory. Assumes that the zap
1324  * scratch buffer contains the directory contents.
1325  */
1326 static int
1327 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1328 {
1329         const mzap_phys_t *mz;
1330         const mzap_ent_phys_t *mze;
1331         size_t size;
1332         int chunks, i;
1333
1334         /*
1335          * Microzap objects use exactly one block. Read the whole
1336          * thing.
1337          */
1338         size = dnode->dn_datablkszsec * 512;
1339
1340         mz = (const mzap_phys_t *) zap_scratch;
1341         chunks = size / MZAP_ENT_LEN - 1;
1342
1343         for (i = 0; i < chunks; i++) {
1344                 mze = &mz->mz_chunk[i];
1345                 if (!strcmp(mze->mze_name, name)) {
1346                         *value = mze->mze_value;
1347                         return (0);
1348                 }
1349         }
1350
1351         return (ENOENT);
1352 }
1353
1354 /*
1355  * Compare a name with a zap leaf entry. Return non-zero if the name
1356  * matches.
1357  */
1358 static int
1359 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1360 {
1361         size_t namelen;
1362         const zap_leaf_chunk_t *nc;
1363         const char *p;
1364
1365         namelen = zc->l_entry.le_name_numints;
1366                         
1367         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1368         p = name;
1369         while (namelen > 0) {
1370                 size_t len;
1371                 len = namelen;
1372                 if (len > ZAP_LEAF_ARRAY_BYTES)
1373                         len = ZAP_LEAF_ARRAY_BYTES;
1374                 if (memcmp(p, nc->l_array.la_array, len))
1375                         return (0);
1376                 p += len;
1377                 namelen -= len;
1378                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1379         }
1380
1381         return 1;
1382 }
1383
1384 /*
1385  * Extract a uint64_t value from a zap leaf entry.
1386  */
1387 static uint64_t
1388 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1389 {
1390         const zap_leaf_chunk_t *vc;
1391         int i;
1392         uint64_t value;
1393         const uint8_t *p;
1394
1395         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1396         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1397                 value = (value << 8) | p[i];
1398         }
1399
1400         return value;
1401 }
1402
/*
 * Store value at addr as an unsigned integer len bytes wide (1, 2, 4 or
 * 8), truncating it to that width.  Any other len stores nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
	if (len == 1)
		*(uint8_t *)addr = value;
	else if (len == 2)
		*(uint16_t *)addr = value;
	else if (len == 4)
		*(uint32_t *)addr = value;
	else if (len == 8)
		*(uint64_t *)addr = value;
}
1421
1422 /*
1423  * Extract a array from a zap leaf entry.
1424  */
1425 static void
1426 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
1427     uint64_t integer_size, uint64_t num_integers, void *buf)
1428 {
1429         uint64_t array_int_len = zc->l_entry.le_value_intlen;
1430         uint64_t value = 0;
1431         uint64_t *u64 = buf;
1432         char *p = buf;
1433         int len = MIN(zc->l_entry.le_value_numints, num_integers);
1434         int chunk = zc->l_entry.le_value_chunk;
1435         int byten = 0;
1436
1437         if (integer_size == 8 && len == 1) {
1438                 *u64 = fzap_leaf_value(zl, zc);
1439                 return;
1440         }
1441
1442         while (len > 0) {
1443                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
1444                 int i;
1445
1446                 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
1447                 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
1448                         value = (value << 8) | la->la_array[i];
1449                         byten++;
1450                         if (byten == array_int_len) {
1451                                 stv(integer_size, p, value);
1452                                 byten = 0;
1453                                 len--;
1454                                 if (len == 0)
1455                                         return;
1456                                 p += integer_size;
1457                         }
1458                 }
1459                 chunk = la->la_next;
1460         }
1461 }
1462
1463 /*
1464  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1465  * buffer contains the directory header.
1466  */
1467 static int
1468 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1469     uint64_t integer_size, uint64_t num_integers, void *value)
1470 {
1471         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1472         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1473         fat_zap_t z;
1474         uint64_t *ptrtbl;
1475         uint64_t hash;
1476         int rc;
1477
1478         if (zh.zap_magic != ZAP_MAGIC)
1479                 return (EIO);
1480
1481         z.zap_block_shift = ilog2(bsize);
1482         z.zap_phys = (zap_phys_t *) zap_scratch;
1483
1484         /*
1485          * Figure out where the pointer table is and read it in if necessary.
1486          */
1487         if (zh.zap_ptrtbl.zt_blk) {
1488                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1489                                zap_scratch, bsize);
1490                 if (rc)
1491                         return (rc);
1492                 ptrtbl = (uint64_t *) zap_scratch;
1493         } else {
1494                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1495         }
1496
1497         hash = zap_hash(zh.zap_salt, name);
1498
1499         zap_leaf_t zl;
1500         zl.l_bs = z.zap_block_shift;
1501
1502         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1503         zap_leaf_chunk_t *zc;
1504
1505         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1506         if (rc)
1507                 return (rc);
1508
1509         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1510
1511         /*
1512          * Make sure this chunk matches our hash.
1513          */
1514         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1515             && zl.l_phys->l_hdr.lh_prefix
1516             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1517                 return (ENOENT);
1518
1519         /*
1520          * Hash within the chunk to find our entry.
1521          */
1522         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1523         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1524         h = zl.l_phys->l_hash[h];
1525         if (h == 0xffff)
1526                 return (ENOENT);
1527         zc = &ZAP_LEAF_CHUNK(&zl, h);
1528         while (zc->l_entry.le_hash != hash) {
1529                 if (zc->l_entry.le_next == 0xffff) {
1530                         zc = 0;
1531                         break;
1532                 }
1533                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1534         }
1535         if (fzap_name_equal(&zl, zc, name)) {
1536                 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
1537                     integer_size * num_integers)
1538                         return (E2BIG);
1539                 fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
1540                 return (0);
1541         }
1542
1543         return (ENOENT);
1544 }
1545
1546 /*
1547  * Lookup a name in a zap object and return its value as a uint64_t.
1548  */
1549 static int
1550 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1551     uint64_t integer_size, uint64_t num_integers, void *value)
1552 {
1553         int rc;
1554         uint64_t zap_type;
1555         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1556
1557         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1558         if (rc)
1559                 return (rc);
1560
1561         zap_type = *(uint64_t *) zap_scratch;
1562         if (zap_type == ZBT_MICRO)
1563                 return mzap_lookup(dnode, name, value);
1564         else if (zap_type == ZBT_HEADER) {
1565                 return fzap_lookup(spa, dnode, name, integer_size,
1566                     num_integers, value);
1567         }
1568         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1569         return (EIO);
1570 }
1571
1572 /*
1573  * List a microzap directory. Assumes that the zap scratch buffer contains
1574  * the directory contents.
1575  */
1576 static int
1577 mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1578 {
1579         const mzap_phys_t *mz;
1580         const mzap_ent_phys_t *mze;
1581         size_t size;
1582         int chunks, i, rc;
1583
1584         /*
1585          * Microzap objects use exactly one block. Read the whole
1586          * thing.
1587          */
1588         size = dnode->dn_datablkszsec * 512;
1589         mz = (const mzap_phys_t *) zap_scratch;
1590         chunks = size / MZAP_ENT_LEN - 1;
1591
1592         for (i = 0; i < chunks; i++) {
1593                 mze = &mz->mz_chunk[i];
1594                 if (mze->mze_name[0]) {
1595                         rc = callback(mze->mze_name, mze->mze_value);
1596                         if (rc != 0)
1597                                 return (rc);
1598                 }
1599         }
1600
1601         return (0);
1602 }
1603
1604 /*
1605  * List a fatzap directory. Assumes that the zap scratch buffer contains
1606  * the directory header.
1607  */
1608 static int
1609 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1610 {
1611         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1612         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1613         fat_zap_t z;
1614         int i, j, rc;
1615
1616         if (zh.zap_magic != ZAP_MAGIC)
1617                 return (EIO);
1618
1619         z.zap_block_shift = ilog2(bsize);
1620         z.zap_phys = (zap_phys_t *) zap_scratch;
1621
1622         /*
1623          * This assumes that the leaf blocks start at block 1. The
1624          * documentation isn't exactly clear on this.
1625          */
1626         zap_leaf_t zl;
1627         zl.l_bs = z.zap_block_shift;
1628         for (i = 0; i < zh.zap_num_leafs; i++) {
1629                 off_t off = (i + 1) << zl.l_bs;
1630                 char name[256], *p;
1631                 uint64_t value;
1632
1633                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1634                         return (EIO);
1635
1636                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1637
1638                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1639                         zap_leaf_chunk_t *zc, *nc;
1640                         int namelen;
1641
1642                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1643                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1644                                 continue;
1645                         namelen = zc->l_entry.le_name_numints;
1646                         if (namelen > sizeof(name))
1647                                 namelen = sizeof(name);
1648
1649                         /*
1650                          * Paste the name back together.
1651                          */
1652                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1653                         p = name;
1654                         while (namelen > 0) {
1655                                 int len;
1656                                 len = namelen;
1657                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1658                                         len = ZAP_LEAF_ARRAY_BYTES;
1659                                 memcpy(p, nc->l_array.la_array, len);
1660                                 p += len;
1661                                 namelen -= len;
1662                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1663                         }
1664
1665                         /*
1666                          * Assume the first eight bytes of the value are
1667                          * a uint64_t.
1668                          */
1669                         value = fzap_leaf_value(&zl, zc);
1670
1671                         //printf("%s 0x%jx\n", name, (uintmax_t)value);
1672                         rc = callback((const char *)name, value);
1673                         if (rc != 0)
1674                                 return (rc);
1675                 }
1676         }
1677
1678         return (0);
1679 }
1680
1681 static int zfs_printf(const char *name, uint64_t value __unused)
1682 {
1683
1684         printf("%s\n", name);
1685
1686         return (0);
1687 }
1688
1689 /*
1690  * List a zap directory.
1691  */
1692 static int
1693 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1694 {
1695         uint64_t zap_type;
1696         size_t size = dnode->dn_datablkszsec * 512;
1697
1698         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1699                 return (EIO);
1700
1701         zap_type = *(uint64_t *) zap_scratch;
1702         if (zap_type == ZBT_MICRO)
1703                 return mzap_list(dnode, zfs_printf);
1704         else
1705                 return fzap_list(spa, dnode, zfs_printf);
1706 }
1707
1708 static int
1709 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1710 {
1711         off_t offset;
1712
1713         offset = objnum * sizeof(dnode_phys_t);
1714         return dnode_read(spa, &os->os_meta_dnode, offset,
1715                 dnode, sizeof(dnode_phys_t));
1716 }
1717
1718 static int
1719 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1720 {
1721         const mzap_phys_t *mz;
1722         const mzap_ent_phys_t *mze;
1723         size_t size;
1724         int chunks, i;
1725
1726         /*
1727          * Microzap objects use exactly one block. Read the whole
1728          * thing.
1729          */
1730         size = dnode->dn_datablkszsec * 512;
1731
1732         mz = (const mzap_phys_t *) zap_scratch;
1733         chunks = size / MZAP_ENT_LEN - 1;
1734
1735         for (i = 0; i < chunks; i++) {
1736                 mze = &mz->mz_chunk[i];
1737                 if (value == mze->mze_value) {
1738                         strcpy(name, mze->mze_name);
1739                         return (0);
1740                 }
1741         }
1742
1743         return (ENOENT);
1744 }
1745
1746 static void
1747 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1748 {
1749         size_t namelen;
1750         const zap_leaf_chunk_t *nc;
1751         char *p;
1752
1753         namelen = zc->l_entry.le_name_numints;
1754
1755         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1756         p = name;
1757         while (namelen > 0) {
1758                 size_t len;
1759                 len = namelen;
1760                 if (len > ZAP_LEAF_ARRAY_BYTES)
1761                         len = ZAP_LEAF_ARRAY_BYTES;
1762                 memcpy(p, nc->l_array.la_array, len);
1763                 p += len;
1764                 namelen -= len;
1765                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1766         }
1767
1768         *p = '\0';
1769 }
1770
1771 static int
1772 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1773 {
1774         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1775         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1776         fat_zap_t z;
1777         int i, j;
1778
1779         if (zh.zap_magic != ZAP_MAGIC)
1780                 return (EIO);
1781
1782         z.zap_block_shift = ilog2(bsize);
1783         z.zap_phys = (zap_phys_t *) zap_scratch;
1784
1785         /*
1786          * This assumes that the leaf blocks start at block 1. The
1787          * documentation isn't exactly clear on this.
1788          */
1789         zap_leaf_t zl;
1790         zl.l_bs = z.zap_block_shift;
1791         for (i = 0; i < zh.zap_num_leafs; i++) {
1792                 off_t off = (i + 1) << zl.l_bs;
1793
1794                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1795                         return (EIO);
1796
1797                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1798
1799                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1800                         zap_leaf_chunk_t *zc;
1801
1802                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1803                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1804                                 continue;
1805                         if (zc->l_entry.le_value_intlen != 8 ||
1806                             zc->l_entry.le_value_numints != 1)
1807                                 continue;
1808
1809                         if (fzap_leaf_value(&zl, zc) == value) {
1810                                 fzap_name_copy(&zl, zc, name);
1811                                 return (0);
1812                         }
1813                 }
1814         }
1815
1816         return (ENOENT);
1817 }
1818
1819 static int
1820 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1821 {
1822         int rc;
1823         uint64_t zap_type;
1824         size_t size = dnode->dn_datablkszsec * 512;
1825
1826         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1827         if (rc)
1828                 return (rc);
1829
1830         zap_type = *(uint64_t *) zap_scratch;
1831         if (zap_type == ZBT_MICRO)
1832                 return mzap_rlookup(spa, dnode, name, value);
1833         else
1834                 return fzap_rlookup(spa, dnode, name, value);
1835 }
1836
1837 static int
1838 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1839 {
1840         char name[256];
1841         char component[256];
1842         uint64_t dir_obj, parent_obj, child_dir_zapobj;
1843         dnode_phys_t child_dir_zap, dataset, dir, parent;
1844         dsl_dir_phys_t *dd;
1845         dsl_dataset_phys_t *ds;
1846         char *p;
1847         int len;
1848
1849         p = &name[sizeof(name) - 1];
1850         *p = '\0';
1851
1852         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1853                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1854                 return (EIO);
1855         }
1856         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1857         dir_obj = ds->ds_dir_obj;
1858
1859         for (;;) {
1860                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1861                         return (EIO);
1862                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1863
1864                 /* Actual loop condition. */
1865                 parent_obj  = dd->dd_parent_obj;
1866                 if (parent_obj == 0)
1867                         break;
1868
1869                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1870                         return (EIO);
1871                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1872                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1873                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1874                         return (EIO);
1875                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1876                         return (EIO);
1877
1878                 len = strlen(component);
1879                 p -= len;
1880                 memcpy(p, component, len);
1881                 --p;
1882                 *p = '/';
1883
1884                 /* Actual loop iteration. */
1885                 dir_obj = parent_obj;
1886         }
1887
1888         if (*p != '\0')
1889                 ++p;
1890         strcpy(result, p);
1891
1892         return (0);
1893 }
1894
1895 static int
1896 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1897 {
1898         char element[256];
1899         uint64_t dir_obj, child_dir_zapobj;
1900         dnode_phys_t child_dir_zap, dir;
1901         dsl_dir_phys_t *dd;
1902         const char *p, *q;
1903
1904         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1905                 return (EIO);
1906         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
1907             1, &dir_obj))
1908                 return (EIO);
1909
1910         p = name;
1911         for (;;) {
1912                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1913                         return (EIO);
1914                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1915
1916                 while (*p == '/')
1917                         p++;
1918                 /* Actual loop condition #1. */
1919                 if (*p == '\0')
1920                         break;
1921
1922                 q = strchr(p, '/');
1923                 if (q) {
1924                         memcpy(element, p, q - p);
1925                         element[q - p] = '\0';
1926                         p = q + 1;
1927                 } else {
1928                         strcpy(element, p);
1929                         p += strlen(p);
1930                 }
1931
1932                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1933                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1934                         return (EIO);
1935
1936                 /* Actual loop condition #2. */
1937                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
1938                     1, &dir_obj) != 0)
1939                         return (ENOENT);
1940         }
1941
1942         *objnum = dd->dd_head_dataset_obj;
1943         return (0);
1944 }
1945
1946 #ifndef BOOT2
1947 static int
1948 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1949 {
1950         uint64_t dir_obj, child_dir_zapobj;
1951         dnode_phys_t child_dir_zap, dir, dataset;
1952         dsl_dataset_phys_t *ds;
1953         dsl_dir_phys_t *dd;
1954
1955         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1956                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1957                 return (EIO);
1958         }
1959         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1960         dir_obj = ds->ds_dir_obj;
1961
1962         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
1963                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1964                 return (EIO);
1965         }
1966         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1967
1968         child_dir_zapobj = dd->dd_child_dir_zapobj;
1969         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
1970                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1971                 return (EIO);
1972         }
1973
1974         return (zap_list(spa, &child_dir_zap) != 0);
1975 }
1976
1977 int
1978 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t))
1979 {
1980         uint64_t dir_obj, child_dir_zapobj, zap_type;
1981         dnode_phys_t child_dir_zap, dir, dataset;
1982         dsl_dataset_phys_t *ds;
1983         dsl_dir_phys_t *dd;
1984         int err;
1985
1986         err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
1987         if (err != 0) {
1988                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1989                 return (err);
1990         }
1991         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1992         dir_obj = ds->ds_dir_obj;
1993
1994         err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
1995         if (err != 0) {
1996                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1997                 return (err);
1998         }
1999         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2000
2001         child_dir_zapobj = dd->dd_child_dir_zapobj;
2002         err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
2003         if (err != 0) {
2004                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2005                 return (err);
2006         }
2007
2008         err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
2009         if (err != 0)
2010                 return (err);
2011
2012         zap_type = *(uint64_t *) zap_scratch;
2013         if (zap_type == ZBT_MICRO)
2014                 return mzap_list(&child_dir_zap, callback);
2015         else
2016                 return fzap_list(spa, &child_dir_zap, callback);
2017 }
2018 #endif
2019
2020 /*
2021  * Find the object set given the object number of its dataset object
2022  * and return its details in *objset
2023  */
2024 static int
2025 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2026 {
2027         dnode_phys_t dataset;
2028         dsl_dataset_phys_t *ds;
2029
2030         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2031                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2032                 return (EIO);
2033         }
2034
2035         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2036         if (zio_read(spa, &ds->ds_bp, objset)) {
2037                 printf("ZFS: can't read object set for dataset %ju\n",
2038                     (uintmax_t)objnum);
2039                 return (EIO);
2040         }
2041
2042         return (0);
2043 }
2044
2045 /*
2046  * Find the object set pointed to by the BOOTFS property or the root
2047  * dataset if there is none and return its details in *objset
2048  */
2049 static int
2050 zfs_get_root(const spa_t *spa, uint64_t *objid)
2051 {
2052         dnode_phys_t dir, propdir;
2053         uint64_t props, bootfs, root;
2054
2055         *objid = 0;
2056
2057         /*
2058          * Start with the MOS directory object.
2059          */
2060         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2061                 printf("ZFS: can't read MOS object directory\n");
2062                 return (EIO);
2063         }
2064
2065         /*
2066          * Lookup the pool_props and see if we can find a bootfs.
2067          */
2068         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0
2069              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
2070              && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0
2071              && bootfs != 0)
2072         {
2073                 *objid = bootfs;
2074                 return (0);
2075         }
2076         /*
2077          * Lookup the root dataset directory
2078          */
2079         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root)
2080             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2081                 printf("ZFS: can't find root dsl_dir\n");
2082                 return (EIO);
2083         }
2084
2085         /*
2086          * Use the information from the dataset directory's bonus buffer
2087          * to find the dataset object and from that the object set itself.
2088          */
2089         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
2090         *objid = dd->dd_head_dataset_obj;
2091         return (0);
2092 }
2093
2094 static int
2095 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
2096 {
2097
2098         mount->spa = spa;
2099
2100         /*
2101          * Find the root object set if not explicitly provided
2102          */
2103         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
2104                 printf("ZFS: can't find root filesystem\n");
2105                 return (EIO);
2106         }
2107
2108         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2109                 printf("ZFS: can't open root filesystem\n");
2110                 return (EIO);
2111         }
2112
2113         mount->rootobj = rootobj;
2114
2115         return (0);
2116 }
2117
2118 /*
2119  * callback function for feature name checks.
2120  */
2121 static int
2122 check_feature(const char *name, uint64_t value)
2123 {
2124         int i;
2125
2126         if (value == 0)
2127                 return (0);
2128         if (name[0] == '\0')
2129                 return (0);
2130
2131         for (i = 0; features_for_read[i] != NULL; i++) {
2132                 if (strcmp(name, features_for_read[i]) == 0)
2133                         return (0);
2134         }
2135         printf("ZFS: unsupported feature: %s\n", name);
2136         return (EIO);
2137 }
2138
2139 /*
2140  * Checks whether the MOS features that are active are supported.
2141  */
2142 static int
2143 check_mos_features(const spa_t *spa)
2144 {
2145         dnode_phys_t dir;
2146         uint64_t objnum, zap_type;
2147         size_t size;
2148         int rc;
2149
2150         if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
2151             &dir)) != 0)
2152                 return (rc);
2153         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
2154             sizeof (objnum), 1, &objnum)) != 0) {
2155                 /*
2156                  * It is older pool without features. As we have already
2157                  * tested the label, just return without raising the error.
2158                  */
2159                 return (0);
2160         }
2161
2162         if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
2163                 return (rc);
2164
2165         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
2166                 return (EIO);
2167
2168         size = dir.dn_datablkszsec * 512;
2169         if (dnode_read(spa, &dir, 0, zap_scratch, size))
2170                 return (EIO);
2171
2172         zap_type = *(uint64_t *) zap_scratch;
2173         if (zap_type == ZBT_MICRO)
2174                 rc = mzap_list(&dir, check_feature);
2175         else
2176                 rc = fzap_list(spa, &dir, check_feature);
2177
2178         return (rc);
2179 }
2180
2181 static int
2182 zfs_spa_init(spa_t *spa)
2183 {
2184         dnode_phys_t dir;
2185         int rc;
2186
2187         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2188                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2189                 return (EIO);
2190         }
2191         if (spa->spa_mos.os_type != DMU_OST_META) {
2192                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2193                 return (EIO);
2194         }
2195
2196         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2197             &dir)) {
2198                 printf("ZFS: failed to read pool %s directory object\n",
2199                     spa->spa_name);
2200                 return (EIO);
2201         }
2202         /* this is allowed to fail, older pools do not have salt */
2203         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
2204             sizeof (spa->spa_cksum_salt.zcs_bytes),
2205             spa->spa_cksum_salt.zcs_bytes);
2206
2207         rc = check_mos_features(spa);
2208         if (rc != 0) {
2209                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
2210         }
2211
2212         return (rc);
2213 }
2214
2215 static int
2216 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2217 {
2218
2219         if (dn->dn_bonustype != DMU_OT_SA) {
2220                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2221
2222                 sb->st_mode = zp->zp_mode;
2223                 sb->st_uid = zp->zp_uid;
2224                 sb->st_gid = zp->zp_gid;
2225                 sb->st_size = zp->zp_size;
2226         } else {
2227                 sa_hdr_phys_t *sahdrp;
2228                 int hdrsize;
2229                 size_t size = 0;
2230                 void *buf = NULL;
2231
2232                 if (dn->dn_bonuslen != 0)
2233                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2234                 else {
2235                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2236                                 blkptr_t *bp = &dn->dn_spill;
2237                                 int error;
2238
2239                                 size = BP_GET_LSIZE(bp);
2240                                 buf = zfs_alloc(size);
2241                                 error = zio_read(spa, bp, buf);
2242                                 if (error != 0) {
2243                                         zfs_free(buf, size);
2244                                         return (error);
2245                                 }
2246                                 sahdrp = buf;
2247                         } else {
2248                                 return (EIO);
2249                         }
2250                 }
2251                 hdrsize = SA_HDR_SIZE(sahdrp);
2252                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2253                     SA_MODE_OFFSET);
2254                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2255                     SA_UID_OFFSET);
2256                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2257                     SA_GID_OFFSET);
2258                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2259                     SA_SIZE_OFFSET);
2260                 if (buf != NULL)
2261                         zfs_free(buf, size);
2262         }
2263
2264         return (0);
2265 }
2266
2267 /*
2268  * Lookup a file and return its dnode.
2269  */
2270 static int
2271 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2272 {
2273         int rc;
2274         uint64_t objnum, rootnum, parentnum;
2275         const spa_t *spa;
2276         dnode_phys_t dn;
2277         const char *p, *q;
2278         char element[256];
2279         char path[1024];
2280         int symlinks_followed = 0;
2281         struct stat sb;
2282
2283         spa = mount->spa;
2284         if (mount->objset.os_type != DMU_OST_ZFS) {
2285                 printf("ZFS: unexpected object set type %ju\n",
2286                     (uintmax_t)mount->objset.os_type);
2287                 return (EIO);
2288         }
2289
2290         /*
2291          * Get the root directory dnode.
2292          */
2293         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2294         if (rc)
2295                 return (rc);
2296
2297         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (rootnum), 1, &rootnum);
2298         if (rc)
2299                 return (rc);
2300
2301         rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn);
2302         if (rc)
2303                 return (rc);
2304
2305         objnum = rootnum;
2306         p = upath;
2307         while (p && *p) {
2308                 while (*p == '/')
2309                         p++;
2310                 if (!*p)
2311                         break;
2312                 q = strchr(p, '/');
2313                 if (q) {
2314                         memcpy(element, p, q - p);
2315                         element[q - p] = 0;
2316                         p = q;
2317                 } else {
2318                         strcpy(element, p);
2319                         p = 0;
2320                 }
2321
2322                 rc = zfs_dnode_stat(spa, &dn, &sb);
2323                 if (rc)
2324                         return (rc);
2325                 if (!S_ISDIR(sb.st_mode))
2326                         return (ENOTDIR);
2327
2328                 parentnum = objnum;
2329                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
2330                 if (rc)
2331                         return (rc);
2332                 objnum = ZFS_DIRENT_OBJ(objnum);
2333
2334                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2335                 if (rc)
2336                         return (rc);
2337
2338                 /*
2339                  * Check for symlink.
2340                  */
2341                 rc = zfs_dnode_stat(spa, &dn, &sb);
2342                 if (rc)
2343                         return (rc);
2344                 if (S_ISLNK(sb.st_mode)) {
2345                         if (symlinks_followed > 10)
2346                                 return (EMLINK);
2347                         symlinks_followed++;
2348
2349                         /*
2350                          * Read the link value and copy the tail of our
2351                          * current path onto the end.
2352                          */
2353                         if (p)
2354                                 strcpy(&path[sb.st_size], p);
2355                         else
2356                                 path[sb.st_size] = 0;
2357                         /*
2358                          * Second test is purely to silence bogus compiler
2359                          * warning about accessing past the end of dn_bonus.
2360                          */
2361                         if (sb.st_size + sizeof(znode_phys_t) <=
2362                             dn.dn_bonuslen && sizeof(znode_phys_t) <=
2363                             sizeof(dn.dn_bonus)) {
2364                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
2365                                         sb.st_size);
2366                         } else {
2367                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
2368                                 if (rc)
2369                                         return (rc);
2370                         }
2371
2372                         /*
2373                          * Restart with the new path, starting either at
2374                          * the root or at the parent depending whether or
2375                          * not the link is relative.
2376                          */
2377                         p = path;
2378                         if (*p == '/')
2379                                 objnum = rootnum;
2380                         else
2381                                 objnum = parentnum;
2382                         objset_get_dnode(spa, &mount->objset, objnum, &dn);
2383                 }
2384         }
2385
2386         *dnode = dn;
2387         return (0);
2388 }