]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/boot/zfs/zfsimpl.c
MFC r307322,r307323,r307324,r307326,r307327,r307338,r307879,r307908,r307911,
[FreeBSD/FreeBSD.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/stat.h>
35 #include <sys/stdint.h>
36
37 #include "zfsimpl.h"
38 #include "zfssubr.c"
39
40
/*
 * State kept for one mounted ZFS filesystem.
 * NOTE(review): the per-field notes are inferred from the field names and
 * types only; confirm against the code that fills the structure in.
 */
struct zfsmount {
        const spa_t     *spa;           /* presumably the pool this mount lives in */
        objset_phys_t   objset;         /* presumably a copy of the dataset's objset */
        uint64_t        rootobj;        /* presumably the root directory object id */
};
46
/*
 * List of all vdevs, chained through v_alllink.
 * vdev_create() appends to it and vdev_find() searches it by guid.
 */
static vdev_list_t zfs_vdevs;

/*
 * List of ZFS features supported for read.
 * The table is NULL-terminated; nvlist_check_features_for_read() walks it
 * to decide whether a feature name found in a pool label is acceptable.
 */
static const char *features_for_read[] = {
        "org.illumos:lz4_compress",
        "com.delphix:hole_birth",
        "com.delphix:extensible_dataset",
        "com.delphix:embedded_data",
        "org.open-zfs:large_blocks",
        "org.illumos:sha512",
        "org.illumos:skein",
        NULL
};

/*
 * List of all pools, chained through spa_link.
 * spa_create() appends to it; the spa_find_by_*() helpers search it.
 */
static spa_list_t zfs_pools;
70
/*
 * Scratch state shared by the reader.
 * NOTE(review): dnode_cache_obj and dnode_cache_bn look like the key of a
 * one-entry block cache held in dnode_cache_buf -- confirm at the use sites.
 */
static const dnode_phys_t *dnode_cache_obj = NULL;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;           /* SPA_MAXBLOCKSIZE bytes, see zfs_init() */
static char *zap_scratch;               /* SPA_MAXBLOCKSIZE bytes, see zfs_init() */
/* Arena used by zfs_alloc()/zfs_free(): start, end and current top. */
static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;

/* Size in bytes of the zfs_alloc() arena. */
#define TEMP_SIZE       (1024 * 1024)

/* Forward declarations for functions used before their definition. */
static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
85
86 static void
87 zfs_init(void)
88 {
89         STAILQ_INIT(&zfs_vdevs);
90         STAILQ_INIT(&zfs_pools);
91
92         zfs_temp_buf = malloc(TEMP_SIZE);
93         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
94         zfs_temp_ptr = zfs_temp_buf;
95         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
96         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
97
98         zfs_init_crc();
99 }
100
101 static void *
102 zfs_alloc(size_t size)
103 {
104         char *ptr;
105
106         if (zfs_temp_ptr + size > zfs_temp_end) {
107                 printf("ZFS: out of temporary buffer space\n");
108                 for (;;) ;
109         }
110         ptr = zfs_temp_ptr;
111         zfs_temp_ptr += size;
112
113         return (ptr);
114 }
115
116 static void
117 zfs_free(void *ptr, size_t size)
118 {
119
120         zfs_temp_ptr -= size;
121         if (zfs_temp_ptr != ptr) {
122                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
123                 for (;;) ;
124         }
125 }
126
/*
 * Decode one big-endian 32-bit signed integer from the XDR stream.
 *
 * *xdr is advanced by four bytes and the value is stored in *ip.
 * Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
        const unsigned char *b = *xdr;
        uint32_t v;

        /*
         * Assemble the value in an unsigned type: shifting a promoted
         * (signed) int left by 24 is undefined when the byte is >= 0x80.
         */
        v = ((uint32_t)b[0] << 24)
                | ((uint32_t)b[1] << 16)
                | ((uint32_t)b[2] << 8)
                | ((uint32_t)b[3] << 0);
        *ip = (int)v;
        (*xdr) += 4;
        return (0);
}
137
/*
 * Decode one big-endian 32-bit unsigned integer from the XDR stream.
 *
 * *xdr is advanced by four bytes and the value is stored in *ip.
 * Always returns 0.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
        const unsigned char *b = *xdr;

        /*
         * Widen each byte to u_int before shifting; shifting the promoted
         * (signed) int left by 24 is undefined when the byte is >= 0x80.
         */
        *ip = ((u_int)b[0] << 24)
                | ((u_int)b[1] << 16)
                | ((u_int)b[2] << 8)
                | ((u_int)b[3] << 0);
        (*xdr) += 4;
        return (0);
}
148
149 static int
150 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
151 {
152         u_int hi, lo;
153
154         xdr_u_int(xdr, &hi);
155         xdr_u_int(xdr, &lo);
156         *lp = (((uint64_t) hi) << 32) | lo;
157         return (0);
158 }
159
160 static int
161 nvlist_find(const unsigned char *nvlist, const char *name, int type,
162             int* elementsp, void *valuep)
163 {
164         const unsigned char *p, *pair;
165         int junk;
166         int encoded_size, decoded_size;
167
168         p = nvlist;
169         xdr_int(&p, &junk);
170         xdr_int(&p, &junk);
171
172         pair = p;
173         xdr_int(&p, &encoded_size);
174         xdr_int(&p, &decoded_size);
175         while (encoded_size && decoded_size) {
176                 int namelen, pairtype, elements;
177                 const char *pairname;
178
179                 xdr_int(&p, &namelen);
180                 pairname = (const char*) p;
181                 p += roundup(namelen, 4);
182                 xdr_int(&p, &pairtype);
183
184                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
185                         xdr_int(&p, &elements);
186                         if (elementsp)
187                                 *elementsp = elements;
188                         if (type == DATA_TYPE_UINT64) {
189                                 xdr_uint64_t(&p, (uint64_t *) valuep);
190                                 return (0);
191                         } else if (type == DATA_TYPE_STRING) {
192                                 int len;
193                                 xdr_int(&p, &len);
194                                 (*(const char**) valuep) = (const char*) p;
195                                 return (0);
196                         } else if (type == DATA_TYPE_NVLIST
197                                    || type == DATA_TYPE_NVLIST_ARRAY) {
198                                 (*(const unsigned char**) valuep) =
199                                          (const unsigned char*) p;
200                                 return (0);
201                         } else {
202                                 return (EIO);
203                         }
204                 } else {
205                         /*
206                          * Not the pair we are looking for, skip to the next one.
207                          */
208                         p = pair + encoded_size;
209                 }
210
211                 pair = p;
212                 xdr_int(&p, &encoded_size);
213                 xdr_int(&p, &decoded_size);
214         }
215
216         return (EIO);
217 }
218
219 static int
220 nvlist_check_features_for_read(const unsigned char *nvlist)
221 {
222         const unsigned char *p, *pair;
223         int junk;
224         int encoded_size, decoded_size;
225         int rc;
226
227         rc = 0;
228
229         p = nvlist;
230         xdr_int(&p, &junk);
231         xdr_int(&p, &junk);
232
233         pair = p;
234         xdr_int(&p, &encoded_size);
235         xdr_int(&p, &decoded_size);
236         while (encoded_size && decoded_size) {
237                 int namelen, pairtype;
238                 const char *pairname;
239                 int i, found;
240
241                 found = 0;
242
243                 xdr_int(&p, &namelen);
244                 pairname = (const char*) p;
245                 p += roundup(namelen, 4);
246                 xdr_int(&p, &pairtype);
247
248                 for (i = 0; features_for_read[i] != NULL; i++) {
249                         if (!memcmp(pairname, features_for_read[i], namelen)) {
250                                 found = 1;
251                                 break;
252                         }
253                 }
254
255                 if (!found) {
256                         printf("ZFS: unsupported feature: %s\n", pairname);
257                         rc = EIO;
258                 }
259
260                 p = pair + encoded_size;
261
262                 pair = p;
263                 xdr_int(&p, &encoded_size);
264                 xdr_int(&p, &decoded_size);
265         }
266
267         return (rc);
268 }
269
/*
 * Return the next nvlist in an nvlist array.
 *
 * The two leading words of `nvlist' are skipped, then each pair is
 * stepped over using its encoded size until a pair whose encoded or
 * decoded size is zero is read.  The returned pointer is just past the
 * two size words of that terminating pair.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
        const unsigned char *cursor, *pair_start;
        int skipped;
        int enc_len, dec_len;

        cursor = nvlist;
        xdr_int(&cursor, &skipped);
        xdr_int(&cursor, &skipped);

        for (;;) {
                pair_start = cursor;
                xdr_int(&cursor, &enc_len);
                xdr_int(&cursor, &dec_len);
                if (enc_len == 0 || dec_len == 0)
                        break;
                cursor = pair_start + enc_len;
        }

        return (cursor);
}
297
#ifdef TEST

/*
 * Debug helper: dump an XDR-encoded nvlist with printf(), one pair per
 * line, indented by `indent' spaces.  Nested nvlists and nvlist arrays
 * are printed recursively one level deeper.  Values are printed only for
 * UINT64 and STRING pairs.
 *
 * Returns a pointer just past the size words of the terminating pair,
 * i.e. the same position nvlist_next() would return.
 */
static const unsigned char *
nvlist_print(const unsigned char *nvlist, unsigned int indent)
{
        static const char *typenames[] = {
                "DATA_TYPE_UNKNOWN",
                "DATA_TYPE_BOOLEAN",
                "DATA_TYPE_BYTE",
                "DATA_TYPE_INT16",
                "DATA_TYPE_UINT16",
                "DATA_TYPE_INT32",
                "DATA_TYPE_UINT32",
                "DATA_TYPE_INT64",
                "DATA_TYPE_UINT64",
                "DATA_TYPE_STRING",
                "DATA_TYPE_BYTE_ARRAY",
                "DATA_TYPE_INT16_ARRAY",
                "DATA_TYPE_UINT16_ARRAY",
                "DATA_TYPE_INT32_ARRAY",
                "DATA_TYPE_UINT32_ARRAY",
                "DATA_TYPE_INT64_ARRAY",
                "DATA_TYPE_UINT64_ARRAY",
                "DATA_TYPE_STRING_ARRAY",
                "DATA_TYPE_HRTIME",
                "DATA_TYPE_NVLIST",
                "DATA_TYPE_NVLIST_ARRAY",
                "DATA_TYPE_BOOLEAN_VALUE",
                "DATA_TYPE_INT8",
                "DATA_TYPE_UINT8",
                "DATA_TYPE_BOOLEAN_ARRAY",
                "DATA_TYPE_INT8_ARRAY",
                "DATA_TYPE_UINT8_ARRAY"
        };

        unsigned int i, j, nelem;
        const unsigned char *p, *pair;
        int junk;
        int encoded_size, decoded_size;

        p = nvlist;
        xdr_int(&p, &junk);
        xdr_int(&p, &junk);

        pair = p;
        xdr_int(&p, &encoded_size);
        xdr_int(&p, &decoded_size);
        while (encoded_size && decoded_size) {
                int namelen, pairtype, elements;
                const char *pairname;
                const char *tname;

                xdr_int(&p, &namelen);
                pairname = (const char *) p;
                p += roundup(namelen, 4);
                xdr_int(&p, &pairtype);

                /*
                 * The type comes from the buffer being dumped; do not
                 * index the table with a value outside its bounds.
                 */
                if (pairtype >= 0 && (size_t)pairtype <
                    sizeof(typenames) / sizeof(typenames[0]))
                        tname = typenames[pairtype];
                else
                        tname = typenames[0];

                for (i = 0; i < indent; i++)
                        printf(" ");
                printf("%s %s", tname, pairname);

                xdr_int(&p, &elements);
                switch (pairtype) {
                case DATA_TYPE_UINT64: {
                        uint64_t val;
                        xdr_uint64_t(&p, &val);
                        printf(" = 0x%jx\n", (uintmax_t)val);
                        break;
                }

                case DATA_TYPE_STRING: {
                        int len;
                        xdr_int(&p, &len);
                        printf(" = \"%s\"\n", p);
                        break;
                }

                case DATA_TYPE_NVLIST:
                        printf("\n");
                        nvlist_print(p, indent + 1);
                        break;

                case DATA_TYPE_NVLIST_ARRAY:
                        /*
                         * A negative count would turn into a huge
                         * unsigned bound; treat it as empty.
                         */
                        nelem = elements > 0 ? (unsigned int)elements : 0;
                        for (j = 0; j < nelem; j++) {
                                printf("[%u]\n", j);
                                p = nvlist_print(p, indent + 1);
                                if (j + 1 != nelem) {
                                        for (i = 0; i < indent; i++)
                                                printf(" ");
                                        printf("%s %s", tname, pairname);
                                }
                        }
                        break;

                default:
                        printf("\n");
                }

                p = pair + encoded_size;

                pair = p;
                xdr_int(&p, &encoded_size);
                xdr_int(&p, &decoded_size);
        }

        return p;
}

#endif
406
407 static int
408 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
409     off_t offset, size_t size)
410 {
411         size_t psize;
412         int rc;
413
414         if (!vdev->v_phys_read)
415                 return (EIO);
416
417         if (bp) {
418                 psize = BP_GET_PSIZE(bp);
419         } else {
420                 psize = size;
421         }
422
423         /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
424         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
425         if (rc)
426                 return (rc);
427         if (bp && zio_checksum_verify(vdev->spa, bp, buf))
428                 return (EIO);
429
430         return (0);
431 }
432
433 static int
434 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
435     off_t offset, size_t bytes)
436 {
437
438         return (vdev_read_phys(vdev, bp, buf,
439                 offset + VDEV_LABEL_START_SIZE, bytes));
440 }
441
442
443 static int
444 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
445     off_t offset, size_t bytes)
446 {
447         vdev_t *kid;
448         int rc;
449
450         rc = EIO;
451         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
452                 if (kid->v_state != VDEV_STATE_HEALTHY)
453                         continue;
454                 rc = kid->v_read(kid, bp, buf, offset, bytes);
455                 if (!rc)
456                         return (0);
457         }
458
459         return (rc);
460 }
461
462 static int
463 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
464     off_t offset, size_t bytes)
465 {
466         vdev_t *kid;
467
468         /*
469          * Here we should have two kids:
470          * First one which is the one we are replacing and we can trust
471          * only this one to have valid data, but it might not be present.
472          * Second one is that one we are replacing with. It is most likely
473          * healthy, but we can't trust it has needed data, so we won't use it.
474          */
475         kid = STAILQ_FIRST(&vdev->v_children);
476         if (kid == NULL)
477                 return (EIO);
478         if (kid->v_state != VDEV_STATE_HEALTHY)
479                 return (EIO);
480         return (kid->v_read(kid, bp, buf, offset, bytes));
481 }
482
483 static vdev_t *
484 vdev_find(uint64_t guid)
485 {
486         vdev_t *vdev;
487
488         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
489                 if (vdev->v_guid == guid)
490                         return (vdev);
491
492         return (0);
493 }
494
495 static vdev_t *
496 vdev_create(uint64_t guid, vdev_read_t *_read)
497 {
498         vdev_t *vdev;
499
500         vdev = malloc(sizeof(vdev_t));
501         memset(vdev, 0, sizeof(vdev_t));
502         STAILQ_INIT(&vdev->v_children);
503         vdev->v_guid = guid;
504         vdev->v_state = VDEV_STATE_OFFLINE;
505         vdev->v_read = _read;
506         vdev->v_phys_read = 0;
507         vdev->v_read_priv = 0;
508         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
509
510         return (vdev);
511 }
512
513 static int
514 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
515     vdev_t **vdevp, int is_newer)
516 {
517         int rc;
518         uint64_t guid, id, ashift, nparity;
519         const char *type;
520         const char *path;
521         vdev_t *vdev, *kid;
522         const unsigned char *kids;
523         int nkids, i, is_new;
524         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
525
526         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
527                         DATA_TYPE_UINT64, 0, &guid)
528             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
529                            DATA_TYPE_UINT64, 0, &id)
530             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
531                            DATA_TYPE_STRING, 0, &type)) {
532                 printf("ZFS: can't find vdev details\n");
533                 return (ENOENT);
534         }
535
536         if (strcmp(type, VDEV_TYPE_MIRROR)
537             && strcmp(type, VDEV_TYPE_DISK)
538 #ifdef ZFS_TEST
539             && strcmp(type, VDEV_TYPE_FILE)
540 #endif
541             && strcmp(type, VDEV_TYPE_RAIDZ)
542             && strcmp(type, VDEV_TYPE_REPLACING)) {
543                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
544                 return (EIO);
545         }
546
547         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
548
549         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
550                         &is_offline);
551         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
552                         &is_removed);
553         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
554                         &is_faulted);
555         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
556                         &is_degraded);
557         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
558                         &isnt_present);
559
560         vdev = vdev_find(guid);
561         if (!vdev) {
562                 is_new = 1;
563
564                 if (!strcmp(type, VDEV_TYPE_MIRROR))
565                         vdev = vdev_create(guid, vdev_mirror_read);
566                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
567                         vdev = vdev_create(guid, vdev_raidz_read);
568                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
569                         vdev = vdev_create(guid, vdev_replacing_read);
570                 else
571                         vdev = vdev_create(guid, vdev_disk_read);
572
573                 vdev->v_id = id;
574                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
575                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
576                         DATA_TYPE_UINT64, 0, &ashift) == 0)
577                         vdev->v_ashift = ashift;
578                 else
579                         vdev->v_ashift = 0;
580                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
581                         DATA_TYPE_UINT64, 0, &nparity) == 0)
582                         vdev->v_nparity = nparity;
583                 else
584                         vdev->v_nparity = 0;
585                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
586                                 DATA_TYPE_STRING, 0, &path) == 0) {
587                         if (strncmp(path, "/dev/", 5) == 0)
588                                 path += 5;
589                         vdev->v_name = strdup(path);
590                 } else {
591                         if (!strcmp(type, "raidz")) {
592                                 if (vdev->v_nparity == 1)
593                                         vdev->v_name = "raidz1";
594                                 else if (vdev->v_nparity == 2)
595                                         vdev->v_name = "raidz2";
596                                 else if (vdev->v_nparity == 3)
597                                         vdev->v_name = "raidz3";
598                                 else {
599                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
600                                         return (EIO);
601                                 }
602                         } else {
603                                 vdev->v_name = strdup(type);
604                         }
605                 }
606         } else {
607                 is_new = 0;
608         }
609
610         if (is_new || is_newer) {
611                 /*
612                  * This is either new vdev or we've already seen this vdev,
613                  * but from an older vdev label, so let's refresh its state
614                  * from the newer label.
615                  */
616                 if (is_offline)
617                         vdev->v_state = VDEV_STATE_OFFLINE;
618                 else if (is_removed)
619                         vdev->v_state = VDEV_STATE_REMOVED;
620                 else if (is_faulted)
621                         vdev->v_state = VDEV_STATE_FAULTED;
622                 else if (is_degraded)
623                         vdev->v_state = VDEV_STATE_DEGRADED;
624                 else if (isnt_present)
625                         vdev->v_state = VDEV_STATE_CANT_OPEN;
626         }
627
628         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
629                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
630         /*
631          * Its ok if we don't have any kids.
632          */
633         if (rc == 0) {
634                 vdev->v_nchildren = nkids;
635                 for (i = 0; i < nkids; i++) {
636                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
637                         if (rc)
638                                 return (rc);
639                         if (is_new)
640                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
641                                                    v_childlink);
642                         kids = nvlist_next(kids);
643                 }
644         } else {
645                 vdev->v_nchildren = 0;
646         }
647
648         if (vdevp)
649                 *vdevp = vdev;
650         return (0);
651 }
652
653 static void
654 vdev_set_state(vdev_t *vdev)
655 {
656         vdev_t *kid;
657         int good_kids;
658         int bad_kids;
659
660         /*
661          * A mirror or raidz is healthy if all its kids are healthy. A
662          * mirror is degraded if any of its kids is healthy; a raidz
663          * is degraded if at most nparity kids are offline.
664          */
665         if (STAILQ_FIRST(&vdev->v_children)) {
666                 good_kids = 0;
667                 bad_kids = 0;
668                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
669                         if (kid->v_state == VDEV_STATE_HEALTHY)
670                                 good_kids++;
671                         else
672                                 bad_kids++;
673                 }
674                 if (bad_kids == 0) {
675                         vdev->v_state = VDEV_STATE_HEALTHY;
676                 } else {
677                         if (vdev->v_read == vdev_mirror_read) {
678                                 if (good_kids) {
679                                         vdev->v_state = VDEV_STATE_DEGRADED;
680                                 } else {
681                                         vdev->v_state = VDEV_STATE_OFFLINE;
682                                 }
683                         } else if (vdev->v_read == vdev_raidz_read) {
684                                 if (bad_kids > vdev->v_nparity) {
685                                         vdev->v_state = VDEV_STATE_OFFLINE;
686                                 } else {
687                                         vdev->v_state = VDEV_STATE_DEGRADED;
688                                 }
689                         }
690                 }
691         }
692 }
693
694 static spa_t *
695 spa_find_by_guid(uint64_t guid)
696 {
697         spa_t *spa;
698
699         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
700                 if (spa->spa_guid == guid)
701                         return (spa);
702
703         return (0);
704 }
705
706 static spa_t *
707 spa_find_by_name(const char *name)
708 {
709         spa_t *spa;
710
711         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
712                 if (!strcmp(spa->spa_name, name))
713                         return (spa);
714
715         return (0);
716 }
717
718 #ifdef BOOT2
719 static spa_t *
720 spa_get_primary(void)
721 {
722
723         return (STAILQ_FIRST(&zfs_pools));
724 }
725
726 static vdev_t *
727 spa_get_primary_vdev(const spa_t *spa)
728 {
729         vdev_t *vdev;
730         vdev_t *kid;
731
732         if (spa == NULL)
733                 spa = spa_get_primary();
734         if (spa == NULL)
735                 return (NULL);
736         vdev = STAILQ_FIRST(&spa->spa_vdevs);
737         if (vdev == NULL)
738                 return (NULL);
739         for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
740              kid = STAILQ_FIRST(&vdev->v_children))
741                 vdev = kid;
742         return (vdev);
743 }
744 #endif
745
746 static spa_t *
747 spa_create(uint64_t guid)
748 {
749         spa_t *spa;
750
751         spa = malloc(sizeof(spa_t));
752         memset(spa, 0, sizeof(spa_t));
753         STAILQ_INIT(&spa->spa_vdevs);
754         spa->spa_guid = guid;
755         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
756
757         return (spa);
758 }
759
760 static const char *
761 state_name(vdev_state_t state)
762 {
763         static const char* names[] = {
764                 "UNKNOWN",
765                 "CLOSED",
766                 "OFFLINE",
767                 "REMOVED",
768                 "CANT_OPEN",
769                 "FAULTED",
770                 "DEGRADED",
771                 "ONLINE"
772         };
773         return names[state];
774 }
775
776 #ifdef BOOT2
777
778 #define pager_printf printf
779
780 #else
781
/*
 * printf-style front end to pager_output().
 *
 * The text is formatted into a fixed 80-byte line; anything longer is
 * truncated rather than written past the end of the buffer.  Returns
 * whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
        char line[80];
        va_list args;

        va_start(args, fmt);
        /* Bounded: pool and vdev names can easily exceed 80 bytes. */
        vsnprintf(line, sizeof(line), fmt, args);
        va_end(args);
        return (pager_output(line));
}
793
794 #endif
795
796 #define STATUS_FORMAT   "        %s %s\n"
797
798 static int
799 print_state(int indent, const char *name, vdev_state_t state)
800 {
801         int i;
802         char buf[512];
803
804         buf[0] = 0;
805         for (i = 0; i < indent; i++)
806                 strcat(buf, "  ");
807         strcat(buf, name);
808         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
809         
810 }
811
812 static int
813 vdev_status(vdev_t *vdev, int indent)
814 {
815         vdev_t *kid;
816         int ret;
817         ret = print_state(indent, vdev->v_name, vdev->v_state);
818         if (ret != 0)
819                 return (ret);
820
821         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
822                 ret = vdev_status(kid, indent + 1);
823                 if (ret != 0)
824                         return (ret);
825         }
826         return (ret);
827 }
828
829 static int
830 spa_status(spa_t *spa)
831 {
832         static char bootfs[ZFS_MAXNAMELEN];
833         uint64_t rootid;
834         vdev_t *vdev;
835         int good_kids, bad_kids, degraded_kids, ret;
836         vdev_state_t state;
837
838         ret = pager_printf("  pool: %s\n", spa->spa_name);
839         if (ret != 0)
840                 return (ret);
841
842         if (zfs_get_root(spa, &rootid) == 0 &&
843             zfs_rlookup(spa, rootid, bootfs) == 0) {
844                 if (bootfs[0] == '\0')
845                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
846                 else
847                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
848                             bootfs);
849                 if (ret != 0)
850                         return (ret);
851         }
852         ret = pager_printf("config:\n\n");
853         if (ret != 0)
854                 return (ret);
855         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
856         if (ret != 0)
857                 return (ret);
858
859         good_kids = 0;
860         degraded_kids = 0;
861         bad_kids = 0;
862         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
863                 if (vdev->v_state == VDEV_STATE_HEALTHY)
864                         good_kids++;
865                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
866                         degraded_kids++;
867                 else
868                         bad_kids++;
869         }
870
871         state = VDEV_STATE_CLOSED;
872         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
873                 state = VDEV_STATE_HEALTHY;
874         else if ((good_kids + degraded_kids) > 0)
875                 state = VDEV_STATE_DEGRADED;
876
877         ret = print_state(0, spa->spa_name, state);
878         if (ret != 0)
879                 return (ret);
880         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
881                 ret = vdev_status(vdev, 1);
882                 if (ret != 0)
883                         return (ret);
884         }
885         return (ret);
886 }
887
888 static int
889 spa_all_status(void)
890 {
891         spa_t *spa;
892         int first = 1, ret = 0;
893
894         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
895                 if (!first) {
896                         ret = pager_printf("\n");
897                         if (ret != 0)
898                                 return (ret);
899                 }
900                 first = 0;
901                 ret = spa_status(spa);
902                 if (ret != 0)
903                         return (ret);
904         }
905         return (ret);
906 }
907
908 static int
909 vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
910 {
911         vdev_t vtmp;
912         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
913         spa_t *spa;
914         vdev_t *vdev, *top_vdev, *pool_vdev;
915         off_t off;
916         blkptr_t bp;
917         const unsigned char *nvlist;
918         uint64_t val;
919         uint64_t guid;
920         uint64_t pool_txg, pool_guid;
921         uint64_t is_log;
922         const char *pool_name;
923         const unsigned char *vdevs;
924         const unsigned char *features;
925         int i, rc, is_newer;
926         char *upbuf;
927         const struct uberblock *up;
928
929         /*
930          * Load the vdev label and figure out which
931          * uberblock is most current.
932          */
933         memset(&vtmp, 0, sizeof(vtmp));
934         vtmp.v_phys_read = _read;
935         vtmp.v_read_priv = read_priv;
936         off = offsetof(vdev_label_t, vl_vdev_phys);
937         BP_ZERO(&bp);
938         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
939         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
940         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
941         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
942         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
943         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
944         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
945                 return (EIO);
946
947         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
948                 return (EIO);
949         }
950
951         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
952
953         if (nvlist_find(nvlist,
954                         ZPOOL_CONFIG_VERSION,
955                         DATA_TYPE_UINT64, 0, &val)) {
956                 return (EIO);
957         }
958
959         if (!SPA_VERSION_IS_SUPPORTED(val)) {
960                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
961                     (unsigned) val, (unsigned) SPA_VERSION);
962                 return (EIO);
963         }
964
965         /* Check ZFS features for read */
966         if (nvlist_find(nvlist,
967                         ZPOOL_CONFIG_FEATURES_FOR_READ,
968                         DATA_TYPE_NVLIST, 0, &features) == 0
969             && nvlist_check_features_for_read(features) != 0)
970                 return (EIO);
971
972         if (nvlist_find(nvlist,
973                         ZPOOL_CONFIG_POOL_STATE,
974                         DATA_TYPE_UINT64, 0, &val)) {
975                 return (EIO);
976         }
977
978         if (val == POOL_STATE_DESTROYED) {
979                 /* We don't boot only from destroyed pools. */
980                 return (EIO);
981         }
982
983         if (nvlist_find(nvlist,
984                         ZPOOL_CONFIG_POOL_TXG,
985                         DATA_TYPE_UINT64, 0, &pool_txg)
986             || nvlist_find(nvlist,
987                            ZPOOL_CONFIG_POOL_GUID,
988                            DATA_TYPE_UINT64, 0, &pool_guid)
989             || nvlist_find(nvlist,
990                            ZPOOL_CONFIG_POOL_NAME,
991                            DATA_TYPE_STRING, 0, &pool_name)) {
992                 /*
993                  * Cache and spare devices end up here - just ignore
994                  * them.
995                  */
996                 /*printf("ZFS: can't find pool details\n");*/
997                 return (EIO);
998         }
999
1000         is_log = 0;
1001         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
1002             &is_log);
1003         if (is_log)
1004                 return (EIO);
1005
1006         /*
1007          * Create the pool if this is the first time we've seen it.
1008          */
1009         spa = spa_find_by_guid(pool_guid);
1010         if (!spa) {
1011                 spa = spa_create(pool_guid);
1012                 spa->spa_name = strdup(pool_name);
1013         }
1014         if (pool_txg > spa->spa_txg) {
1015                 spa->spa_txg = pool_txg;
1016                 is_newer = 1;
1017         } else
1018                 is_newer = 0;
1019
1020         /*
1021          * Get the vdev tree and create our in-core copy of it.
1022          * If we already have a vdev with this guid, this must
1023          * be some kind of alias (overlapping slices, dangerously dedicated
1024          * disks etc).
1025          */
1026         if (nvlist_find(nvlist,
1027                         ZPOOL_CONFIG_GUID,
1028                         DATA_TYPE_UINT64, 0, &guid)) {
1029                 return (EIO);
1030         }
1031         vdev = vdev_find(guid);
1032         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
1033                 return (EIO);
1034
1035         if (nvlist_find(nvlist,
1036                         ZPOOL_CONFIG_VDEV_TREE,
1037                         DATA_TYPE_NVLIST, 0, &vdevs)) {
1038                 return (EIO);
1039         }
1040
1041         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1042         if (rc)
1043                 return (rc);
1044
1045         /*
1046          * Add the toplevel vdev to the pool if its not already there.
1047          */
1048         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1049                 if (top_vdev == pool_vdev)
1050                         break;
1051         if (!pool_vdev && top_vdev) {
1052                 top_vdev->spa = spa;
1053                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1054         }
1055
1056         /*
1057          * We should already have created an incomplete vdev for this
1058          * vdev. Find it and initialise it with our read proc.
1059          */
1060         vdev = vdev_find(guid);
1061         if (vdev) {
1062                 vdev->v_phys_read = _read;
1063                 vdev->v_read_priv = read_priv;
1064                 vdev->v_state = VDEV_STATE_HEALTHY;
1065         } else {
1066                 printf("ZFS: inconsistent nvlist contents\n");
1067                 return (EIO);
1068         }
1069
1070         /*
1071          * Re-evaluate top-level vdev state.
1072          */
1073         vdev_set_state(top_vdev);
1074
1075         /*
1076          * Ok, we are happy with the pool so far. Lets find
1077          * the best uberblock and then we can actually access
1078          * the contents of the pool.
1079          */
1080         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1081         up = (const struct uberblock *)upbuf;
1082         for (i = 0;
1083              i < VDEV_UBERBLOCK_COUNT(vdev);
1084              i++) {
1085                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
1086                 BP_ZERO(&bp);
1087                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
1088                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1089                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1090                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1091                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1092                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1093
1094                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1095                         continue;
1096
1097                 if (up->ub_magic != UBERBLOCK_MAGIC)
1098                         continue;
1099                 if (up->ub_txg < spa->spa_txg)
1100                         continue;
1101                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
1102                         spa->spa_uberblock = *up;
1103                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
1104                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
1105                                 spa->spa_uberblock = *up;
1106                 }
1107         }
1108         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1109
1110         vdev->spa = spa;
1111         if (spap)
1112                 *spap = spa;
1113         return (0);
1114 }
1115
/*
 * Return the base-2 logarithm of n when n has exactly one of its low 32
 * bits set (i.e. its bit pattern equals 1 << v for some v in [0, 31]);
 * return -1 for every other value, including 0.
 */
static int
ilog2(int n)
{
	unsigned int bits = (unsigned int)n;
	int v;

	/*
	 * Compare in unsigned arithmetic: shifting a signed 1 into the sign
	 * bit (1 << 31) is undefined behaviour, whereas 1U << 31 is well
	 * defined.
	 */
	for (v = 0; v < 32; v++)
		if (bits == (1U << v))
			return (v);
	return (-1);
}
1126
1127 static int
1128 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1129 {
1130         blkptr_t gbh_bp;
1131         zio_gbh_phys_t zio_gb;
1132         char *pbuf;
1133         int i;
1134
1135         /* Artificial BP for gang block header. */
1136         gbh_bp = *bp;
1137         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1138         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1139         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1140         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1141         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1142                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1143
1144         /* Read gang header block using the artificial BP. */
1145         if (zio_read(spa, &gbh_bp, &zio_gb))
1146                 return (EIO);
1147
1148         pbuf = buf;
1149         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1150                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1151
1152                 if (BP_IS_HOLE(gbp))
1153                         continue;
1154                 if (zio_read(spa, gbp, pbuf))
1155                         return (EIO);
1156                 pbuf += BP_GET_PSIZE(gbp);
1157         }
1158
1159         if (zio_checksum_verify(spa, bp, buf))
1160                 return (EIO);
1161         return (0);
1162 }
1163
1164 static int
1165 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1166 {
1167         int cpfunc = BP_GET_COMPRESS(bp);
1168         uint64_t align, size;
1169         void *pbuf;
1170         int i, error;
1171
1172         /*
1173          * Process data embedded in block pointer
1174          */
1175         if (BP_IS_EMBEDDED(bp)) {
1176                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1177
1178                 size = BPE_GET_PSIZE(bp);
1179                 ASSERT(size <= BPE_PAYLOAD_SIZE);
1180
1181                 if (cpfunc != ZIO_COMPRESS_OFF)
1182                         pbuf = zfs_alloc(size);
1183                 else
1184                         pbuf = buf;
1185
1186                 decode_embedded_bp_compressed(bp, pbuf);
1187                 error = 0;
1188
1189                 if (cpfunc != ZIO_COMPRESS_OFF) {
1190                         error = zio_decompress_data(cpfunc, pbuf,
1191                             size, buf, BP_GET_LSIZE(bp));
1192                         zfs_free(pbuf, size);
1193                 }
1194                 if (error != 0)
1195                         printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1196                             error);
1197                 return (error);
1198         }
1199
1200         error = EIO;
1201
1202         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1203                 const dva_t *dva = &bp->blk_dva[i];
1204                 vdev_t *vdev;
1205                 int vdevid;
1206                 off_t offset;
1207
1208                 if (!dva->dva_word[0] && !dva->dva_word[1])
1209                         continue;
1210
1211                 vdevid = DVA_GET_VDEV(dva);
1212                 offset = DVA_GET_OFFSET(dva);
1213                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1214                         if (vdev->v_id == vdevid)
1215                                 break;
1216                 }
1217                 if (!vdev || !vdev->v_read)
1218                         continue;
1219
1220                 size = BP_GET_PSIZE(bp);
1221                 if (vdev->v_read == vdev_raidz_read) {
1222                         align = 1ULL << vdev->v_top->v_ashift;
1223                         if (P2PHASE(size, align) != 0)
1224                                 size = P2ROUNDUP(size, align);
1225                 }
1226                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1227                         pbuf = zfs_alloc(size);
1228                 else
1229                         pbuf = buf;
1230
1231                 if (DVA_GET_GANG(dva))
1232                         error = zio_read_gang(spa, bp, pbuf);
1233                 else
1234                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1235                 if (error == 0) {
1236                         if (cpfunc != ZIO_COMPRESS_OFF)
1237                                 error = zio_decompress_data(cpfunc, pbuf,
1238                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1239                         else if (size != BP_GET_PSIZE(bp))
1240                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1241                 }
1242                 if (buf != pbuf)
1243                         zfs_free(pbuf, size);
1244                 if (error == 0)
1245                         break;
1246         }
1247         if (error != 0)
1248                 printf("ZFS: i/o error - all block copies unavailable\n");
1249         return (error);
1250 }
1251
1252 static int
1253 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1254 {
1255         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1256         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1257         int nlevels = dnode->dn_nlevels;
1258         int i, rc;
1259
1260         if (bsize > SPA_MAXBLOCKSIZE) {
1261                 printf("ZFS: I/O error - blocks larger than %llu are not "
1262                     "supported\n", SPA_MAXBLOCKSIZE);
1263                 return (EIO);
1264         }
1265
1266         /*
1267          * Note: bsize may not be a power of two here so we need to do an
1268          * actual divide rather than a bitshift.
1269          */
1270         while (buflen > 0) {
1271                 uint64_t bn = offset / bsize;
1272                 int boff = offset % bsize;
1273                 int ibn;
1274                 const blkptr_t *indbp;
1275                 blkptr_t bp;
1276
1277                 if (bn > dnode->dn_maxblkid)
1278                         return (EIO);
1279
1280                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1281                         goto cached;
1282
1283                 indbp = dnode->dn_blkptr;
1284                 for (i = 0; i < nlevels; i++) {
1285                         /*
1286                          * Copy the bp from the indirect array so that
1287                          * we can re-use the scratch buffer for multi-level
1288                          * objects.
1289                          */
1290                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1291                         ibn &= ((1 << ibshift) - 1);
1292                         bp = indbp[ibn];
1293                         if (BP_IS_HOLE(&bp)) {
1294                                 memset(dnode_cache_buf, 0, bsize);
1295                                 break;
1296                         }
1297                         rc = zio_read(spa, &bp, dnode_cache_buf);
1298                         if (rc)
1299                                 return (rc);
1300                         indbp = (const blkptr_t *) dnode_cache_buf;
1301                 }
1302                 dnode_cache_obj = dnode;
1303                 dnode_cache_bn = bn;
1304         cached:
1305
1306                 /*
1307                  * The buffer contains our data block. Copy what we
1308                  * need from it and loop.
1309                  */ 
1310                 i = bsize - boff;
1311                 if (i > buflen) i = buflen;
1312                 memcpy(buf, &dnode_cache_buf[boff], i);
1313                 buf = ((char*) buf) + i;
1314                 offset += i;
1315                 buflen -= i;
1316         }
1317
1318         return (0);
1319 }
1320
1321 /*
1322  * Lookup a value in a microzap directory. Assumes that the zap
1323  * scratch buffer contains the directory contents.
1324  */
1325 static int
1326 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1327 {
1328         const mzap_phys_t *mz;
1329         const mzap_ent_phys_t *mze;
1330         size_t size;
1331         int chunks, i;
1332
1333         /*
1334          * Microzap objects use exactly one block. Read the whole
1335          * thing.
1336          */
1337         size = dnode->dn_datablkszsec * 512;
1338
1339         mz = (const mzap_phys_t *) zap_scratch;
1340         chunks = size / MZAP_ENT_LEN - 1;
1341
1342         for (i = 0; i < chunks; i++) {
1343                 mze = &mz->mz_chunk[i];
1344                 if (!strcmp(mze->mze_name, name)) {
1345                         *value = mze->mze_value;
1346                         return (0);
1347                 }
1348         }
1349
1350         return (ENOENT);
1351 }
1352
1353 /*
1354  * Compare a name with a zap leaf entry. Return non-zero if the name
1355  * matches.
1356  */
1357 static int
1358 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1359 {
1360         size_t namelen;
1361         const zap_leaf_chunk_t *nc;
1362         const char *p;
1363
1364         namelen = zc->l_entry.le_name_numints;
1365                         
1366         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1367         p = name;
1368         while (namelen > 0) {
1369                 size_t len;
1370                 len = namelen;
1371                 if (len > ZAP_LEAF_ARRAY_BYTES)
1372                         len = ZAP_LEAF_ARRAY_BYTES;
1373                 if (memcmp(p, nc->l_array.la_array, len))
1374                         return (0);
1375                 p += len;
1376                 namelen -= len;
1377                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1378         }
1379
1380         return 1;
1381 }
1382
1383 /*
1384  * Extract a uint64_t value from a zap leaf entry.
1385  */
1386 static uint64_t
1387 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1388 {
1389         const zap_leaf_chunk_t *vc;
1390         int i;
1391         uint64_t value;
1392         const uint8_t *p;
1393
1394         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1395         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1396                 value = (value << 8) | p[i];
1397         }
1398
1399         return value;
1400 }
1401
/*
 * Store value at addr as an unsigned integer that is len bytes wide,
 * truncating it to that width.  Widths other than 1, 2, 4 or 8 store
 * nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
	if (len == 1)
		*(uint8_t *)addr = (uint8_t)value;
	else if (len == 2)
		*(uint16_t *)addr = (uint16_t)value;
	else if (len == 4)
		*(uint32_t *)addr = (uint32_t)value;
	else if (len == 8)
		*(uint64_t *)addr = value;
}
1420
1421 /*
1422  * Extract a array from a zap leaf entry.
1423  */
1424 static void
1425 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
1426     uint64_t integer_size, uint64_t num_integers, void *buf)
1427 {
1428         uint64_t array_int_len = zc->l_entry.le_value_intlen;
1429         uint64_t value = 0;
1430         uint64_t *u64 = buf;
1431         char *p = buf;
1432         int len = MIN(zc->l_entry.le_value_numints, num_integers);
1433         int chunk = zc->l_entry.le_value_chunk;
1434         int byten = 0;
1435
1436         if (integer_size == 8 && len == 1) {
1437                 *u64 = fzap_leaf_value(zl, zc);
1438                 return;
1439         }
1440
1441         while (len > 0) {
1442                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
1443                 int i;
1444
1445                 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
1446                 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
1447                         value = (value << 8) | la->la_array[i];
1448                         byten++;
1449                         if (byten == array_int_len) {
1450                                 stv(integer_size, p, value);
1451                                 byten = 0;
1452                                 len--;
1453                                 if (len == 0)
1454                                         return;
1455                                 p += integer_size;
1456                         }
1457                 }
1458                 chunk = la->la_next;
1459         }
1460 }
1461
1462 /*
1463  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1464  * buffer contains the directory header.
1465  */
1466 static int
1467 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1468     uint64_t integer_size, uint64_t num_integers, void *value)
1469 {
1470         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1471         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1472         fat_zap_t z;
1473         uint64_t *ptrtbl;
1474         uint64_t hash;
1475         int rc;
1476
1477         if (zh.zap_magic != ZAP_MAGIC)
1478                 return (EIO);
1479
1480         z.zap_block_shift = ilog2(bsize);
1481         z.zap_phys = (zap_phys_t *) zap_scratch;
1482
1483         /*
1484          * Figure out where the pointer table is and read it in if necessary.
1485          */
1486         if (zh.zap_ptrtbl.zt_blk) {
1487                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1488                                zap_scratch, bsize);
1489                 if (rc)
1490                         return (rc);
1491                 ptrtbl = (uint64_t *) zap_scratch;
1492         } else {
1493                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1494         }
1495
1496         hash = zap_hash(zh.zap_salt, name);
1497
1498         zap_leaf_t zl;
1499         zl.l_bs = z.zap_block_shift;
1500
1501         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1502         zap_leaf_chunk_t *zc;
1503
1504         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1505         if (rc)
1506                 return (rc);
1507
1508         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1509
1510         /*
1511          * Make sure this chunk matches our hash.
1512          */
1513         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1514             && zl.l_phys->l_hdr.lh_prefix
1515             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1516                 return (ENOENT);
1517
1518         /*
1519          * Hash within the chunk to find our entry.
1520          */
1521         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1522         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1523         h = zl.l_phys->l_hash[h];
1524         if (h == 0xffff)
1525                 return (ENOENT);
1526         zc = &ZAP_LEAF_CHUNK(&zl, h);
1527         while (zc->l_entry.le_hash != hash) {
1528                 if (zc->l_entry.le_next == 0xffff) {
1529                         zc = NULL;
1530                         break;
1531                 }
1532                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1533         }
1534         if (fzap_name_equal(&zl, zc, name)) {
1535                 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
1536                     integer_size * num_integers)
1537                         return (E2BIG);
1538                 fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
1539                 return (0);
1540         }
1541
1542         return (ENOENT);
1543 }
1544
1545 /*
1546  * Lookup a name in a zap object and return its value as a uint64_t.
1547  */
1548 static int
1549 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1550     uint64_t integer_size, uint64_t num_integers, void *value)
1551 {
1552         int rc;
1553         uint64_t zap_type;
1554         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1555
1556         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1557         if (rc)
1558                 return (rc);
1559
1560         zap_type = *(uint64_t *) zap_scratch;
1561         if (zap_type == ZBT_MICRO)
1562                 return mzap_lookup(dnode, name, value);
1563         else if (zap_type == ZBT_HEADER) {
1564                 return fzap_lookup(spa, dnode, name, integer_size,
1565                     num_integers, value);
1566         }
1567         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1568         return (EIO);
1569 }
1570
1571 /*
1572  * List a microzap directory. Assumes that the zap scratch buffer contains
1573  * the directory contents.
1574  */
1575 static int
1576 mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1577 {
1578         const mzap_phys_t *mz;
1579         const mzap_ent_phys_t *mze;
1580         size_t size;
1581         int chunks, i, rc;
1582
1583         /*
1584          * Microzap objects use exactly one block. Read the whole
1585          * thing.
1586          */
1587         size = dnode->dn_datablkszsec * 512;
1588         mz = (const mzap_phys_t *) zap_scratch;
1589         chunks = size / MZAP_ENT_LEN - 1;
1590
1591         for (i = 0; i < chunks; i++) {
1592                 mze = &mz->mz_chunk[i];
1593                 if (mze->mze_name[0]) {
1594                         rc = callback(mze->mze_name, mze->mze_value);
1595                         if (rc != 0)
1596                                 return (rc);
1597                 }
1598         }
1599
1600         return (0);
1601 }
1602
1603 /*
1604  * List a fatzap directory. Assumes that the zap scratch buffer contains
1605  * the directory header.
1606  */
1607 static int
1608 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1609 {
1610         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1611         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1612         fat_zap_t z;
1613         int i, j, rc;
1614
1615         if (zh.zap_magic != ZAP_MAGIC)
1616                 return (EIO);
1617
1618         z.zap_block_shift = ilog2(bsize);
1619         z.zap_phys = (zap_phys_t *) zap_scratch;
1620
1621         /*
1622          * This assumes that the leaf blocks start at block 1. The
1623          * documentation isn't exactly clear on this.
1624          */
1625         zap_leaf_t zl;
1626         zl.l_bs = z.zap_block_shift;
1627         for (i = 0; i < zh.zap_num_leafs; i++) {
1628                 off_t off = (i + 1) << zl.l_bs;
1629                 char name[256], *p;
1630                 uint64_t value;
1631
1632                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1633                         return (EIO);
1634
1635                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1636
1637                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1638                         zap_leaf_chunk_t *zc, *nc;
1639                         int namelen;
1640
1641                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1642                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1643                                 continue;
1644                         namelen = zc->l_entry.le_name_numints;
1645                         if (namelen > sizeof(name))
1646                                 namelen = sizeof(name);
1647
1648                         /*
1649                          * Paste the name back together.
1650                          */
1651                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1652                         p = name;
1653                         while (namelen > 0) {
1654                                 int len;
1655                                 len = namelen;
1656                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1657                                         len = ZAP_LEAF_ARRAY_BYTES;
1658                                 memcpy(p, nc->l_array.la_array, len);
1659                                 p += len;
1660                                 namelen -= len;
1661                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1662                         }
1663
1664                         /*
1665                          * Assume the first eight bytes of the value are
1666                          * a uint64_t.
1667                          */
1668                         value = fzap_leaf_value(&zl, zc);
1669
1670                         //printf("%s 0x%jx\n", name, (uintmax_t)value);
1671                         rc = callback((const char *)name, value);
1672                         if (rc != 0)
1673                                 return (rc);
1674                 }
1675         }
1676
1677         return (0);
1678 }
1679
1680 static int zfs_printf(const char *name, uint64_t value __unused)
1681 {
1682
1683         printf("%s\n", name);
1684
1685         return (0);
1686 }
1687
1688 /*
1689  * List a zap directory.
1690  */
1691 static int
1692 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1693 {
1694         uint64_t zap_type;
1695         size_t size = dnode->dn_datablkszsec * 512;
1696
1697         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1698                 return (EIO);
1699
1700         zap_type = *(uint64_t *) zap_scratch;
1701         if (zap_type == ZBT_MICRO)
1702                 return mzap_list(dnode, zfs_printf);
1703         else
1704                 return fzap_list(spa, dnode, zfs_printf);
1705 }
1706
1707 static int
1708 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1709 {
1710         off_t offset;
1711
1712         offset = objnum * sizeof(dnode_phys_t);
1713         return dnode_read(spa, &os->os_meta_dnode, offset,
1714                 dnode, sizeof(dnode_phys_t));
1715 }
1716
1717 static int
1718 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1719 {
1720         const mzap_phys_t *mz;
1721         const mzap_ent_phys_t *mze;
1722         size_t size;
1723         int chunks, i;
1724
1725         /*
1726          * Microzap objects use exactly one block. Read the whole
1727          * thing.
1728          */
1729         size = dnode->dn_datablkszsec * 512;
1730
1731         mz = (const mzap_phys_t *) zap_scratch;
1732         chunks = size / MZAP_ENT_LEN - 1;
1733
1734         for (i = 0; i < chunks; i++) {
1735                 mze = &mz->mz_chunk[i];
1736                 if (value == mze->mze_value) {
1737                         strcpy(name, mze->mze_name);
1738                         return (0);
1739                 }
1740         }
1741
1742         return (ENOENT);
1743 }
1744
1745 static void
1746 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1747 {
1748         size_t namelen;
1749         const zap_leaf_chunk_t *nc;
1750         char *p;
1751
1752         namelen = zc->l_entry.le_name_numints;
1753
1754         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1755         p = name;
1756         while (namelen > 0) {
1757                 size_t len;
1758                 len = namelen;
1759                 if (len > ZAP_LEAF_ARRAY_BYTES)
1760                         len = ZAP_LEAF_ARRAY_BYTES;
1761                 memcpy(p, nc->l_array.la_array, len);
1762                 p += len;
1763                 namelen -= len;
1764                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1765         }
1766
1767         *p = '\0';
1768 }
1769
1770 static int
1771 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1772 {
1773         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1774         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1775         fat_zap_t z;
1776         int i, j;
1777
1778         if (zh.zap_magic != ZAP_MAGIC)
1779                 return (EIO);
1780
1781         z.zap_block_shift = ilog2(bsize);
1782         z.zap_phys = (zap_phys_t *) zap_scratch;
1783
1784         /*
1785          * This assumes that the leaf blocks start at block 1. The
1786          * documentation isn't exactly clear on this.
1787          */
1788         zap_leaf_t zl;
1789         zl.l_bs = z.zap_block_shift;
1790         for (i = 0; i < zh.zap_num_leafs; i++) {
1791                 off_t off = (i + 1) << zl.l_bs;
1792
1793                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1794                         return (EIO);
1795
1796                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1797
1798                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1799                         zap_leaf_chunk_t *zc;
1800
1801                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1802                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1803                                 continue;
1804                         if (zc->l_entry.le_value_intlen != 8 ||
1805                             zc->l_entry.le_value_numints != 1)
1806                                 continue;
1807
1808                         if (fzap_leaf_value(&zl, zc) == value) {
1809                                 fzap_name_copy(&zl, zc, name);
1810                                 return (0);
1811                         }
1812                 }
1813         }
1814
1815         return (ENOENT);
1816 }
1817
1818 static int
1819 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1820 {
1821         int rc;
1822         uint64_t zap_type;
1823         size_t size = dnode->dn_datablkszsec * 512;
1824
1825         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1826         if (rc)
1827                 return (rc);
1828
1829         zap_type = *(uint64_t *) zap_scratch;
1830         if (zap_type == ZBT_MICRO)
1831                 return mzap_rlookup(spa, dnode, name, value);
1832         else
1833                 return fzap_rlookup(spa, dnode, name, value);
1834 }
1835
1836 static int
1837 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1838 {
1839         char name[256];
1840         char component[256];
1841         uint64_t dir_obj, parent_obj, child_dir_zapobj;
1842         dnode_phys_t child_dir_zap, dataset, dir, parent;
1843         dsl_dir_phys_t *dd;
1844         dsl_dataset_phys_t *ds;
1845         char *p;
1846         int len;
1847
1848         p = &name[sizeof(name) - 1];
1849         *p = '\0';
1850
1851         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1852                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1853                 return (EIO);
1854         }
1855         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1856         dir_obj = ds->ds_dir_obj;
1857
1858         for (;;) {
1859                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1860                         return (EIO);
1861                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1862
1863                 /* Actual loop condition. */
1864                 parent_obj  = dd->dd_parent_obj;
1865                 if (parent_obj == 0)
1866                         break;
1867
1868                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1869                         return (EIO);
1870                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1871                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1872                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1873                         return (EIO);
1874                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1875                         return (EIO);
1876
1877                 len = strlen(component);
1878                 p -= len;
1879                 memcpy(p, component, len);
1880                 --p;
1881                 *p = '/';
1882
1883                 /* Actual loop iteration. */
1884                 dir_obj = parent_obj;
1885         }
1886
1887         if (*p != '\0')
1888                 ++p;
1889         strcpy(result, p);
1890
1891         return (0);
1892 }
1893
1894 static int
1895 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1896 {
1897         char element[256];
1898         uint64_t dir_obj, child_dir_zapobj;
1899         dnode_phys_t child_dir_zap, dir;
1900         dsl_dir_phys_t *dd;
1901         const char *p, *q;
1902
1903         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1904                 return (EIO);
1905         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
1906             1, &dir_obj))
1907                 return (EIO);
1908
1909         p = name;
1910         for (;;) {
1911                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1912                         return (EIO);
1913                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1914
1915                 while (*p == '/')
1916                         p++;
1917                 /* Actual loop condition #1. */
1918                 if (*p == '\0')
1919                         break;
1920
1921                 q = strchr(p, '/');
1922                 if (q) {
1923                         memcpy(element, p, q - p);
1924                         element[q - p] = '\0';
1925                         p = q + 1;
1926                 } else {
1927                         strcpy(element, p);
1928                         p += strlen(p);
1929                 }
1930
1931                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1932                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1933                         return (EIO);
1934
1935                 /* Actual loop condition #2. */
1936                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
1937                     1, &dir_obj) != 0)
1938                         return (ENOENT);
1939         }
1940
1941         *objnum = dd->dd_head_dataset_obj;
1942         return (0);
1943 }
1944
1945 #ifndef BOOT2
1946 static int
1947 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1948 {
1949         uint64_t dir_obj, child_dir_zapobj;
1950         dnode_phys_t child_dir_zap, dir, dataset;
1951         dsl_dataset_phys_t *ds;
1952         dsl_dir_phys_t *dd;
1953
1954         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1955                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1956                 return (EIO);
1957         }
1958         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1959         dir_obj = ds->ds_dir_obj;
1960
1961         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
1962                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1963                 return (EIO);
1964         }
1965         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1966
1967         child_dir_zapobj = dd->dd_child_dir_zapobj;
1968         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
1969                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1970                 return (EIO);
1971         }
1972
1973         return (zap_list(spa, &child_dir_zap) != 0);
1974 }
1975
1976 int
1977 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t))
1978 {
1979         uint64_t dir_obj, child_dir_zapobj, zap_type;
1980         dnode_phys_t child_dir_zap, dir, dataset;
1981         dsl_dataset_phys_t *ds;
1982         dsl_dir_phys_t *dd;
1983         int err;
1984
1985         err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
1986         if (err != 0) {
1987                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1988                 return (err);
1989         }
1990         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1991         dir_obj = ds->ds_dir_obj;
1992
1993         err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
1994         if (err != 0) {
1995                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1996                 return (err);
1997         }
1998         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1999
2000         child_dir_zapobj = dd->dd_child_dir_zapobj;
2001         err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
2002         if (err != 0) {
2003                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2004                 return (err);
2005         }
2006
2007         err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
2008         if (err != 0)
2009                 return (err);
2010
2011         zap_type = *(uint64_t *) zap_scratch;
2012         if (zap_type == ZBT_MICRO)
2013                 return mzap_list(&child_dir_zap, callback);
2014         else
2015                 return fzap_list(spa, &child_dir_zap, callback);
2016 }
2017 #endif
2018
2019 /*
2020  * Find the object set given the object number of its dataset object
2021  * and return its details in *objset
2022  */
2023 static int
2024 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2025 {
2026         dnode_phys_t dataset;
2027         dsl_dataset_phys_t *ds;
2028
2029         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2030                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2031                 return (EIO);
2032         }
2033
2034         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2035         if (zio_read(spa, &ds->ds_bp, objset)) {
2036                 printf("ZFS: can't read object set for dataset %ju\n",
2037                     (uintmax_t)objnum);
2038                 return (EIO);
2039         }
2040
2041         return (0);
2042 }
2043
2044 /*
2045  * Find the object set pointed to by the BOOTFS property or the root
2046  * dataset if there is none and return its details in *objset
2047  */
2048 static int
2049 zfs_get_root(const spa_t *spa, uint64_t *objid)
2050 {
2051         dnode_phys_t dir, propdir;
2052         uint64_t props, bootfs, root;
2053
2054         *objid = 0;
2055
2056         /*
2057          * Start with the MOS directory object.
2058          */
2059         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2060                 printf("ZFS: can't read MOS object directory\n");
2061                 return (EIO);
2062         }
2063
2064         /*
2065          * Lookup the pool_props and see if we can find a bootfs.
2066          */
2067         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0
2068              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
2069              && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0
2070              && bootfs != 0)
2071         {
2072                 *objid = bootfs;
2073                 return (0);
2074         }
2075         /*
2076          * Lookup the root dataset directory
2077          */
2078         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root)
2079             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2080                 printf("ZFS: can't find root dsl_dir\n");
2081                 return (EIO);
2082         }
2083
2084         /*
2085          * Use the information from the dataset directory's bonus buffer
2086          * to find the dataset object and from that the object set itself.
2087          */
2088         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
2089         *objid = dd->dd_head_dataset_obj;
2090         return (0);
2091 }
2092
2093 static int
2094 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
2095 {
2096
2097         mount->spa = spa;
2098
2099         /*
2100          * Find the root object set if not explicitly provided
2101          */
2102         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
2103                 printf("ZFS: can't find root filesystem\n");
2104                 return (EIO);
2105         }
2106
2107         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2108                 printf("ZFS: can't open root filesystem\n");
2109                 return (EIO);
2110         }
2111
2112         mount->rootobj = rootobj;
2113
2114         return (0);
2115 }
2116
2117 /*
2118  * callback function for feature name checks.
2119  */
2120 static int
2121 check_feature(const char *name, uint64_t value)
2122 {
2123         int i;
2124
2125         if (value == 0)
2126                 return (0);
2127         if (name[0] == '\0')
2128                 return (0);
2129
2130         for (i = 0; features_for_read[i] != NULL; i++) {
2131                 if (strcmp(name, features_for_read[i]) == 0)
2132                         return (0);
2133         }
2134         printf("ZFS: unsupported feature: %s\n", name);
2135         return (EIO);
2136 }
2137
2138 /*
2139  * Checks whether the MOS features that are active are supported.
2140  */
2141 static int
2142 check_mos_features(const spa_t *spa)
2143 {
2144         dnode_phys_t dir;
2145         uint64_t objnum, zap_type;
2146         size_t size;
2147         int rc;
2148
2149         if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
2150             &dir)) != 0)
2151                 return (rc);
2152         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
2153             sizeof (objnum), 1, &objnum)) != 0) {
2154                 /*
2155                  * It is older pool without features. As we have already
2156                  * tested the label, just return without raising the error.
2157                  */
2158                 return (0);
2159         }
2160
2161         if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
2162                 return (rc);
2163
2164         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
2165                 return (EIO);
2166
2167         size = dir.dn_datablkszsec * 512;
2168         if (dnode_read(spa, &dir, 0, zap_scratch, size))
2169                 return (EIO);
2170
2171         zap_type = *(uint64_t *) zap_scratch;
2172         if (zap_type == ZBT_MICRO)
2173                 rc = mzap_list(&dir, check_feature);
2174         else
2175                 rc = fzap_list(spa, &dir, check_feature);
2176
2177         return (rc);
2178 }
2179
2180 static int
2181 zfs_spa_init(spa_t *spa)
2182 {
2183         dnode_phys_t dir;
2184         int rc;
2185
2186         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2187                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2188                 return (EIO);
2189         }
2190         if (spa->spa_mos.os_type != DMU_OST_META) {
2191                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2192                 return (EIO);
2193         }
2194
2195         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2196             &dir)) {
2197                 printf("ZFS: failed to read pool %s directory object\n",
2198                     spa->spa_name);
2199                 return (EIO);
2200         }
2201         /* this is allowed to fail, older pools do not have salt */
2202         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
2203             sizeof (spa->spa_cksum_salt.zcs_bytes),
2204             spa->spa_cksum_salt.zcs_bytes);
2205
2206         rc = check_mos_features(spa);
2207         if (rc != 0) {
2208                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
2209         }
2210
2211         return (rc);
2212 }
2213
2214 static int
2215 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2216 {
2217
2218         if (dn->dn_bonustype != DMU_OT_SA) {
2219                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2220
2221                 sb->st_mode = zp->zp_mode;
2222                 sb->st_uid = zp->zp_uid;
2223                 sb->st_gid = zp->zp_gid;
2224                 sb->st_size = zp->zp_size;
2225         } else {
2226                 sa_hdr_phys_t *sahdrp;
2227                 int hdrsize;
2228                 size_t size = 0;
2229                 void *buf = NULL;
2230
2231                 if (dn->dn_bonuslen != 0)
2232                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2233                 else {
2234                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2235                                 blkptr_t *bp = &dn->dn_spill;
2236                                 int error;
2237
2238                                 size = BP_GET_LSIZE(bp);
2239                                 buf = zfs_alloc(size);
2240                                 error = zio_read(spa, bp, buf);
2241                                 if (error != 0) {
2242                                         zfs_free(buf, size);
2243                                         return (error);
2244                                 }
2245                                 sahdrp = buf;
2246                         } else {
2247                                 return (EIO);
2248                         }
2249                 }
2250                 hdrsize = SA_HDR_SIZE(sahdrp);
2251                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2252                     SA_MODE_OFFSET);
2253                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2254                     SA_UID_OFFSET);
2255                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2256                     SA_GID_OFFSET);
2257                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2258                     SA_SIZE_OFFSET);
2259                 if (buf != NULL)
2260                         zfs_free(buf, size);
2261         }
2262
2263         return (0);
2264 }
2265
2266 static int
2267 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
2268 {
2269         int rc = 0;
2270
2271         if (dn->dn_bonustype == DMU_OT_SA) {
2272                 sa_hdr_phys_t *sahdrp = NULL;
2273                 size_t size = 0;
2274                 void *buf = NULL;
2275                 int hdrsize;
2276                 char *p;
2277
2278                 if (dn->dn_bonuslen != 0)
2279                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2280                 else {
2281                         blkptr_t *bp;
2282
2283                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
2284                                 return (EIO);
2285                         bp = &dn->dn_spill;
2286
2287                         size = BP_GET_LSIZE(bp);
2288                         buf = zfs_alloc(size);
2289                         rc = zio_read(spa, bp, buf);
2290                         if (rc != 0) {
2291                                 zfs_free(buf, size);
2292                                 return (rc);
2293                         }
2294                         sahdrp = buf;
2295                 }
2296                 hdrsize = SA_HDR_SIZE(sahdrp);
2297                 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
2298                 memcpy(path, p, psize);
2299                 if (buf != NULL)
2300                         zfs_free(buf, size);
2301                 return (0);
2302         }
2303         /*
2304          * Second test is purely to silence bogus compiler
2305          * warning about accessing past the end of dn_bonus.
2306          */
2307         if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
2308             sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
2309                 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
2310         } else {
2311                 rc = dnode_read(spa, dn, 0, path, psize);
2312         }
2313         return (rc);
2314 }
2315
/*
 * Element of the singly-linked tail queue that zfs_lookup() uses as a
 * stack of object numbers while walking a path.
 */
struct obj_list {
        uint64_t                objnum;         /* object number recorded for this element */
        STAILQ_ENTRY(obj_list)  entry;          /* tail queue linkage */
};
2320
2321 /*
2322  * Lookup a file and return its dnode.
2323  */
2324 static int
2325 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2326 {
2327         int rc;
2328         uint64_t objnum;
2329         const spa_t *spa;
2330         dnode_phys_t dn;
2331         const char *p, *q;
2332         char element[256];
2333         char path[1024];
2334         int symlinks_followed = 0;
2335         struct stat sb;
2336         struct obj_list *entry, *tentry;
2337         STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
2338
2339         spa = mount->spa;
2340         if (mount->objset.os_type != DMU_OST_ZFS) {
2341                 printf("ZFS: unexpected object set type %ju\n",
2342                     (uintmax_t)mount->objset.os_type);
2343                 return (EIO);
2344         }
2345
2346         if ((entry = malloc(sizeof(struct obj_list))) == NULL)
2347                 return (ENOMEM);
2348
2349         /*
2350          * Get the root directory dnode.
2351          */
2352         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2353         if (rc) {
2354                 free(entry);
2355                 return (rc);
2356         }
2357
2358         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
2359         if (rc) {
2360                 free(entry);
2361                 return (rc);
2362         }
2363         entry->objnum = objnum;
2364         STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2365
2366         rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2367         if (rc != 0)
2368                 goto done;
2369
2370         p = upath;
2371         while (p && *p) {
2372                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2373                 if (rc != 0)
2374                         goto done;
2375
2376                 while (*p == '/')
2377                         p++;
2378                 if (*p == '\0')
2379                         break;
2380                 q = p;
2381                 while (*q != '\0' && *q != '/')
2382                         q++;
2383
2384                 /* skip dot */
2385                 if (p + 1 == q && p[0] == '.') {
2386                         p++;
2387                         continue;
2388                 }
2389                 /* double dot */
2390                 if (p + 2 == q && p[0] == '.' && p[1] == '.') {
2391                         p += 2;
2392                         if (STAILQ_FIRST(&on_cache) ==
2393                             STAILQ_LAST(&on_cache, obj_list, entry)) {
2394                                 rc = ENOENT;
2395                                 goto done;
2396                         }
2397                         entry = STAILQ_FIRST(&on_cache);
2398                         STAILQ_REMOVE_HEAD(&on_cache, entry);
2399                         free(entry);
2400                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
2401                         continue;
2402                 }
2403                 if (q - p + 1 > sizeof(element)) {
2404                         rc = ENAMETOOLONG;
2405                         goto done;
2406                 }
2407                 memcpy(element, p, q - p);
2408                 element[q - p] = 0;
2409                 p = q;
2410
2411                 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
2412                         goto done;
2413                 if (!S_ISDIR(sb.st_mode)) {
2414                         rc = ENOTDIR;
2415                         goto done;
2416                 }
2417
2418                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
2419                 if (rc)
2420                         goto done;
2421                 objnum = ZFS_DIRENT_OBJ(objnum);
2422
2423                 if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
2424                         rc = ENOMEM;
2425                         goto done;
2426                 }
2427                 entry->objnum = objnum;
2428                 STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2429                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2430                 if (rc)
2431                         goto done;
2432
2433                 /*
2434                  * Check for symlink.
2435                  */
2436                 rc = zfs_dnode_stat(spa, &dn, &sb);
2437                 if (rc)
2438                         goto done;
2439                 if (S_ISLNK(sb.st_mode)) {
2440                         if (symlinks_followed > 10) {
2441                                 rc = EMLINK;
2442                                 goto done;
2443                         }
2444                         symlinks_followed++;
2445
2446                         /*
2447                          * Read the link value and copy the tail of our
2448                          * current path onto the end.
2449                          */
2450                         if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
2451                                 rc = ENAMETOOLONG;
2452                                 goto done;
2453                         }
2454                         strcpy(&path[sb.st_size], p);
2455
2456                         rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
2457                         if (rc != 0)
2458                                 goto done;
2459
2460                         /*
2461                          * Restart with the new path, starting either at
2462                          * the root or at the parent depending whether or
2463                          * not the link is relative.
2464                          */
2465                         p = path;
2466                         if (*p == '/') {
2467                                 while (STAILQ_FIRST(&on_cache) !=
2468                                     STAILQ_LAST(&on_cache, obj_list, entry)) {
2469                                         entry = STAILQ_FIRST(&on_cache);
2470                                         STAILQ_REMOVE_HEAD(&on_cache, entry);
2471                                         free(entry);
2472                                 }
2473                         } else {
2474                                 entry = STAILQ_FIRST(&on_cache);
2475                                 STAILQ_REMOVE_HEAD(&on_cache, entry);
2476                                 free(entry);
2477                         }
2478                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
2479                 }
2480         }
2481
2482         *dnode = dn;
2483 done:
2484         STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
2485                 free(entry);
2486         return (rc);
2487 }