]> CyberLeo.Net >> Repos - FreeBSD/releng/10.3.git/blob - sys/boot/zfs/zfsimpl.c
- Copy stable/10@296371 to releng/10.3 in preparation for 10.3-RC1
[FreeBSD/releng/10.3.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/stat.h>
35 #include <sys/stdint.h>
36
37 #include "zfsimpl.h"
38 #include "zfssubr.c"
39
40
41 struct zfsmount {
42         const spa_t     *spa;
43         objset_phys_t   objset;
44         uint64_t        rootobj;
45 };
46
47 /*
48  * List of all vdevs, chained through v_alllink.
49  */
50 static vdev_list_t zfs_vdevs;
51
/*
 * NULL-terminated table of the "features for read" this reader
 * understands.  nvlist_check_features_for_read() rejects any feature
 * that is not listed here.
 */
static const char *features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	NULL
};
63
64 /*
65  * List of all pools, chained through spa_link.
66  */
67 static spa_list_t zfs_pools;
68
69 static uint64_t zfs_crc64_table[256];
70 static const dnode_phys_t *dnode_cache_obj = 0;
71 static uint64_t dnode_cache_bn;
72 static char *dnode_cache_buf;
73 static char *zap_scratch;
74 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
75
76 #define TEMP_SIZE       (1024 * 1024)
77
78 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
79 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
80 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
81
82 static void
83 zfs_init(void)
84 {
85         STAILQ_INIT(&zfs_vdevs);
86         STAILQ_INIT(&zfs_pools);
87
88         zfs_temp_buf = malloc(TEMP_SIZE);
89         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
90         zfs_temp_ptr = zfs_temp_buf;
91         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
92         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
93
94         zfs_init_crc();
95 }
96
97 static void *
98 zfs_alloc(size_t size)
99 {
100         char *ptr;
101
102         if (zfs_temp_ptr + size > zfs_temp_end) {
103                 printf("ZFS: out of temporary buffer space\n");
104                 for (;;) ;
105         }
106         ptr = zfs_temp_ptr;
107         zfs_temp_ptr += size;
108
109         return (ptr);
110 }
111
112 static void
113 zfs_free(void *ptr, size_t size)
114 {
115
116         zfs_temp_ptr -= size;
117         if (zfs_temp_ptr != ptr) {
118                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
119                 for (;;) ;
120         }
121 }
122
/*
 * Decode one big-endian 32-bit signed integer from *xdr into *ip and
 * advance *xdr past the four bytes consumed.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *src = *xdr;
	uint32_t raw;

	/*
	 * Assemble the value in an unsigned type.  Shifting the promoted
	 * (signed int) byte left by 24 is undefined when the top bit of
	 * the byte is set.
	 */
	raw = ((uint32_t)src[0] << 24)
		| ((uint32_t)src[1] << 16)
		| ((uint32_t)src[2] << 8)
		| ((uint32_t)src[3] << 0);
	*ip = (int32_t)raw;
	*xdr = src + 4;
	return (0);
}
133
/*
 * Decode one big-endian 32-bit unsigned integer from *xdr into *ip and
 * advance *xdr past the four bytes consumed.  Always returns 0.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	const unsigned char *src = *xdr;
	uint32_t raw;

	/*
	 * Widen each byte to uint32_t before shifting.  The promoted
	 * (signed int) byte must not be shifted into the sign bit.
	 */
	raw = ((uint32_t)src[0] << 24)
		| ((uint32_t)src[1] << 16)
		| ((uint32_t)src[2] << 8)
		| ((uint32_t)src[3] << 0);
	*ip = raw;
	*xdr = src + 4;
	return (0);
}
144
145 static int
146 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
147 {
148         u_int hi, lo;
149
150         xdr_u_int(xdr, &hi);
151         xdr_u_int(xdr, &lo);
152         *lp = (((uint64_t) hi) << 32) | lo;
153         return (0);
154 }
155
156 static int
157 nvlist_find(const unsigned char *nvlist, const char *name, int type,
158             int* elementsp, void *valuep)
159 {
160         const unsigned char *p, *pair;
161         int junk;
162         int encoded_size, decoded_size;
163
164         p = nvlist;
165         xdr_int(&p, &junk);
166         xdr_int(&p, &junk);
167
168         pair = p;
169         xdr_int(&p, &encoded_size);
170         xdr_int(&p, &decoded_size);
171         while (encoded_size && decoded_size) {
172                 int namelen, pairtype, elements;
173                 const char *pairname;
174
175                 xdr_int(&p, &namelen);
176                 pairname = (const char*) p;
177                 p += roundup(namelen, 4);
178                 xdr_int(&p, &pairtype);
179
180                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
181                         xdr_int(&p, &elements);
182                         if (elementsp)
183                                 *elementsp = elements;
184                         if (type == DATA_TYPE_UINT64) {
185                                 xdr_uint64_t(&p, (uint64_t *) valuep);
186                                 return (0);
187                         } else if (type == DATA_TYPE_STRING) {
188                                 int len;
189                                 xdr_int(&p, &len);
190                                 (*(const char**) valuep) = (const char*) p;
191                                 return (0);
192                         } else if (type == DATA_TYPE_NVLIST
193                                    || type == DATA_TYPE_NVLIST_ARRAY) {
194                                 (*(const unsigned char**) valuep) =
195                                          (const unsigned char*) p;
196                                 return (0);
197                         } else {
198                                 return (EIO);
199                         }
200                 } else {
201                         /*
202                          * Not the pair we are looking for, skip to the next one.
203                          */
204                         p = pair + encoded_size;
205                 }
206
207                 pair = p;
208                 xdr_int(&p, &encoded_size);
209                 xdr_int(&p, &decoded_size);
210         }
211
212         return (EIO);
213 }
214
215 static int
216 nvlist_check_features_for_read(const unsigned char *nvlist)
217 {
218         const unsigned char *p, *pair;
219         int junk;
220         int encoded_size, decoded_size;
221         int rc;
222
223         rc = 0;
224
225         p = nvlist;
226         xdr_int(&p, &junk);
227         xdr_int(&p, &junk);
228
229         pair = p;
230         xdr_int(&p, &encoded_size);
231         xdr_int(&p, &decoded_size);
232         while (encoded_size && decoded_size) {
233                 int namelen, pairtype;
234                 const char *pairname;
235                 int i, found;
236
237                 found = 0;
238
239                 xdr_int(&p, &namelen);
240                 pairname = (const char*) p;
241                 p += roundup(namelen, 4);
242                 xdr_int(&p, &pairtype);
243
244                 for (i = 0; features_for_read[i] != NULL; i++) {
245                         if (!memcmp(pairname, features_for_read[i], namelen)) {
246                                 found = 1;
247                                 break;
248                         }
249                 }
250
251                 if (!found) {
252                         printf("ZFS: unsupported feature: %s\n", pairname);
253                         rc = EIO;
254                 }
255
256                 p = pair + encoded_size;
257
258                 pair = p;
259                 xdr_int(&p, &encoded_size);
260                 xdr_int(&p, &decoded_size);
261         }
262
263         return (rc);
264 }
265
/*
 * Step over one complete XDR-encoded nvlist and return a pointer to
 * the byte that follows its terminating (zero-sized) pair header.  In
 * an nvlist array that is where the next element starts.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int skipped;
	int encoded_size, decoded_size;

	/* Two header words precede the first pair. */
	cursor = nvlist;
	xdr_int(&cursor, &skipped);
	xdr_int(&cursor, &skipped);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &encoded_size);
		xdr_int(&cursor, &decoded_size);
		if (encoded_size == 0 || decoded_size == 0)
			break;
		/* encoded_size is measured from the start of the pair. */
		cursor = pair_start + encoded_size;
	}

	return (cursor);
}
293
294 #ifdef TEST
295
296 static const unsigned char *
297 nvlist_print(const unsigned char *nvlist, unsigned int indent)
298 {
299         static const char* typenames[] = {
300                 "DATA_TYPE_UNKNOWN",
301                 "DATA_TYPE_BOOLEAN",
302                 "DATA_TYPE_BYTE",
303                 "DATA_TYPE_INT16",
304                 "DATA_TYPE_UINT16",
305                 "DATA_TYPE_INT32",
306                 "DATA_TYPE_UINT32",
307                 "DATA_TYPE_INT64",
308                 "DATA_TYPE_UINT64",
309                 "DATA_TYPE_STRING",
310                 "DATA_TYPE_BYTE_ARRAY",
311                 "DATA_TYPE_INT16_ARRAY",
312                 "DATA_TYPE_UINT16_ARRAY",
313                 "DATA_TYPE_INT32_ARRAY",
314                 "DATA_TYPE_UINT32_ARRAY",
315                 "DATA_TYPE_INT64_ARRAY",
316                 "DATA_TYPE_UINT64_ARRAY",
317                 "DATA_TYPE_STRING_ARRAY",
318                 "DATA_TYPE_HRTIME",
319                 "DATA_TYPE_NVLIST",
320                 "DATA_TYPE_NVLIST_ARRAY",
321                 "DATA_TYPE_BOOLEAN_VALUE",
322                 "DATA_TYPE_INT8",
323                 "DATA_TYPE_UINT8",
324                 "DATA_TYPE_BOOLEAN_ARRAY",
325                 "DATA_TYPE_INT8_ARRAY",
326                 "DATA_TYPE_UINT8_ARRAY"
327         };
328
329         unsigned int i, j;
330         const unsigned char *p, *pair;
331         int junk;
332         int encoded_size, decoded_size;
333
334         p = nvlist;
335         xdr_int(&p, &junk);
336         xdr_int(&p, &junk);
337
338         pair = p;
339         xdr_int(&p, &encoded_size);
340         xdr_int(&p, &decoded_size);
341         while (encoded_size && decoded_size) {
342                 int namelen, pairtype, elements;
343                 const char *pairname;
344
345                 xdr_int(&p, &namelen);
346                 pairname = (const char*) p;
347                 p += roundup(namelen, 4);
348                 xdr_int(&p, &pairtype);
349
350                 for (i = 0; i < indent; i++)
351                         printf(" ");
352                 printf("%s %s", typenames[pairtype], pairname);
353
354                 xdr_int(&p, &elements);
355                 switch (pairtype) {
356                 case DATA_TYPE_UINT64: {
357                         uint64_t val;
358                         xdr_uint64_t(&p, &val);
359                         printf(" = 0x%jx\n", (uintmax_t)val);
360                         break;
361                 }
362
363                 case DATA_TYPE_STRING: {
364                         int len;
365                         xdr_int(&p, &len);
366                         printf(" = \"%s\"\n", p);
367                         break;
368                 }
369
370                 case DATA_TYPE_NVLIST:
371                         printf("\n");
372                         nvlist_print(p, indent + 1);
373                         break;
374
375                 case DATA_TYPE_NVLIST_ARRAY:
376                         for (j = 0; j < elements; j++) {
377                                 printf("[%d]\n", j);
378                                 p = nvlist_print(p, indent + 1);
379                                 if (j != elements - 1) {
380                                         for (i = 0; i < indent; i++)
381                                                 printf(" ");
382                                         printf("%s %s", typenames[pairtype], pairname);
383                                 }
384                         }
385                         break;
386
387                 default:
388                         printf("\n");
389                 }
390
391                 p = pair + encoded_size;
392
393                 pair = p;
394                 xdr_int(&p, &encoded_size);
395                 xdr_int(&p, &decoded_size);
396         }
397
398         return p;
399 }
400
401 #endif
402
403 static int
404 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
405     off_t offset, size_t size)
406 {
407         size_t psize;
408         int rc;
409
410         if (!vdev->v_phys_read)
411                 return (EIO);
412
413         if (bp) {
414                 psize = BP_GET_PSIZE(bp);
415         } else {
416                 psize = size;
417         }
418
419         /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
420         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
421         if (rc)
422                 return (rc);
423         if (bp && zio_checksum_verify(bp, buf))
424                 return (EIO);
425
426         return (0);
427 }
428
429 static int
430 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
431     off_t offset, size_t bytes)
432 {
433
434         return (vdev_read_phys(vdev, bp, buf,
435                 offset + VDEV_LABEL_START_SIZE, bytes));
436 }
437
438
439 static int
440 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
441     off_t offset, size_t bytes)
442 {
443         vdev_t *kid;
444         int rc;
445
446         rc = EIO;
447         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
448                 if (kid->v_state != VDEV_STATE_HEALTHY)
449                         continue;
450                 rc = kid->v_read(kid, bp, buf, offset, bytes);
451                 if (!rc)
452                         return (0);
453         }
454
455         return (rc);
456 }
457
458 static int
459 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
460     off_t offset, size_t bytes)
461 {
462         vdev_t *kid;
463
464         /*
465          * Here we should have two kids:
466          * First one which is the one we are replacing and we can trust
467          * only this one to have valid data, but it might not be present.
468          * Second one is that one we are replacing with. It is most likely
469          * healthy, but we can't trust it has needed data, so we won't use it.
470          */
471         kid = STAILQ_FIRST(&vdev->v_children);
472         if (kid == NULL)
473                 return (EIO);
474         if (kid->v_state != VDEV_STATE_HEALTHY)
475                 return (EIO);
476         return (kid->v_read(kid, bp, buf, offset, bytes));
477 }
478
479 static vdev_t *
480 vdev_find(uint64_t guid)
481 {
482         vdev_t *vdev;
483
484         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
485                 if (vdev->v_guid == guid)
486                         return (vdev);
487
488         return (0);
489 }
490
491 static vdev_t *
492 vdev_create(uint64_t guid, vdev_read_t *read)
493 {
494         vdev_t *vdev;
495
496         vdev = malloc(sizeof(vdev_t));
497         memset(vdev, 0, sizeof(vdev_t));
498         STAILQ_INIT(&vdev->v_children);
499         vdev->v_guid = guid;
500         vdev->v_state = VDEV_STATE_OFFLINE;
501         vdev->v_read = read;
502         vdev->v_phys_read = 0;
503         vdev->v_read_priv = 0;
504         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
505
506         return (vdev);
507 }
508
509 static int
510 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
511     vdev_t **vdevp, int is_newer)
512 {
513         int rc;
514         uint64_t guid, id, ashift, nparity;
515         const char *type;
516         const char *path;
517         vdev_t *vdev, *kid;
518         const unsigned char *kids;
519         int nkids, i, is_new;
520         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
521
522         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
523                         DATA_TYPE_UINT64, 0, &guid)
524             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
525                            DATA_TYPE_UINT64, 0, &id)
526             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
527                            DATA_TYPE_STRING, 0, &type)) {
528                 printf("ZFS: can't find vdev details\n");
529                 return (ENOENT);
530         }
531
532         if (strcmp(type, VDEV_TYPE_MIRROR)
533             && strcmp(type, VDEV_TYPE_DISK)
534 #ifdef ZFS_TEST
535             && strcmp(type, VDEV_TYPE_FILE)
536 #endif
537             && strcmp(type, VDEV_TYPE_RAIDZ)
538             && strcmp(type, VDEV_TYPE_REPLACING)) {
539                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
540                 return (EIO);
541         }
542
543         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
544
545         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
546                         &is_offline);
547         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
548                         &is_removed);
549         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
550                         &is_faulted);
551         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
552                         &is_degraded);
553         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
554                         &isnt_present);
555
556         vdev = vdev_find(guid);
557         if (!vdev) {
558                 is_new = 1;
559
560                 if (!strcmp(type, VDEV_TYPE_MIRROR))
561                         vdev = vdev_create(guid, vdev_mirror_read);
562                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
563                         vdev = vdev_create(guid, vdev_raidz_read);
564                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
565                         vdev = vdev_create(guid, vdev_replacing_read);
566                 else
567                         vdev = vdev_create(guid, vdev_disk_read);
568
569                 vdev->v_id = id;
570                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
571                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
572                         DATA_TYPE_UINT64, 0, &ashift) == 0)
573                         vdev->v_ashift = ashift;
574                 else
575                         vdev->v_ashift = 0;
576                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
577                         DATA_TYPE_UINT64, 0, &nparity) == 0)
578                         vdev->v_nparity = nparity;
579                 else
580                         vdev->v_nparity = 0;
581                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
582                                 DATA_TYPE_STRING, 0, &path) == 0) {
583                         if (strncmp(path, "/dev/", 5) == 0)
584                                 path += 5;
585                         vdev->v_name = strdup(path);
586                 } else {
587                         if (!strcmp(type, "raidz")) {
588                                 if (vdev->v_nparity == 1)
589                                         vdev->v_name = "raidz1";
590                                 else if (vdev->v_nparity == 2)
591                                         vdev->v_name = "raidz2";
592                                 else if (vdev->v_nparity == 3)
593                                         vdev->v_name = "raidz3";
594                                 else {
595                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
596                                         return (EIO);
597                                 }
598                         } else {
599                                 vdev->v_name = strdup(type);
600                         }
601                 }
602         } else {
603                 is_new = 0;
604         }
605
606         if (is_new || is_newer) {
607                 /*
608                  * This is either new vdev or we've already seen this vdev,
609                  * but from an older vdev label, so let's refresh its state
610                  * from the newer label.
611                  */
612                 if (is_offline)
613                         vdev->v_state = VDEV_STATE_OFFLINE;
614                 else if (is_removed)
615                         vdev->v_state = VDEV_STATE_REMOVED;
616                 else if (is_faulted)
617                         vdev->v_state = VDEV_STATE_FAULTED;
618                 else if (is_degraded)
619                         vdev->v_state = VDEV_STATE_DEGRADED;
620                 else if (isnt_present)
621                         vdev->v_state = VDEV_STATE_CANT_OPEN;
622         }
623
624         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
625                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
626         /*
627          * Its ok if we don't have any kids.
628          */
629         if (rc == 0) {
630                 vdev->v_nchildren = nkids;
631                 for (i = 0; i < nkids; i++) {
632                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
633                         if (rc)
634                                 return (rc);
635                         if (is_new)
636                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
637                                                    v_childlink);
638                         kids = nvlist_next(kids);
639                 }
640         } else {
641                 vdev->v_nchildren = 0;
642         }
643
644         if (vdevp)
645                 *vdevp = vdev;
646         return (0);
647 }
648
649 static void
650 vdev_set_state(vdev_t *vdev)
651 {
652         vdev_t *kid;
653         int good_kids;
654         int bad_kids;
655
656         /*
657          * A mirror or raidz is healthy if all its kids are healthy. A
658          * mirror is degraded if any of its kids is healthy; a raidz
659          * is degraded if at most nparity kids are offline.
660          */
661         if (STAILQ_FIRST(&vdev->v_children)) {
662                 good_kids = 0;
663                 bad_kids = 0;
664                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
665                         if (kid->v_state == VDEV_STATE_HEALTHY)
666                                 good_kids++;
667                         else
668                                 bad_kids++;
669                 }
670                 if (bad_kids == 0) {
671                         vdev->v_state = VDEV_STATE_HEALTHY;
672                 } else {
673                         if (vdev->v_read == vdev_mirror_read) {
674                                 if (good_kids) {
675                                         vdev->v_state = VDEV_STATE_DEGRADED;
676                                 } else {
677                                         vdev->v_state = VDEV_STATE_OFFLINE;
678                                 }
679                         } else if (vdev->v_read == vdev_raidz_read) {
680                                 if (bad_kids > vdev->v_nparity) {
681                                         vdev->v_state = VDEV_STATE_OFFLINE;
682                                 } else {
683                                         vdev->v_state = VDEV_STATE_DEGRADED;
684                                 }
685                         }
686                 }
687         }
688 }
689
690 static spa_t *
691 spa_find_by_guid(uint64_t guid)
692 {
693         spa_t *spa;
694
695         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
696                 if (spa->spa_guid == guid)
697                         return (spa);
698
699         return (0);
700 }
701
702 static spa_t *
703 spa_find_by_name(const char *name)
704 {
705         spa_t *spa;
706
707         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
708                 if (!strcmp(spa->spa_name, name))
709                         return (spa);
710
711         return (0);
712 }
713
714 #ifdef BOOT2
715 static spa_t *
716 spa_get_primary(void)
717 {
718
719         return (STAILQ_FIRST(&zfs_pools));
720 }
721
722 static vdev_t *
723 spa_get_primary_vdev(const spa_t *spa)
724 {
725         vdev_t *vdev;
726         vdev_t *kid;
727
728         if (spa == NULL)
729                 spa = spa_get_primary();
730         if (spa == NULL)
731                 return (NULL);
732         vdev = STAILQ_FIRST(&spa->spa_vdevs);
733         if (vdev == NULL)
734                 return (NULL);
735         for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
736              kid = STAILQ_FIRST(&vdev->v_children))
737                 vdev = kid;
738         return (vdev);
739 }
740 #endif
741
742 static spa_t *
743 spa_create(uint64_t guid)
744 {
745         spa_t *spa;
746
747         spa = malloc(sizeof(spa_t));
748         memset(spa, 0, sizeof(spa_t));
749         STAILQ_INIT(&spa->spa_vdevs);
750         spa->spa_guid = guid;
751         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
752
753         return (spa);
754 }
755
756 static const char *
757 state_name(vdev_state_t state)
758 {
759         static const char* names[] = {
760                 "UNKNOWN",
761                 "CLOSED",
762                 "OFFLINE",
763                 "REMOVED",
764                 "CANT_OPEN",
765                 "FAULTED",
766                 "DEGRADED",
767                 "ONLINE"
768         };
769         return names[state];
770 }
771
772 #ifdef BOOT2
773
774 #define pager_printf printf
775
776 #else
777
/*
 * printf-style front end to pager_output(): format into a local line
 * buffer and hand the result to the pager.
 *
 * Output longer than the buffer is truncated.  Callers pass pool,
 * dataset and vdev names that can be far longer than 80 characters, so
 * the formatting must be bounded; an unbounded vsprintf() here would
 * overrun the stack buffer.
 */
static void
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	pager_output(line);
}
789
790 #endif
791
792 #define STATUS_FORMAT   "        %s %s\n"
793
794 static void
795 print_state(int indent, const char *name, vdev_state_t state)
796 {
797         int i;
798         char buf[512];
799
800         buf[0] = 0;
801         for (i = 0; i < indent; i++)
802                 strcat(buf, "  ");
803         strcat(buf, name);
804         pager_printf(STATUS_FORMAT, buf, state_name(state));
805         
806 }
807
808 static void
809 vdev_status(vdev_t *vdev, int indent)
810 {
811         vdev_t *kid;
812         print_state(indent, vdev->v_name, vdev->v_state);
813
814         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
815                 vdev_status(kid, indent + 1);
816         }
817 }
818
819 static void
820 spa_status(spa_t *spa)
821 {
822         static char bootfs[ZFS_MAXNAMELEN];
823         uint64_t rootid;
824         vdev_t *vdev;
825         int good_kids, bad_kids, degraded_kids;
826         vdev_state_t state;
827
828         pager_printf("  pool: %s\n", spa->spa_name);
829         if (zfs_get_root(spa, &rootid) == 0 &&
830             zfs_rlookup(spa, rootid, bootfs) == 0) {
831                 if (bootfs[0] == '\0')
832                         pager_printf("bootfs: %s\n", spa->spa_name);
833                 else
834                         pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs);
835         }
836         pager_printf("config:\n\n");
837         pager_printf(STATUS_FORMAT, "NAME", "STATE");
838
839         good_kids = 0;
840         degraded_kids = 0;
841         bad_kids = 0;
842         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
843                 if (vdev->v_state == VDEV_STATE_HEALTHY)
844                         good_kids++;
845                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
846                         degraded_kids++;
847                 else
848                         bad_kids++;
849         }
850
851         state = VDEV_STATE_CLOSED;
852         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
853                 state = VDEV_STATE_HEALTHY;
854         else if ((good_kids + degraded_kids) > 0)
855                 state = VDEV_STATE_DEGRADED;
856
857         print_state(0, spa->spa_name, state);
858         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
859                 vdev_status(vdev, 1);
860         }
861 }
862
863 static void
864 spa_all_status(void)
865 {
866         spa_t *spa;
867         int first = 1;
868
869         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
870                 if (!first)
871                         pager_printf("\n");
872                 first = 0;
873                 spa_status(spa);
874         }
875 }
876
877 static int
878 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
879 {
880         vdev_t vtmp;
881         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
882         spa_t *spa;
883         vdev_t *vdev, *top_vdev, *pool_vdev;
884         off_t off;
885         blkptr_t bp;
886         const unsigned char *nvlist;
887         uint64_t val;
888         uint64_t guid;
889         uint64_t pool_txg, pool_guid;
890         uint64_t is_log;
891         const char *pool_name;
892         const unsigned char *vdevs;
893         const unsigned char *features;
894         int i, rc, is_newer;
895         char *upbuf;
896         const struct uberblock *up;
897
898         /*
899          * Load the vdev label and figure out which
900          * uberblock is most current.
901          */
902         memset(&vtmp, 0, sizeof(vtmp));
903         vtmp.v_phys_read = read;
904         vtmp.v_read_priv = read_priv;
905         off = offsetof(vdev_label_t, vl_vdev_phys);
906         BP_ZERO(&bp);
907         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
908         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
909         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
910         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
911         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
912         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
913         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
914                 return (EIO);
915
916         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
917                 return (EIO);
918         }
919
920         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
921
922         if (nvlist_find(nvlist,
923                         ZPOOL_CONFIG_VERSION,
924                         DATA_TYPE_UINT64, 0, &val)) {
925                 return (EIO);
926         }
927
928         if (!SPA_VERSION_IS_SUPPORTED(val)) {
929                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
930                     (unsigned) val, (unsigned) SPA_VERSION);
931                 return (EIO);
932         }
933
934         /* Check ZFS features for read */
935         if (nvlist_find(nvlist,
936                         ZPOOL_CONFIG_FEATURES_FOR_READ,
937                         DATA_TYPE_NVLIST, 0, &features) == 0
938             && nvlist_check_features_for_read(features) != 0)
939                 return (EIO);
940
941         if (nvlist_find(nvlist,
942                         ZPOOL_CONFIG_POOL_STATE,
943                         DATA_TYPE_UINT64, 0, &val)) {
944                 return (EIO);
945         }
946
947         if (val == POOL_STATE_DESTROYED) {
948                 /* We don't boot only from destroyed pools. */
949                 return (EIO);
950         }
951
952         if (nvlist_find(nvlist,
953                         ZPOOL_CONFIG_POOL_TXG,
954                         DATA_TYPE_UINT64, 0, &pool_txg)
955             || nvlist_find(nvlist,
956                            ZPOOL_CONFIG_POOL_GUID,
957                            DATA_TYPE_UINT64, 0, &pool_guid)
958             || nvlist_find(nvlist,
959                            ZPOOL_CONFIG_POOL_NAME,
960                            DATA_TYPE_STRING, 0, &pool_name)) {
961                 /*
962                  * Cache and spare devices end up here - just ignore
963                  * them.
964                  */
965                 /*printf("ZFS: can't find pool details\n");*/
966                 return (EIO);
967         }
968
969         is_log = 0;
970         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
971             &is_log);
972         if (is_log)
973                 return (EIO);
974
975         /*
976          * Create the pool if this is the first time we've seen it.
977          */
978         spa = spa_find_by_guid(pool_guid);
979         if (!spa) {
980                 spa = spa_create(pool_guid);
981                 spa->spa_name = strdup(pool_name);
982         }
983         if (pool_txg > spa->spa_txg) {
984                 spa->spa_txg = pool_txg;
985                 is_newer = 1;
986         } else
987                 is_newer = 0;
988
989         /*
990          * Get the vdev tree and create our in-core copy of it.
991          * If we already have a vdev with this guid, this must
992          * be some kind of alias (overlapping slices, dangerously dedicated
993          * disks etc).
994          */
995         if (nvlist_find(nvlist,
996                         ZPOOL_CONFIG_GUID,
997                         DATA_TYPE_UINT64, 0, &guid)) {
998                 return (EIO);
999         }
1000         vdev = vdev_find(guid);
1001         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
1002                 return (EIO);
1003
1004         if (nvlist_find(nvlist,
1005                         ZPOOL_CONFIG_VDEV_TREE,
1006                         DATA_TYPE_NVLIST, 0, &vdevs)) {
1007                 return (EIO);
1008         }
1009
1010         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1011         if (rc)
1012                 return (rc);
1013
1014         /*
1015          * Add the toplevel vdev to the pool if its not already there.
1016          */
1017         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1018                 if (top_vdev == pool_vdev)
1019                         break;
1020         if (!pool_vdev && top_vdev)
1021                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1022
1023         /*
1024          * We should already have created an incomplete vdev for this
1025          * vdev. Find it and initialise it with our read proc.
1026          */
1027         vdev = vdev_find(guid);
1028         if (vdev) {
1029                 vdev->v_phys_read = read;
1030                 vdev->v_read_priv = read_priv;
1031                 vdev->v_state = VDEV_STATE_HEALTHY;
1032         } else {
1033                 printf("ZFS: inconsistent nvlist contents\n");
1034                 return (EIO);
1035         }
1036
1037         /*
1038          * Re-evaluate top-level vdev state.
1039          */
1040         vdev_set_state(top_vdev);
1041
1042         /*
1043          * Ok, we are happy with the pool so far. Lets find
1044          * the best uberblock and then we can actually access
1045          * the contents of the pool.
1046          */
1047         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1048         up = (const struct uberblock *)upbuf;
1049         for (i = 0;
1050              i < VDEV_UBERBLOCK_COUNT(vdev);
1051              i++) {
1052                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
1053                 BP_ZERO(&bp);
1054                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
1055                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1056                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1057                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1058                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1059                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1060
1061                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1062                         continue;
1063
1064                 if (up->ub_magic != UBERBLOCK_MAGIC)
1065                         continue;
1066                 if (up->ub_txg < spa->spa_txg)
1067                         continue;
1068                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
1069                         spa->spa_uberblock = *up;
1070                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
1071                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
1072                                 spa->spa_uberblock = *up;
1073                 }
1074         }
1075         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1076
1077         if (spap)
1078                 *spap = spa;
1079         return (0);
1080 }
1081
/*
 * Return the base-2 logarithm of n when n is an exact power of two.
 * Returns -1 for every other input, including zero and negative values.
 */
static int
ilog2(int n)
{
        int v;

        /* Reject non-positive values and anything with more than one bit set. */
        if (n <= 0 || (n & (n - 1)) != 0)
                return (-1);

        /*
         * Count how often n can be halved.  This avoids forming 1 << 31,
         * which overflows a signed int.
         */
        for (v = 0; n > 1; n >>= 1)
                v++;
        return (v);
}
1092
1093 static int
1094 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1095 {
1096         blkptr_t gbh_bp;
1097         zio_gbh_phys_t zio_gb;
1098         char *pbuf;
1099         int i;
1100
1101         /* Artificial BP for gang block header. */
1102         gbh_bp = *bp;
1103         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1104         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1105         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1106         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1107         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1108                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1109
1110         /* Read gang header block using the artificial BP. */
1111         if (zio_read(spa, &gbh_bp, &zio_gb))
1112                 return (EIO);
1113
1114         pbuf = buf;
1115         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1116                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1117
1118                 if (BP_IS_HOLE(gbp))
1119                         continue;
1120                 if (zio_read(spa, gbp, pbuf))
1121                         return (EIO);
1122                 pbuf += BP_GET_PSIZE(gbp);
1123         }
1124
1125         if (zio_checksum_verify(bp, buf))
1126                 return (EIO);
1127         return (0);
1128 }
1129
1130 static int
1131 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1132 {
1133         int cpfunc = BP_GET_COMPRESS(bp);
1134         uint64_t align, size;
1135         void *pbuf;
1136         int i, error;
1137
1138         /*
1139          * Process data embedded in block pointer
1140          */
1141         if (BP_IS_EMBEDDED(bp)) {
1142                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1143
1144                 size = BPE_GET_PSIZE(bp);
1145                 ASSERT(size <= BPE_PAYLOAD_SIZE);
1146
1147                 if (cpfunc != ZIO_COMPRESS_OFF)
1148                         pbuf = zfs_alloc(size);
1149                 else
1150                         pbuf = buf;
1151
1152                 decode_embedded_bp_compressed(bp, pbuf);
1153                 error = 0;
1154
1155                 if (cpfunc != ZIO_COMPRESS_OFF) {
1156                         error = zio_decompress_data(cpfunc, pbuf,
1157                             size, buf, BP_GET_LSIZE(bp));
1158                         zfs_free(pbuf, size);
1159                 }
1160                 if (error != 0)
1161                         printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1162                             error);
1163                 return (error);
1164         }
1165
1166         error = EIO;
1167
1168         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1169                 const dva_t *dva = &bp->blk_dva[i];
1170                 vdev_t *vdev;
1171                 int vdevid;
1172                 off_t offset;
1173
1174                 if (!dva->dva_word[0] && !dva->dva_word[1])
1175                         continue;
1176
1177                 vdevid = DVA_GET_VDEV(dva);
1178                 offset = DVA_GET_OFFSET(dva);
1179                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1180                         if (vdev->v_id == vdevid)
1181                                 break;
1182                 }
1183                 if (!vdev || !vdev->v_read)
1184                         continue;
1185
1186                 size = BP_GET_PSIZE(bp);
1187                 if (vdev->v_read == vdev_raidz_read) {
1188                         align = 1ULL << vdev->v_top->v_ashift;
1189                         if (P2PHASE(size, align) != 0)
1190                                 size = P2ROUNDUP(size, align);
1191                 }
1192                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1193                         pbuf = zfs_alloc(size);
1194                 else
1195                         pbuf = buf;
1196
1197                 if (DVA_GET_GANG(dva))
1198                         error = zio_read_gang(spa, bp, pbuf);
1199                 else
1200                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1201                 if (error == 0) {
1202                         if (cpfunc != ZIO_COMPRESS_OFF)
1203                                 error = zio_decompress_data(cpfunc, pbuf,
1204                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1205                         else if (size != BP_GET_PSIZE(bp))
1206                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1207                 }
1208                 if (buf != pbuf)
1209                         zfs_free(pbuf, size);
1210                 if (error == 0)
1211                         break;
1212         }
1213         if (error != 0)
1214                 printf("ZFS: i/o error - all block copies unavailable\n");
1215         return (error);
1216 }
1217
1218 static int
1219 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1220 {
1221         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1222         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1223         int nlevels = dnode->dn_nlevels;
1224         int i, rc;
1225
1226         if (bsize > SPA_MAXBLOCKSIZE) {
1227                 printf("ZFS: I/O error - blocks larger than 128K are not supported\n");
1228                 return (EIO);
1229         }
1230
1231         /*
1232          * Note: bsize may not be a power of two here so we need to do an
1233          * actual divide rather than a bitshift.
1234          */
1235         while (buflen > 0) {
1236                 uint64_t bn = offset / bsize;
1237                 int boff = offset % bsize;
1238                 int ibn;
1239                 const blkptr_t *indbp;
1240                 blkptr_t bp;
1241
1242                 if (bn > dnode->dn_maxblkid)
1243                         return (EIO);
1244
1245                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1246                         goto cached;
1247
1248                 indbp = dnode->dn_blkptr;
1249                 for (i = 0; i < nlevels; i++) {
1250                         /*
1251                          * Copy the bp from the indirect array so that
1252                          * we can re-use the scratch buffer for multi-level
1253                          * objects.
1254                          */
1255                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1256                         ibn &= ((1 << ibshift) - 1);
1257                         bp = indbp[ibn];
1258                         if (BP_IS_HOLE(&bp)) {
1259                                 memset(dnode_cache_buf, 0, bsize);
1260                                 break;
1261                         }
1262                         rc = zio_read(spa, &bp, dnode_cache_buf);
1263                         if (rc)
1264                                 return (rc);
1265                         indbp = (const blkptr_t *) dnode_cache_buf;
1266                 }
1267                 dnode_cache_obj = dnode;
1268                 dnode_cache_bn = bn;
1269         cached:
1270
1271                 /*
1272                  * The buffer contains our data block. Copy what we
1273                  * need from it and loop.
1274                  */ 
1275                 i = bsize - boff;
1276                 if (i > buflen) i = buflen;
1277                 memcpy(buf, &dnode_cache_buf[boff], i);
1278                 buf = ((char*) buf) + i;
1279                 offset += i;
1280                 buflen -= i;
1281         }
1282
1283         return (0);
1284 }
1285
1286 /*
1287  * Lookup a value in a microzap directory. Assumes that the zap
1288  * scratch buffer contains the directory contents.
1289  */
1290 static int
1291 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1292 {
1293         const mzap_phys_t *mz;
1294         const mzap_ent_phys_t *mze;
1295         size_t size;
1296         int chunks, i;
1297
1298         /*
1299          * Microzap objects use exactly one block. Read the whole
1300          * thing.
1301          */
1302         size = dnode->dn_datablkszsec * 512;
1303
1304         mz = (const mzap_phys_t *) zap_scratch;
1305         chunks = size / MZAP_ENT_LEN - 1;
1306
1307         for (i = 0; i < chunks; i++) {
1308                 mze = &mz->mz_chunk[i];
1309                 if (!strcmp(mze->mze_name, name)) {
1310                         *value = mze->mze_value;
1311                         return (0);
1312                 }
1313         }
1314
1315         return (ENOENT);
1316 }
1317
1318 /*
1319  * Compare a name with a zap leaf entry. Return non-zero if the name
1320  * matches.
1321  */
1322 static int
1323 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1324 {
1325         size_t namelen;
1326         const zap_leaf_chunk_t *nc;
1327         const char *p;
1328
1329         namelen = zc->l_entry.le_name_numints;
1330                         
1331         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1332         p = name;
1333         while (namelen > 0) {
1334                 size_t len;
1335                 len = namelen;
1336                 if (len > ZAP_LEAF_ARRAY_BYTES)
1337                         len = ZAP_LEAF_ARRAY_BYTES;
1338                 if (memcmp(p, nc->l_array.la_array, len))
1339                         return (0);
1340                 p += len;
1341                 namelen -= len;
1342                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1343         }
1344
1345         return 1;
1346 }
1347
1348 /*
1349  * Extract a uint64_t value from a zap leaf entry.
1350  */
1351 static uint64_t
1352 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1353 {
1354         const zap_leaf_chunk_t *vc;
1355         int i;
1356         uint64_t value;
1357         const uint8_t *p;
1358
1359         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1360         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1361                 value = (value << 8) | p[i];
1362         }
1363
1364         return value;
1365 }
1366
1367 /*
1368  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1369  * buffer contains the directory header.
1370  */
1371 static int
1372 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1373 {
1374         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1375         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1376         fat_zap_t z;
1377         uint64_t *ptrtbl;
1378         uint64_t hash;
1379         int rc;
1380
1381         if (zh.zap_magic != ZAP_MAGIC)
1382                 return (EIO);
1383
1384         z.zap_block_shift = ilog2(bsize);
1385         z.zap_phys = (zap_phys_t *) zap_scratch;
1386
1387         /*
1388          * Figure out where the pointer table is and read it in if necessary.
1389          */
1390         if (zh.zap_ptrtbl.zt_blk) {
1391                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1392                                zap_scratch, bsize);
1393                 if (rc)
1394                         return (rc);
1395                 ptrtbl = (uint64_t *) zap_scratch;
1396         } else {
1397                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1398         }
1399
1400         hash = zap_hash(zh.zap_salt, name);
1401
1402         zap_leaf_t zl;
1403         zl.l_bs = z.zap_block_shift;
1404
1405         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1406         zap_leaf_chunk_t *zc;
1407
1408         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1409         if (rc)
1410                 return (rc);
1411
1412         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1413
1414         /*
1415          * Make sure this chunk matches our hash.
1416          */
1417         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1418             && zl.l_phys->l_hdr.lh_prefix
1419             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1420                 return (ENOENT);
1421
1422         /*
1423          * Hash within the chunk to find our entry.
1424          */
1425         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1426         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1427         h = zl.l_phys->l_hash[h];
1428         if (h == 0xffff)
1429                 return (ENOENT);
1430         zc = &ZAP_LEAF_CHUNK(&zl, h);
1431         while (zc->l_entry.le_hash != hash) {
1432                 if (zc->l_entry.le_next == 0xffff) {
1433                         zc = 0;
1434                         break;
1435                 }
1436                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1437         }
1438         if (fzap_name_equal(&zl, zc, name)) {
1439                 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > 8)
1440                         return (E2BIG);
1441                 *value = fzap_leaf_value(&zl, zc);
1442                 return (0);
1443         }
1444
1445         return (ENOENT);
1446 }
1447
1448 /*
1449  * Lookup a name in a zap object and return its value as a uint64_t.
1450  */
1451 static int
1452 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1453 {
1454         int rc;
1455         uint64_t zap_type;
1456         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1457
1458         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1459         if (rc)
1460                 return (rc);
1461
1462         zap_type = *(uint64_t *) zap_scratch;
1463         if (zap_type == ZBT_MICRO)
1464                 return mzap_lookup(dnode, name, value);
1465         else if (zap_type == ZBT_HEADER)
1466                 return fzap_lookup(spa, dnode, name, value);
1467         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1468         return (EIO);
1469 }
1470
1471 /*
1472  * List a microzap directory. Assumes that the zap scratch buffer contains
1473  * the directory contents.
1474  */
1475 static int
1476 mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *))
1477 {
1478         const mzap_phys_t *mz;
1479         const mzap_ent_phys_t *mze;
1480         size_t size;
1481         int chunks, i;
1482
1483         /*
1484          * Microzap objects use exactly one block. Read the whole
1485          * thing.
1486          */
1487         size = dnode->dn_datablkszsec * 512;
1488         mz = (const mzap_phys_t *) zap_scratch;
1489         chunks = size / MZAP_ENT_LEN - 1;
1490
1491         for (i = 0; i < chunks; i++) {
1492                 mze = &mz->mz_chunk[i];
1493                 if (mze->mze_name[0])
1494                         //printf("%-32s 0x%jx\n", mze->mze_name, (uintmax_t)mze->mze_value);
1495                         callback(mze->mze_name);
1496         }
1497
1498         return (0);
1499 }
1500
1501 /*
1502  * List a fatzap directory. Assumes that the zap scratch buffer contains
1503  * the directory header.
1504  */
1505 static int
1506 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *))
1507 {
1508         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1509         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1510         fat_zap_t z;
1511         int i, j;
1512
1513         if (zh.zap_magic != ZAP_MAGIC)
1514                 return (EIO);
1515
1516         z.zap_block_shift = ilog2(bsize);
1517         z.zap_phys = (zap_phys_t *) zap_scratch;
1518
1519         /*
1520          * This assumes that the leaf blocks start at block 1. The
1521          * documentation isn't exactly clear on this.
1522          */
1523         zap_leaf_t zl;
1524         zl.l_bs = z.zap_block_shift;
1525         for (i = 0; i < zh.zap_num_leafs; i++) {
1526                 off_t off = (i + 1) << zl.l_bs;
1527                 char name[256], *p;
1528                 uint64_t value;
1529
1530                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1531                         return (EIO);
1532
1533                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1534
1535                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1536                         zap_leaf_chunk_t *zc, *nc;
1537                         int namelen;
1538
1539                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1540                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1541                                 continue;
1542                         namelen = zc->l_entry.le_name_numints;
1543                         if (namelen > sizeof(name))
1544                                 namelen = sizeof(name);
1545
1546                         /*
1547                          * Paste the name back together.
1548                          */
1549                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1550                         p = name;
1551                         while (namelen > 0) {
1552                                 int len;
1553                                 len = namelen;
1554                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1555                                         len = ZAP_LEAF_ARRAY_BYTES;
1556                                 memcpy(p, nc->l_array.la_array, len);
1557                                 p += len;
1558                                 namelen -= len;
1559                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1560                         }
1561
1562                         /*
1563                          * Assume the first eight bytes of the value are
1564                          * a uint64_t.
1565                          */
1566                         value = fzap_leaf_value(&zl, zc);
1567
1568                         //printf("%s 0x%jx\n", name, (uintmax_t)value);
1569                         callback((const char *)name);
1570                 }
1571         }
1572
1573         return (0);
1574 }
1575
/*
 * Listing callback: print one name followed by a newline.
 * Always returns 0.
 */
static int
zfs_printf(const char *name)
{
        printf("%s\n", name);
        return (0);
}
1583
1584 /*
1585  * List a zap directory.
1586  */
1587 static int
1588 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1589 {
1590         uint64_t zap_type;
1591         size_t size = dnode->dn_datablkszsec * 512;
1592
1593         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1594                 return (EIO);
1595
1596         zap_type = *(uint64_t *) zap_scratch;
1597         if (zap_type == ZBT_MICRO)
1598                 return mzap_list(dnode, zfs_printf);
1599         else
1600                 return fzap_list(spa, dnode, zfs_printf);
1601 }
1602
1603 static int
1604 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1605 {
1606         off_t offset;
1607
1608         offset = objnum * sizeof(dnode_phys_t);
1609         return dnode_read(spa, &os->os_meta_dnode, offset,
1610                 dnode, sizeof(dnode_phys_t));
1611 }
1612
1613 static int
1614 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1615 {
1616         const mzap_phys_t *mz;
1617         const mzap_ent_phys_t *mze;
1618         size_t size;
1619         int chunks, i;
1620
1621         /*
1622          * Microzap objects use exactly one block. Read the whole
1623          * thing.
1624          */
1625         size = dnode->dn_datablkszsec * 512;
1626
1627         mz = (const mzap_phys_t *) zap_scratch;
1628         chunks = size / MZAP_ENT_LEN - 1;
1629
1630         for (i = 0; i < chunks; i++) {
1631                 mze = &mz->mz_chunk[i];
1632                 if (value == mze->mze_value) {
1633                         strcpy(name, mze->mze_name);
1634                         return (0);
1635                 }
1636         }
1637
1638         return (ENOENT);
1639 }
1640
1641 static void
1642 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1643 {
1644         size_t namelen;
1645         const zap_leaf_chunk_t *nc;
1646         char *p;
1647
1648         namelen = zc->l_entry.le_name_numints;
1649
1650         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1651         p = name;
1652         while (namelen > 0) {
1653                 size_t len;
1654                 len = namelen;
1655                 if (len > ZAP_LEAF_ARRAY_BYTES)
1656                         len = ZAP_LEAF_ARRAY_BYTES;
1657                 memcpy(p, nc->l_array.la_array, len);
1658                 p += len;
1659                 namelen -= len;
1660                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1661         }
1662
1663         *p = '\0';
1664 }
1665
1666 static int
1667 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1668 {
1669         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1670         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1671         fat_zap_t z;
1672         int i, j;
1673
1674         if (zh.zap_magic != ZAP_MAGIC)
1675                 return (EIO);
1676
1677         z.zap_block_shift = ilog2(bsize);
1678         z.zap_phys = (zap_phys_t *) zap_scratch;
1679
1680         /*
1681          * This assumes that the leaf blocks start at block 1. The
1682          * documentation isn't exactly clear on this.
1683          */
1684         zap_leaf_t zl;
1685         zl.l_bs = z.zap_block_shift;
1686         for (i = 0; i < zh.zap_num_leafs; i++) {
1687                 off_t off = (i + 1) << zl.l_bs;
1688
1689                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1690                         return (EIO);
1691
1692                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1693
1694                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1695                         zap_leaf_chunk_t *zc;
1696
1697                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1698                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1699                                 continue;
1700                         if (zc->l_entry.le_value_intlen != 8 ||
1701                             zc->l_entry.le_value_numints != 1)
1702                                 continue;
1703
1704                         if (fzap_leaf_value(&zl, zc) == value) {
1705                                 fzap_name_copy(&zl, zc, name);
1706                                 return (0);
1707                         }
1708                 }
1709         }
1710
1711         return (ENOENT);
1712 }
1713
1714 static int
1715 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1716 {
1717         int rc;
1718         uint64_t zap_type;
1719         size_t size = dnode->dn_datablkszsec * 512;
1720
1721         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1722         if (rc)
1723                 return (rc);
1724
1725         zap_type = *(uint64_t *) zap_scratch;
1726         if (zap_type == ZBT_MICRO)
1727                 return mzap_rlookup(spa, dnode, name, value);
1728         else
1729                 return fzap_rlookup(spa, dnode, name, value);
1730 }
1731
1732 static int
1733 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1734 {
1735         char name[256];
1736         char component[256];
1737         uint64_t dir_obj, parent_obj, child_dir_zapobj;
1738         dnode_phys_t child_dir_zap, dataset, dir, parent;
1739         dsl_dir_phys_t *dd;
1740         dsl_dataset_phys_t *ds;
1741         char *p;
1742         int len;
1743
1744         p = &name[sizeof(name) - 1];
1745         *p = '\0';
1746
1747         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1748                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1749                 return (EIO);
1750         }
1751         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1752         dir_obj = ds->ds_dir_obj;
1753
1754         for (;;) {
1755                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1756                         return (EIO);
1757                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1758
1759                 /* Actual loop condition. */
1760                 parent_obj  = dd->dd_parent_obj;
1761                 if (parent_obj == 0)
1762                         break;
1763
1764                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1765                         return (EIO);
1766                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1767                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1768                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1769                         return (EIO);
1770                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1771                         return (EIO);
1772
1773                 len = strlen(component);
1774                 p -= len;
1775                 memcpy(p, component, len);
1776                 --p;
1777                 *p = '/';
1778
1779                 /* Actual loop iteration. */
1780                 dir_obj = parent_obj;
1781         }
1782
1783         if (*p != '\0')
1784                 ++p;
1785         strcpy(result, p);
1786
1787         return (0);
1788 }
1789
1790 static int
1791 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1792 {
1793         char element[256];
1794         uint64_t dir_obj, child_dir_zapobj;
1795         dnode_phys_t child_dir_zap, dir;
1796         dsl_dir_phys_t *dd;
1797         const char *p, *q;
1798
1799         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1800                 return (EIO);
1801         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &dir_obj))
1802                 return (EIO);
1803
1804         p = name;
1805         for (;;) {
1806                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1807                         return (EIO);
1808                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1809
1810                 while (*p == '/')
1811                         p++;
1812                 /* Actual loop condition #1. */
1813                 if (*p == '\0')
1814                         break;
1815
1816                 q = strchr(p, '/');
1817                 if (q) {
1818                         memcpy(element, p, q - p);
1819                         element[q - p] = '\0';
1820                         p = q + 1;
1821                 } else {
1822                         strcpy(element, p);
1823                         p += strlen(p);
1824                 }
1825
1826                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1827                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1828                         return (EIO);
1829
1830                 /* Actual loop condition #2. */
1831                 if (zap_lookup(spa, &child_dir_zap, element, &dir_obj) != 0)
1832                         return (ENOENT);
1833         }
1834
1835         *objnum = dd->dd_head_dataset_obj;
1836         return (0);
1837 }
1838
1839 #ifndef BOOT2
1840 static int
1841 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1842 {
1843         uint64_t dir_obj, child_dir_zapobj;
1844         dnode_phys_t child_dir_zap, dir, dataset;
1845         dsl_dataset_phys_t *ds;
1846         dsl_dir_phys_t *dd;
1847
1848         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1849                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1850                 return (EIO);
1851         }
1852         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1853         dir_obj = ds->ds_dir_obj;
1854
1855         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
1856                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1857                 return (EIO);
1858         }
1859         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1860
1861         child_dir_zapobj = dd->dd_child_dir_zapobj;
1862         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
1863                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1864                 return (EIO);
1865         }
1866
1867         return (zap_list(spa, &child_dir_zap) != 0);
1868 }
1869
1870 int
1871 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *name))
1872 {
1873         uint64_t dir_obj, child_dir_zapobj, zap_type;
1874         dnode_phys_t child_dir_zap, dir, dataset;
1875         dsl_dataset_phys_t *ds;
1876         dsl_dir_phys_t *dd;
1877         int err;
1878
1879         err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
1880         if (err != 0) {
1881                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1882                 return (err);
1883         }
1884         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1885         dir_obj = ds->ds_dir_obj;
1886
1887         err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
1888         if (err != 0) {
1889                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1890                 return (err);
1891         }
1892         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1893
1894         child_dir_zapobj = dd->dd_child_dir_zapobj;
1895         err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
1896         if (err != 0) {
1897                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1898                 return (err);
1899         }
1900
1901         err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
1902         if (err != 0)
1903                 return (err);
1904
1905         zap_type = *(uint64_t *) zap_scratch;
1906         if (zap_type == ZBT_MICRO)
1907                 return mzap_list(&child_dir_zap, callback);
1908         else
1909                 return fzap_list(spa, &child_dir_zap, callback);
1910 }
1911 #endif
1912
1913 /*
1914  * Find the object set given the object number of its dataset object
1915  * and return its details in *objset
1916  */
1917 static int
1918 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1919 {
1920         dnode_phys_t dataset;
1921         dsl_dataset_phys_t *ds;
1922
1923         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1924                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1925                 return (EIO);
1926         }
1927
1928         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1929         if (zio_read(spa, &ds->ds_bp, objset)) {
1930                 printf("ZFS: can't read object set for dataset %ju\n",
1931                     (uintmax_t)objnum);
1932                 return (EIO);
1933         }
1934
1935         return (0);
1936 }
1937
1938 /*
1939  * Find the object set pointed to by the BOOTFS property or the root
1940  * dataset if there is none and return its details in *objset
1941  */
1942 static int
1943 zfs_get_root(const spa_t *spa, uint64_t *objid)
1944 {
1945         dnode_phys_t dir, propdir;
1946         uint64_t props, bootfs, root;
1947
1948         *objid = 0;
1949
1950         /*
1951          * Start with the MOS directory object.
1952          */
1953         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1954                 printf("ZFS: can't read MOS object directory\n");
1955                 return (EIO);
1956         }
1957
1958         /*
1959          * Lookup the pool_props and see if we can find a bootfs.
1960          */
1961         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1962              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1963              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1964              && bootfs != 0)
1965         {
1966                 *objid = bootfs;
1967                 return (0);
1968         }
1969         /*
1970          * Lookup the root dataset directory
1971          */
1972         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1973             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1974                 printf("ZFS: can't find root dsl_dir\n");
1975                 return (EIO);
1976         }
1977
1978         /*
1979          * Use the information from the dataset directory's bonus buffer
1980          * to find the dataset object and from that the object set itself.
1981          */
1982         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1983         *objid = dd->dd_head_dataset_obj;
1984         return (0);
1985 }
1986
1987 static int
1988 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
1989 {
1990
1991         mount->spa = spa;
1992
1993         /*
1994          * Find the root object set if not explicitly provided
1995          */
1996         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
1997                 printf("ZFS: can't find root filesystem\n");
1998                 return (EIO);
1999         }
2000
2001         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2002                 printf("ZFS: can't open root filesystem\n");
2003                 return (EIO);
2004         }
2005
2006         mount->rootobj = rootobj;
2007
2008         return (0);
2009 }
2010
2011 static int
2012 zfs_spa_init(spa_t *spa)
2013 {
2014
2015         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2016                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2017                 return (EIO);
2018         }
2019         if (spa->spa_mos.os_type != DMU_OST_META) {
2020                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2021                 return (EIO);
2022         }
2023         return (0);
2024 }
2025
2026 static int
2027 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2028 {
2029
2030         if (dn->dn_bonustype != DMU_OT_SA) {
2031                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2032
2033                 sb->st_mode = zp->zp_mode;
2034                 sb->st_uid = zp->zp_uid;
2035                 sb->st_gid = zp->zp_gid;
2036                 sb->st_size = zp->zp_size;
2037         } else {
2038                 sa_hdr_phys_t *sahdrp;
2039                 int hdrsize;
2040                 size_t size = 0;
2041                 void *buf = NULL;
2042
2043                 if (dn->dn_bonuslen != 0)
2044                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2045                 else {
2046                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2047                                 blkptr_t *bp = &dn->dn_spill;
2048                                 int error;
2049
2050                                 size = BP_GET_LSIZE(bp);
2051                                 buf = zfs_alloc(size);
2052                                 error = zio_read(spa, bp, buf);
2053                                 if (error != 0) {
2054                                         zfs_free(buf, size);
2055                                         return (error);
2056                                 }
2057                                 sahdrp = buf;
2058                         } else {
2059                                 return (EIO);
2060                         }
2061                 }
2062                 hdrsize = SA_HDR_SIZE(sahdrp);
2063                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2064                     SA_MODE_OFFSET);
2065                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2066                     SA_UID_OFFSET);
2067                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2068                     SA_GID_OFFSET);
2069                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2070                     SA_SIZE_OFFSET);
2071                 if (buf != NULL)
2072                         zfs_free(buf, size);
2073         }
2074
2075         return (0);
2076 }
2077
2078 /*
2079  * Lookup a file and return its dnode.
2080  */
2081 static int
2082 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2083 {
2084         int rc;
2085         uint64_t objnum, rootnum, parentnum;
2086         const spa_t *spa;
2087         dnode_phys_t dn;
2088         const char *p, *q;
2089         char element[256];
2090         char path[1024];
2091         int symlinks_followed = 0;
2092         struct stat sb;
2093
2094         spa = mount->spa;
2095         if (mount->objset.os_type != DMU_OST_ZFS) {
2096                 printf("ZFS: unexpected object set type %ju\n",
2097                     (uintmax_t)mount->objset.os_type);
2098                 return (EIO);
2099         }
2100
2101         /*
2102          * Get the root directory dnode.
2103          */
2104         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2105         if (rc)
2106                 return (rc);
2107
2108         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
2109         if (rc)
2110                 return (rc);
2111
2112         rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn);
2113         if (rc)
2114                 return (rc);
2115
2116         objnum = rootnum;
2117         p = upath;
2118         while (p && *p) {
2119                 while (*p == '/')
2120                         p++;
2121                 if (!*p)
2122                         break;
2123                 q = strchr(p, '/');
2124                 if (q) {
2125                         memcpy(element, p, q - p);
2126                         element[q - p] = 0;
2127                         p = q;
2128                 } else {
2129                         strcpy(element, p);
2130                         p = 0;
2131                 }
2132
2133                 rc = zfs_dnode_stat(spa, &dn, &sb);
2134                 if (rc)
2135                         return (rc);
2136                 if (!S_ISDIR(sb.st_mode))
2137                         return (ENOTDIR);
2138
2139                 parentnum = objnum;
2140                 rc = zap_lookup(spa, &dn, element, &objnum);
2141                 if (rc)
2142                         return (rc);
2143                 objnum = ZFS_DIRENT_OBJ(objnum);
2144
2145                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2146                 if (rc)
2147                         return (rc);
2148
2149                 /*
2150                  * Check for symlink.
2151                  */
2152                 rc = zfs_dnode_stat(spa, &dn, &sb);
2153                 if (rc)
2154                         return (rc);
2155                 if (S_ISLNK(sb.st_mode)) {
2156                         if (symlinks_followed > 10)
2157                                 return (EMLINK);
2158                         symlinks_followed++;
2159
2160                         /*
2161                          * Read the link value and copy the tail of our
2162                          * current path onto the end.
2163                          */
2164                         if (p)
2165                                 strcpy(&path[sb.st_size], p);
2166                         else
2167                                 path[sb.st_size] = 0;
2168                         /*
2169                          * Second test is purely to silence bogus compiler
2170                          * warning about accessing past the end of dn_bonus.
2171                          */
2172                         if (sb.st_size + sizeof(znode_phys_t) <=
2173                             dn.dn_bonuslen && sizeof(znode_phys_t) <=
2174                             sizeof(dn.dn_bonus)) {
2175                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
2176                                         sb.st_size);
2177                         } else {
2178                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
2179                                 if (rc)
2180                                         return (rc);
2181                         }
2182
2183                         /*
2184                          * Restart with the new path, starting either at
2185                          * the root or at the parent depending whether or
2186                          * not the link is relative.
2187                          */
2188                         p = path;
2189                         if (*p == '/')
2190                                 objnum = rootnum;
2191                         else
2192                                 objnum = parentnum;
2193                         objset_get_dnode(spa, &mount->objset, objnum, &dn);
2194                 }
2195         }
2196
2197         *dnode = dn;
2198         return (0);
2199 }