]> CyberLeo.Net >> Repos - FreeBSD/releng/10.2.git/blob - sys/boot/zfs/zfsimpl.c
- Copy stable/10@285827 to releng/10.2 in preparation for 10.2-RC1
[FreeBSD/releng/10.2.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/stat.h>
35 #include <sys/stdint.h>
36
37 #include "zfsimpl.h"
38 #include "zfssubr.c"
39
40
41 struct zfsmount {
42         const spa_t     *spa;
43         objset_phys_t   objset;
44         uint64_t        rootobj;
45 };
46
47 /*
48  * List of all vdevs, chained through v_alllink.
49  */
50 static vdev_list_t zfs_vdevs;
51
52  /*
53  * List of ZFS features supported for read
54  */
55 static const char *features_for_read[] = {
56         "org.illumos:lz4_compress",
57         "com.delphix:hole_birth",
58         "com.delphix:extensible_dataset",
59         "com.delphix:embedded_data",
60         "org.open-zfs:large_blocks",
61         NULL
62 };
63
64 /*
65  * List of all pools, chained through spa_link.
66  */
67 static spa_list_t zfs_pools;
68
69 static uint64_t zfs_crc64_table[256];
70 static const dnode_phys_t *dnode_cache_obj = 0;
71 static uint64_t dnode_cache_bn;
72 static char *dnode_cache_buf;
73 static char *zap_scratch;
74 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
75
76 #define TEMP_SIZE       (1024 * 1024)
77
78 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
79 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
80 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
81
82 static void
83 zfs_init(void)
84 {
85         STAILQ_INIT(&zfs_vdevs);
86         STAILQ_INIT(&zfs_pools);
87
88         zfs_temp_buf = malloc(TEMP_SIZE);
89         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
90         zfs_temp_ptr = zfs_temp_buf;
91         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
92         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
93
94         zfs_init_crc();
95 }
96
97 static void *
98 zfs_alloc(size_t size)
99 {
100         char *ptr;
101
102         if (zfs_temp_ptr + size > zfs_temp_end) {
103                 printf("ZFS: out of temporary buffer space\n");
104                 for (;;) ;
105         }
106         ptr = zfs_temp_ptr;
107         zfs_temp_ptr += size;
108
109         return (ptr);
110 }
111
112 static void
113 zfs_free(void *ptr, size_t size)
114 {
115
116         zfs_temp_ptr -= size;
117         if (zfs_temp_ptr != ptr) {
118                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
119                 for (;;) ;
120         }
121 }
122
/*
 * Decode one big-endian 32-bit integer from the XDR stream.
 *
 * xdr: in/out cursor; advanced by 4 bytes.
 * ip:  receives the decoded value.
 *
 * Always returns 0.  No bounds checking is done on the stream.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *src = *xdr;
	uint32_t raw;

	/*
	 * Assemble in an unsigned type: an unsigned char promotes to int,
	 * and shifting a value >= 0x80 left by 24 in a signed int is
	 * undefined behaviour.
	 */
	raw = ((uint32_t)src[0] << 24)
		| ((uint32_t)src[1] << 16)
		| ((uint32_t)src[2] << 8)
		| ((uint32_t)src[3] << 0);
	*ip = (int)raw;
	*xdr = src + 4;
	return (0);
}
133
/*
 * Decode one big-endian unsigned 32-bit integer from the XDR stream.
 *
 * xdr: in/out cursor; advanced by 4 bytes.
 * ip:  receives the decoded value.
 *
 * Always returns 0.  No bounds checking is done on the stream.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	const unsigned char *src = *xdr;
	uint32_t raw;

	/*
	 * Shift in an unsigned type; shifting the int-promoted byte left by
	 * 24 is undefined when its top bit is set.
	 */
	raw = ((uint32_t)src[0] << 24)
		| ((uint32_t)src[1] << 16)
		| ((uint32_t)src[2] << 8)
		| ((uint32_t)src[3] << 0);
	*ip = raw;
	*xdr = src + 4;
	return (0);
}
144
145 static int
146 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
147 {
148         u_int hi, lo;
149
150         xdr_u_int(xdr, &hi);
151         xdr_u_int(xdr, &lo);
152         *lp = (((uint64_t) hi) << 32) | lo;
153         return (0);
154 }
155
156 static int
157 nvlist_find(const unsigned char *nvlist, const char *name, int type,
158             int* elementsp, void *valuep)
159 {
160         const unsigned char *p, *pair;
161         int junk;
162         int encoded_size, decoded_size;
163
164         p = nvlist;
165         xdr_int(&p, &junk);
166         xdr_int(&p, &junk);
167
168         pair = p;
169         xdr_int(&p, &encoded_size);
170         xdr_int(&p, &decoded_size);
171         while (encoded_size && decoded_size) {
172                 int namelen, pairtype, elements;
173                 const char *pairname;
174
175                 xdr_int(&p, &namelen);
176                 pairname = (const char*) p;
177                 p += roundup(namelen, 4);
178                 xdr_int(&p, &pairtype);
179
180                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
181                         xdr_int(&p, &elements);
182                         if (elementsp)
183                                 *elementsp = elements;
184                         if (type == DATA_TYPE_UINT64) {
185                                 xdr_uint64_t(&p, (uint64_t *) valuep);
186                                 return (0);
187                         } else if (type == DATA_TYPE_STRING) {
188                                 int len;
189                                 xdr_int(&p, &len);
190                                 (*(const char**) valuep) = (const char*) p;
191                                 return (0);
192                         } else if (type == DATA_TYPE_NVLIST
193                                    || type == DATA_TYPE_NVLIST_ARRAY) {
194                                 (*(const unsigned char**) valuep) =
195                                          (const unsigned char*) p;
196                                 return (0);
197                         } else {
198                                 return (EIO);
199                         }
200                 } else {
201                         /*
202                          * Not the pair we are looking for, skip to the next one.
203                          */
204                         p = pair + encoded_size;
205                 }
206
207                 pair = p;
208                 xdr_int(&p, &encoded_size);
209                 xdr_int(&p, &decoded_size);
210         }
211
212         return (EIO);
213 }
214
215 static int
216 nvlist_check_features_for_read(const unsigned char *nvlist)
217 {
218         const unsigned char *p, *pair;
219         int junk;
220         int encoded_size, decoded_size;
221         int rc;
222
223         rc = 0;
224
225         p = nvlist;
226         xdr_int(&p, &junk);
227         xdr_int(&p, &junk);
228
229         pair = p;
230         xdr_int(&p, &encoded_size);
231         xdr_int(&p, &decoded_size);
232         while (encoded_size && decoded_size) {
233                 int namelen, pairtype;
234                 const char *pairname;
235                 int i, found;
236
237                 found = 0;
238
239                 xdr_int(&p, &namelen);
240                 pairname = (const char*) p;
241                 p += roundup(namelen, 4);
242                 xdr_int(&p, &pairtype);
243
244                 for (i = 0; features_for_read[i] != NULL; i++) {
245                         if (!memcmp(pairname, features_for_read[i], namelen)) {
246                                 found = 1;
247                                 break;
248                         }
249                 }
250
251                 if (!found) {
252                         printf("ZFS: unsupported feature: %s\n", pairname);
253                         rc = EIO;
254                 }
255
256                 p = pair + encoded_size;
257
258                 pair = p;
259                 xdr_int(&p, &encoded_size);
260                 xdr_int(&p, &decoded_size);
261         }
262
263         return (rc);
264 }
265
/*
 * Step over one encoded nvlist and return the address just past it, which
 * is where the next nvlist of an nvlist array begins.
 *
 * The two leading 32-bit words are skipped, then pairs are hopped over by
 * their encoded size until a pair header with a zero size word is read.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int ignored;
	int enc_len, dec_len;

	cursor = nvlist;
	xdr_int(&cursor, &ignored);
	xdr_int(&cursor, &ignored);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		cursor = pair_start + enc_len;
	}

	return cursor;
}
293
294 #ifdef TEST
295
296 static const unsigned char *
297 nvlist_print(const unsigned char *nvlist, unsigned int indent)
298 {
299         static const char* typenames[] = {
300                 "DATA_TYPE_UNKNOWN",
301                 "DATA_TYPE_BOOLEAN",
302                 "DATA_TYPE_BYTE",
303                 "DATA_TYPE_INT16",
304                 "DATA_TYPE_UINT16",
305                 "DATA_TYPE_INT32",
306                 "DATA_TYPE_UINT32",
307                 "DATA_TYPE_INT64",
308                 "DATA_TYPE_UINT64",
309                 "DATA_TYPE_STRING",
310                 "DATA_TYPE_BYTE_ARRAY",
311                 "DATA_TYPE_INT16_ARRAY",
312                 "DATA_TYPE_UINT16_ARRAY",
313                 "DATA_TYPE_INT32_ARRAY",
314                 "DATA_TYPE_UINT32_ARRAY",
315                 "DATA_TYPE_INT64_ARRAY",
316                 "DATA_TYPE_UINT64_ARRAY",
317                 "DATA_TYPE_STRING_ARRAY",
318                 "DATA_TYPE_HRTIME",
319                 "DATA_TYPE_NVLIST",
320                 "DATA_TYPE_NVLIST_ARRAY",
321                 "DATA_TYPE_BOOLEAN_VALUE",
322                 "DATA_TYPE_INT8",
323                 "DATA_TYPE_UINT8",
324                 "DATA_TYPE_BOOLEAN_ARRAY",
325                 "DATA_TYPE_INT8_ARRAY",
326                 "DATA_TYPE_UINT8_ARRAY"
327         };
328
329         unsigned int i, j;
330         const unsigned char *p, *pair;
331         int junk;
332         int encoded_size, decoded_size;
333
334         p = nvlist;
335         xdr_int(&p, &junk);
336         xdr_int(&p, &junk);
337
338         pair = p;
339         xdr_int(&p, &encoded_size);
340         xdr_int(&p, &decoded_size);
341         while (encoded_size && decoded_size) {
342                 int namelen, pairtype, elements;
343                 const char *pairname;
344
345                 xdr_int(&p, &namelen);
346                 pairname = (const char*) p;
347                 p += roundup(namelen, 4);
348                 xdr_int(&p, &pairtype);
349
350                 for (i = 0; i < indent; i++)
351                         printf(" ");
352                 printf("%s %s", typenames[pairtype], pairname);
353
354                 xdr_int(&p, &elements);
355                 switch (pairtype) {
356                 case DATA_TYPE_UINT64: {
357                         uint64_t val;
358                         xdr_uint64_t(&p, &val);
359                         printf(" = 0x%jx\n", (uintmax_t)val);
360                         break;
361                 }
362
363                 case DATA_TYPE_STRING: {
364                         int len;
365                         xdr_int(&p, &len);
366                         printf(" = \"%s\"\n", p);
367                         break;
368                 }
369
370                 case DATA_TYPE_NVLIST:
371                         printf("\n");
372                         nvlist_print(p, indent + 1);
373                         break;
374
375                 case DATA_TYPE_NVLIST_ARRAY:
376                         for (j = 0; j < elements; j++) {
377                                 printf("[%d]\n", j);
378                                 p = nvlist_print(p, indent + 1);
379                                 if (j != elements - 1) {
380                                         for (i = 0; i < indent; i++)
381                                                 printf(" ");
382                                         printf("%s %s", typenames[pairtype], pairname);
383                                 }
384                         }
385                         break;
386
387                 default:
388                         printf("\n");
389                 }
390
391                 p = pair + encoded_size;
392
393                 pair = p;
394                 xdr_int(&p, &encoded_size);
395                 xdr_int(&p, &decoded_size);
396         }
397
398         return p;
399 }
400
401 #endif
402
403 static int
404 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
405     off_t offset, size_t size)
406 {
407         size_t psize;
408         int rc;
409
410         if (!vdev->v_phys_read)
411                 return (EIO);
412
413         if (bp) {
414                 psize = BP_GET_PSIZE(bp);
415         } else {
416                 psize = size;
417         }
418
419         /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
420         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
421         if (rc)
422                 return (rc);
423         if (bp && zio_checksum_verify(bp, buf))
424                 return (EIO);
425
426         return (0);
427 }
428
429 static int
430 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
431     off_t offset, size_t bytes)
432 {
433
434         return (vdev_read_phys(vdev, bp, buf,
435                 offset + VDEV_LABEL_START_SIZE, bytes));
436 }
437
438
439 static int
440 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
441     off_t offset, size_t bytes)
442 {
443         vdev_t *kid;
444         int rc;
445
446         rc = EIO;
447         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
448                 if (kid->v_state != VDEV_STATE_HEALTHY)
449                         continue;
450                 rc = kid->v_read(kid, bp, buf, offset, bytes);
451                 if (!rc)
452                         return (0);
453         }
454
455         return (rc);
456 }
457
458 static int
459 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
460     off_t offset, size_t bytes)
461 {
462         vdev_t *kid;
463
464         /*
465          * Here we should have two kids:
466          * First one which is the one we are replacing and we can trust
467          * only this one to have valid data, but it might not be present.
468          * Second one is that one we are replacing with. It is most likely
469          * healthy, but we can't trust it has needed data, so we won't use it.
470          */
471         kid = STAILQ_FIRST(&vdev->v_children);
472         if (kid == NULL)
473                 return (EIO);
474         if (kid->v_state != VDEV_STATE_HEALTHY)
475                 return (EIO);
476         return (kid->v_read(kid, bp, buf, offset, bytes));
477 }
478
479 static vdev_t *
480 vdev_find(uint64_t guid)
481 {
482         vdev_t *vdev;
483
484         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
485                 if (vdev->v_guid == guid)
486                         return (vdev);
487
488         return (0);
489 }
490
491 static vdev_t *
492 vdev_create(uint64_t guid, vdev_read_t *read)
493 {
494         vdev_t *vdev;
495
496         vdev = malloc(sizeof(vdev_t));
497         memset(vdev, 0, sizeof(vdev_t));
498         STAILQ_INIT(&vdev->v_children);
499         vdev->v_guid = guid;
500         vdev->v_state = VDEV_STATE_OFFLINE;
501         vdev->v_read = read;
502         vdev->v_phys_read = 0;
503         vdev->v_read_priv = 0;
504         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
505
506         return (vdev);
507 }
508
509 static int
510 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
511     vdev_t **vdevp, int is_newer)
512 {
513         int rc;
514         uint64_t guid, id, ashift, nparity;
515         const char *type;
516         const char *path;
517         vdev_t *vdev, *kid;
518         const unsigned char *kids;
519         int nkids, i, is_new;
520         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
521
522         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
523                         DATA_TYPE_UINT64, 0, &guid)
524             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
525                            DATA_TYPE_UINT64, 0, &id)
526             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
527                            DATA_TYPE_STRING, 0, &type)) {
528                 printf("ZFS: can't find vdev details\n");
529                 return (ENOENT);
530         }
531
532         if (strcmp(type, VDEV_TYPE_MIRROR)
533             && strcmp(type, VDEV_TYPE_DISK)
534 #ifdef ZFS_TEST
535             && strcmp(type, VDEV_TYPE_FILE)
536 #endif
537             && strcmp(type, VDEV_TYPE_RAIDZ)
538             && strcmp(type, VDEV_TYPE_REPLACING)) {
539                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
540                 return (EIO);
541         }
542
543         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
544
545         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
546                         &is_offline);
547         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
548                         &is_removed);
549         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
550                         &is_faulted);
551         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
552                         &is_degraded);
553         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
554                         &isnt_present);
555
556         vdev = vdev_find(guid);
557         if (!vdev) {
558                 is_new = 1;
559
560                 if (!strcmp(type, VDEV_TYPE_MIRROR))
561                         vdev = vdev_create(guid, vdev_mirror_read);
562                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
563                         vdev = vdev_create(guid, vdev_raidz_read);
564                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
565                         vdev = vdev_create(guid, vdev_replacing_read);
566                 else
567                         vdev = vdev_create(guid, vdev_disk_read);
568
569                 vdev->v_id = id;
570                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
571                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
572                         DATA_TYPE_UINT64, 0, &ashift) == 0)
573                         vdev->v_ashift = ashift;
574                 else
575                         vdev->v_ashift = 0;
576                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
577                         DATA_TYPE_UINT64, 0, &nparity) == 0)
578                         vdev->v_nparity = nparity;
579                 else
580                         vdev->v_nparity = 0;
581                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
582                                 DATA_TYPE_STRING, 0, &path) == 0) {
583                         if (strncmp(path, "/dev/", 5) == 0)
584                                 path += 5;
585                         vdev->v_name = strdup(path);
586                 } else {
587                         if (!strcmp(type, "raidz")) {
588                                 if (vdev->v_nparity == 1)
589                                         vdev->v_name = "raidz1";
590                                 else if (vdev->v_nparity == 2)
591                                         vdev->v_name = "raidz2";
592                                 else if (vdev->v_nparity == 3)
593                                         vdev->v_name = "raidz3";
594                                 else {
595                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
596                                         return (EIO);
597                                 }
598                         } else {
599                                 vdev->v_name = strdup(type);
600                         }
601                 }
602         } else {
603                 is_new = 0;
604         }
605
606         if (is_new || is_newer) {
607                 /*
608                  * This is either new vdev or we've already seen this vdev,
609                  * but from an older vdev label, so let's refresh its state
610                  * from the newer label.
611                  */
612                 if (is_offline)
613                         vdev->v_state = VDEV_STATE_OFFLINE;
614                 else if (is_removed)
615                         vdev->v_state = VDEV_STATE_REMOVED;
616                 else if (is_faulted)
617                         vdev->v_state = VDEV_STATE_FAULTED;
618                 else if (is_degraded)
619                         vdev->v_state = VDEV_STATE_DEGRADED;
620                 else if (isnt_present)
621                         vdev->v_state = VDEV_STATE_CANT_OPEN;
622         }
623
624         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
625                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
626         /*
627          * Its ok if we don't have any kids.
628          */
629         if (rc == 0) {
630                 vdev->v_nchildren = nkids;
631                 for (i = 0; i < nkids; i++) {
632                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
633                         if (rc)
634                                 return (rc);
635                         if (is_new)
636                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
637                                                    v_childlink);
638                         kids = nvlist_next(kids);
639                 }
640         } else {
641                 vdev->v_nchildren = 0;
642         }
643
644         if (vdevp)
645                 *vdevp = vdev;
646         return (0);
647 }
648
649 static void
650 vdev_set_state(vdev_t *vdev)
651 {
652         vdev_t *kid;
653         int good_kids;
654         int bad_kids;
655
656         /*
657          * A mirror or raidz is healthy if all its kids are healthy. A
658          * mirror is degraded if any of its kids is healthy; a raidz
659          * is degraded if at most nparity kids are offline.
660          */
661         if (STAILQ_FIRST(&vdev->v_children)) {
662                 good_kids = 0;
663                 bad_kids = 0;
664                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
665                         if (kid->v_state == VDEV_STATE_HEALTHY)
666                                 good_kids++;
667                         else
668                                 bad_kids++;
669                 }
670                 if (bad_kids == 0) {
671                         vdev->v_state = VDEV_STATE_HEALTHY;
672                 } else {
673                         if (vdev->v_read == vdev_mirror_read) {
674                                 if (good_kids) {
675                                         vdev->v_state = VDEV_STATE_DEGRADED;
676                                 } else {
677                                         vdev->v_state = VDEV_STATE_OFFLINE;
678                                 }
679                         } else if (vdev->v_read == vdev_raidz_read) {
680                                 if (bad_kids > vdev->v_nparity) {
681                                         vdev->v_state = VDEV_STATE_OFFLINE;
682                                 } else {
683                                         vdev->v_state = VDEV_STATE_DEGRADED;
684                                 }
685                         }
686                 }
687         }
688 }
689
690 static spa_t *
691 spa_find_by_guid(uint64_t guid)
692 {
693         spa_t *spa;
694
695         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
696                 if (spa->spa_guid == guid)
697                         return (spa);
698
699         return (0);
700 }
701
702 static spa_t *
703 spa_find_by_name(const char *name)
704 {
705         spa_t *spa;
706
707         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
708                 if (!strcmp(spa->spa_name, name))
709                         return (spa);
710
711         return (0);
712 }
713
714 #ifdef BOOT2
715 static spa_t *
716 spa_get_primary(void)
717 {
718
719         return (STAILQ_FIRST(&zfs_pools));
720 }
721
722 static vdev_t *
723 spa_get_primary_vdev(const spa_t *spa)
724 {
725         vdev_t *vdev;
726         vdev_t *kid;
727
728         if (spa == NULL)
729                 spa = spa_get_primary();
730         if (spa == NULL)
731                 return (NULL);
732         vdev = STAILQ_FIRST(&spa->spa_vdevs);
733         if (vdev == NULL)
734                 return (NULL);
735         for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
736              kid = STAILQ_FIRST(&vdev->v_children))
737                 vdev = kid;
738         return (vdev);
739 }
740 #endif
741
742 static spa_t *
743 spa_create(uint64_t guid)
744 {
745         spa_t *spa;
746
747         spa = malloc(sizeof(spa_t));
748         memset(spa, 0, sizeof(spa_t));
749         STAILQ_INIT(&spa->spa_vdevs);
750         spa->spa_guid = guid;
751         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
752
753         return (spa);
754 }
755
756 static const char *
757 state_name(vdev_state_t state)
758 {
759         static const char* names[] = {
760                 "UNKNOWN",
761                 "CLOSED",
762                 "OFFLINE",
763                 "REMOVED",
764                 "CANT_OPEN",
765                 "FAULTED",
766                 "DEGRADED",
767                 "ONLINE"
768         };
769         return names[state];
770 }
771
772 #ifdef BOOT2
773
774 #define pager_printf printf
775
776 #else
777
/*
 * printf-style wrapper around pager_output(): format into a local line
 * buffer and hand the result to the pager.
 *
 * Output longer than the buffer is truncated rather than written past the
 * end of the 80-byte array.
 *
 * NOTE(review): this needs vsnprintf() from the boot environment's C
 * library; confirm it is available before merging.
 */
static void
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	pager_output(line);
}
789
790 #endif
791
792 #define STATUS_FORMAT   "        %s %s\n"
793
794 static void
795 print_state(int indent, const char *name, vdev_state_t state)
796 {
797         int i;
798         char buf[512];
799
800         buf[0] = 0;
801         for (i = 0; i < indent; i++)
802                 strcat(buf, "  ");
803         strcat(buf, name);
804         pager_printf(STATUS_FORMAT, buf, state_name(state));
805         
806 }
807
808 static void
809 vdev_status(vdev_t *vdev, int indent)
810 {
811         vdev_t *kid;
812         print_state(indent, vdev->v_name, vdev->v_state);
813
814         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
815                 vdev_status(kid, indent + 1);
816         }
817 }
818
819 static void
820 spa_status(spa_t *spa)
821 {
822         static char bootfs[ZFS_MAXNAMELEN];
823         uint64_t rootid;
824         vdev_t *vdev;
825         int good_kids, bad_kids, degraded_kids;
826         vdev_state_t state;
827
828         pager_printf("  pool: %s\n", spa->spa_name);
829         if (zfs_get_root(spa, &rootid) == 0 &&
830             zfs_rlookup(spa, rootid, bootfs) == 0) {
831                 if (bootfs[0] == '\0')
832                         pager_printf("bootfs: %s\n", spa->spa_name);
833                 else
834                         pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs);
835         }
836         pager_printf("config:\n\n");
837         pager_printf(STATUS_FORMAT, "NAME", "STATE");
838
839         good_kids = 0;
840         degraded_kids = 0;
841         bad_kids = 0;
842         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
843                 if (vdev->v_state == VDEV_STATE_HEALTHY)
844                         good_kids++;
845                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
846                         degraded_kids++;
847                 else
848                         bad_kids++;
849         }
850
851         state = VDEV_STATE_CLOSED;
852         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
853                 state = VDEV_STATE_HEALTHY;
854         else if ((good_kids + degraded_kids) > 0)
855                 state = VDEV_STATE_DEGRADED;
856
857         print_state(0, spa->spa_name, state);
858         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
859                 vdev_status(vdev, 1);
860         }
861 }
862
863 static void
864 spa_all_status(void)
865 {
866         spa_t *spa;
867         int first = 1;
868
869         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
870                 if (!first)
871                         pager_printf("\n");
872                 first = 0;
873                 spa_status(spa);
874         }
875 }
876
877 static int
878 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
879 {
880         vdev_t vtmp;
881         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
882         spa_t *spa;
883         vdev_t *vdev, *top_vdev, *pool_vdev;
884         off_t off;
885         blkptr_t bp;
886         const unsigned char *nvlist;
887         uint64_t val;
888         uint64_t guid;
889         uint64_t pool_txg, pool_guid;
890         uint64_t is_log;
891         const char *pool_name;
892         const unsigned char *vdevs;
893         const unsigned char *features;
894         int i, rc, is_newer;
895         char *upbuf;
896         const struct uberblock *up;
897
898         /*
899          * Load the vdev label and figure out which
900          * uberblock is most current.
901          */
902         memset(&vtmp, 0, sizeof(vtmp));
903         vtmp.v_phys_read = read;
904         vtmp.v_read_priv = read_priv;
905         off = offsetof(vdev_label_t, vl_vdev_phys);
906         BP_ZERO(&bp);
907         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
908         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
909         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
910         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
911         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
912         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
913         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
914                 return (EIO);
915
916         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
917                 return (EIO);
918         }
919
920         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
921
922         if (nvlist_find(nvlist,
923                         ZPOOL_CONFIG_VERSION,
924                         DATA_TYPE_UINT64, 0, &val)) {
925                 return (EIO);
926         }
927
928         if (!SPA_VERSION_IS_SUPPORTED(val)) {
929                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
930                     (unsigned) val, (unsigned) SPA_VERSION);
931                 return (EIO);
932         }
933
934         /* Check ZFS features for read */
935         if (nvlist_find(nvlist,
936                         ZPOOL_CONFIG_FEATURES_FOR_READ,
937                         DATA_TYPE_NVLIST, 0, &features) == 0
938             && nvlist_check_features_for_read(features) != 0)
939                 return (EIO);
940
941         if (nvlist_find(nvlist,
942                         ZPOOL_CONFIG_POOL_STATE,
943                         DATA_TYPE_UINT64, 0, &val)) {
944                 return (EIO);
945         }
946
947         if (val == POOL_STATE_DESTROYED) {
948                 /* We don't boot from destroyed pools. */
949                 return (EIO);
950         }
951
952         if (nvlist_find(nvlist,
953                         ZPOOL_CONFIG_POOL_TXG,
954                         DATA_TYPE_UINT64, 0, &pool_txg)
955             || nvlist_find(nvlist,
956                            ZPOOL_CONFIG_POOL_GUID,
957                            DATA_TYPE_UINT64, 0, &pool_guid)
958             || nvlist_find(nvlist,
959                            ZPOOL_CONFIG_POOL_NAME,
960                            DATA_TYPE_STRING, 0, &pool_name)) {
961                 /*
962                  * Cache and spare devices end up here - just ignore
963                  * them.
964                  */
965                 /*printf("ZFS: can't find pool details\n");*/
966                 return (EIO);
967         }
968
969         is_log = 0;
970         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
971             &is_log);
972         if (is_log)
973                 return (EIO);
974
975         /*
976          * Create the pool if this is the first time we've seen it.
977          */
978         spa = spa_find_by_guid(pool_guid);
979         if (!spa) {
980                 spa = spa_create(pool_guid);
981                 spa->spa_name = strdup(pool_name);
982         }
983         if (pool_txg > spa->spa_txg) {
984                 spa->spa_txg = pool_txg;
985                 is_newer = 1;
986         } else
987                 is_newer = 0;
988
989         /*
990          * Get the vdev tree and create our in-core copy of it.
991          * If we already have a vdev with this guid, this must
992          * be some kind of alias (overlapping slices, dangerously dedicated
993          * disks etc).
994          */
995         if (nvlist_find(nvlist,
996                         ZPOOL_CONFIG_GUID,
997                         DATA_TYPE_UINT64, 0, &guid)) {
998                 return (EIO);
999         }
1000         vdev = vdev_find(guid);
1001         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
1002                 return (EIO);
1003
1004         if (nvlist_find(nvlist,
1005                         ZPOOL_CONFIG_VDEV_TREE,
1006                         DATA_TYPE_NVLIST, 0, &vdevs)) {
1007                 return (EIO);
1008         }
1009
1010         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1011         if (rc)
1012                 return (rc);
1013
1014         /*
1015          * Add the toplevel vdev to the pool if it's not already there.
1016          */
1017         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1018                 if (top_vdev == pool_vdev)
1019                         break;
1020         if (!pool_vdev && top_vdev)
1021                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1022
1023         /*
1024          * We should already have created an incomplete vdev for this
1025          * vdev. Find it and initialise it with our read proc.
1026          */
1027         vdev = vdev_find(guid);
1028         if (vdev) {
1029                 vdev->v_phys_read = read;
1030                 vdev->v_read_priv = read_priv;
1031                 vdev->v_state = VDEV_STATE_HEALTHY;
1032         } else {
1033                 printf("ZFS: inconsistent nvlist contents\n");
1034                 return (EIO);
1035         }
1036
1037         /*
1038          * Re-evaluate top-level vdev state.
1039          */
1040         vdev_set_state(top_vdev);
1041
1042         /*
1043          * Ok, we are happy with the pool so far. Lets find
1044          * the best uberblock and then we can actually access
1045          * the contents of the pool.
1046          */
1047         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1048         up = (const struct uberblock *)upbuf;
1049         for (i = 0;
1050              i < VDEV_UBERBLOCK_COUNT(vdev);
1051              i++) {
1052                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
1053                 BP_ZERO(&bp);
1054                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
1055                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1056                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1057                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1058                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1059                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1060
1061                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1062                         continue;
1063
1064                 if (up->ub_magic != UBERBLOCK_MAGIC)
1065                         continue;
1066                 if (up->ub_txg < spa->spa_txg)
1067                         continue;
1068                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
1069                         spa->spa_uberblock = *up;
1070                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
1071                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
1072                                 spa->spa_uberblock = *up;
1073                 }
1074         }
1075         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1076
1077         if (spap)
1078                 *spap = spa;
1079         return (0);
1080 }
1081
/*
 * Return the base-2 logarithm of n when n is an exact power of two,
 * or -1 when it is not (this includes zero and negative values).
 *
 * Only bits 0..30 are probed: shifting 1 into the sign bit of an int
 * is undefined behaviour, and no positive int has that bit set anyway.
 */
static int
ilog2(int n)
{
        int v;

        if (n <= 0)
                return (-1);

        for (v = 0; v < 31; v++)
                if (n == (1 << v))
                        return (v);
        return (-1);
}
1092
1093 static int
1094 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1095 {
1096         blkptr_t gbh_bp;
1097         zio_gbh_phys_t zio_gb;
1098         char *pbuf;
1099         int i;
1100
1101         /* Artificial BP for gang block header. */
1102         gbh_bp = *bp;
1103         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1104         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1105         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1106         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1107         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1108                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1109
1110         /* Read gang header block using the artificial BP. */
1111         if (zio_read(spa, &gbh_bp, &zio_gb))
1112                 return (EIO);
1113
1114         pbuf = buf;
1115         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1116                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1117
1118                 if (BP_IS_HOLE(gbp))
1119                         continue;
1120                 if (zio_read(spa, gbp, pbuf))
1121                         return (EIO);
1122                 pbuf += BP_GET_PSIZE(gbp);
1123         }
1124
1125         if (zio_checksum_verify(bp, buf))
1126                 return (EIO);
1127         return (0);
1128 }
1129
1130 static int
1131 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1132 {
1133         int cpfunc = BP_GET_COMPRESS(bp);
1134         uint64_t align, size;
1135         void *pbuf;
1136         int i, error;
1137
1138         /*
1139          * Process data embedded in block pointer
1140          */
1141         if (BP_IS_EMBEDDED(bp)) {
1142                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1143
1144                 size = BPE_GET_PSIZE(bp);
1145                 ASSERT(size <= BPE_PAYLOAD_SIZE);
1146
1147                 if (cpfunc != ZIO_COMPRESS_OFF)
1148                         pbuf = zfs_alloc(size);
1149                 else
1150                         pbuf = buf;
1151
1152                 decode_embedded_bp_compressed(bp, pbuf);
1153                 error = 0;
1154
1155                 if (cpfunc != ZIO_COMPRESS_OFF) {
1156                         error = zio_decompress_data(cpfunc, pbuf,
1157                             size, buf, BP_GET_LSIZE(bp));
1158                         zfs_free(pbuf, size);
1159                 }
1160                 if (error != 0)
1161                         printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1162                             error);
1163                 return (error);
1164         }
1165
1166         error = EIO;
1167
1168         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1169                 const dva_t *dva = &bp->blk_dva[i];
1170                 vdev_t *vdev;
1171                 int vdevid;
1172                 off_t offset;
1173
1174                 if (!dva->dva_word[0] && !dva->dva_word[1])
1175                         continue;
1176
1177                 vdevid = DVA_GET_VDEV(dva);
1178                 offset = DVA_GET_OFFSET(dva);
1179                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1180                         if (vdev->v_id == vdevid)
1181                                 break;
1182                 }
1183                 if (!vdev || !vdev->v_read)
1184                         continue;
1185
1186                 size = BP_GET_PSIZE(bp);
1187                 if (vdev->v_read == vdev_raidz_read) {
1188                         align = 1ULL << vdev->v_top->v_ashift;
1189                         if (P2PHASE(size, align) != 0)
1190                                 size = P2ROUNDUP(size, align);
1191                 }
1192                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1193                         pbuf = zfs_alloc(size);
1194                 else
1195                         pbuf = buf;
1196
1197                 if (DVA_GET_GANG(dva))
1198                         error = zio_read_gang(spa, bp, pbuf);
1199                 else
1200                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1201                 if (error == 0) {
1202                         if (cpfunc != ZIO_COMPRESS_OFF)
1203                                 error = zio_decompress_data(cpfunc, pbuf,
1204                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1205                         else if (size != BP_GET_PSIZE(bp))
1206                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1207                 }
1208                 if (buf != pbuf)
1209                         zfs_free(pbuf, size);
1210                 if (error == 0)
1211                         break;
1212         }
1213         if (error != 0)
1214                 printf("ZFS: i/o error - all block copies unavailable\n");
1215         return (error);
1216 }
1217
1218 static int
1219 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1220 {
1221         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1222         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1223         int nlevels = dnode->dn_nlevels;
1224         int i, rc;
1225
1226         if (bsize > SPA_MAXBLOCKSIZE) {
1227                 printf("ZFS: I/O error - blocks larger than 128K are not supported\n");
1228                 return (EIO);
1229         }
1230
1231         /*
1232          * Note: bsize may not be a power of two here so we need to do an
1233          * actual divide rather than a bitshift.
1234          */
1235         while (buflen > 0) {
1236                 uint64_t bn = offset / bsize;
1237                 int boff = offset % bsize;
1238                 int ibn;
1239                 const blkptr_t *indbp;
1240                 blkptr_t bp;
1241
1242                 if (bn > dnode->dn_maxblkid)
1243                         return (EIO);
1244
1245                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1246                         goto cached;
1247
1248                 indbp = dnode->dn_blkptr;
1249                 for (i = 0; i < nlevels; i++) {
1250                         /*
1251                          * Copy the bp from the indirect array so that
1252                          * we can re-use the scratch buffer for multi-level
1253                          * objects.
1254                          */
1255                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1256                         ibn &= ((1 << ibshift) - 1);
1257                         bp = indbp[ibn];
1258                         if (BP_IS_HOLE(&bp)) {
1259                                 memset(dnode_cache_buf, 0, bsize);
1260                                 break;
1261                         }
1262                         rc = zio_read(spa, &bp, dnode_cache_buf);
1263                         if (rc)
1264                                 return (rc);
1265                         indbp = (const blkptr_t *) dnode_cache_buf;
1266                 }
1267                 dnode_cache_obj = dnode;
1268                 dnode_cache_bn = bn;
1269         cached:
1270
1271                 /*
1272                  * The buffer contains our data block. Copy what we
1273                  * need from it and loop.
1274                  */ 
1275                 i = bsize - boff;
1276                 if (i > buflen) i = buflen;
1277                 memcpy(buf, &dnode_cache_buf[boff], i);
1278                 buf = ((char*) buf) + i;
1279                 offset += i;
1280                 buflen -= i;
1281         }
1282
1283         return (0);
1284 }
1285
1286 /*
1287  * Lookup a value in a microzap directory. Assumes that the zap
1288  * scratch buffer contains the directory contents.
1289  */
1290 static int
1291 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1292 {
1293         const mzap_phys_t *mz;
1294         const mzap_ent_phys_t *mze;
1295         size_t size;
1296         int chunks, i;
1297
1298         /*
1299          * Microzap objects use exactly one block. Read the whole
1300          * thing.
1301          */
1302         size = dnode->dn_datablkszsec * 512;
1303
1304         mz = (const mzap_phys_t *) zap_scratch;
1305         chunks = size / MZAP_ENT_LEN - 1;
1306
1307         for (i = 0; i < chunks; i++) {
1308                 mze = &mz->mz_chunk[i];
1309                 if (!strcmp(mze->mze_name, name)) {
1310                         *value = mze->mze_value;
1311                         return (0);
1312                 }
1313         }
1314
1315         return (ENOENT);
1316 }
1317
1318 /*
1319  * Compare a name with a zap leaf entry. Return non-zero if the name
1320  * matches.
1321  */
1322 static int
1323 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1324 {
1325         size_t namelen;
1326         const zap_leaf_chunk_t *nc;
1327         const char *p;
1328
1329         namelen = zc->l_entry.le_name_numints;
1330                         
1331         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1332         p = name;
1333         while (namelen > 0) {
1334                 size_t len;
1335                 len = namelen;
1336                 if (len > ZAP_LEAF_ARRAY_BYTES)
1337                         len = ZAP_LEAF_ARRAY_BYTES;
1338                 if (memcmp(p, nc->l_array.la_array, len))
1339                         return (0);
1340                 p += len;
1341                 namelen -= len;
1342                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1343         }
1344
1345         return 1;
1346 }
1347
1348 /*
1349  * Extract a uint64_t value from a zap leaf entry.
1350  */
1351 static uint64_t
1352 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1353 {
1354         const zap_leaf_chunk_t *vc;
1355         int i;
1356         uint64_t value;
1357         const uint8_t *p;
1358
1359         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1360         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1361                 value = (value << 8) | p[i];
1362         }
1363
1364         return value;
1365 }
1366
1367 /*
1368  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1369  * buffer contains the directory header.
1370  */
1371 static int
1372 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1373 {
1374         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1375         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1376         fat_zap_t z;
1377         uint64_t *ptrtbl;
1378         uint64_t hash;
1379         int rc;
1380
1381         if (zh.zap_magic != ZAP_MAGIC)
1382                 return (EIO);
1383
1384         z.zap_block_shift = ilog2(bsize);
1385         z.zap_phys = (zap_phys_t *) zap_scratch;
1386
1387         /*
1388          * Figure out where the pointer table is and read it in if necessary.
1389          */
1390         if (zh.zap_ptrtbl.zt_blk) {
1391                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1392                                zap_scratch, bsize);
1393                 if (rc)
1394                         return (rc);
1395                 ptrtbl = (uint64_t *) zap_scratch;
1396         } else {
1397                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1398         }
1399
1400         hash = zap_hash(zh.zap_salt, name);
1401
1402         zap_leaf_t zl;
1403         zl.l_bs = z.zap_block_shift;
1404
1405         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1406         zap_leaf_chunk_t *zc;
1407
1408         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1409         if (rc)
1410                 return (rc);
1411
1412         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1413
1414         /*
1415          * Make sure this chunk matches our hash.
1416          */
1417         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1418             && zl.l_phys->l_hdr.lh_prefix
1419             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1420                 return (ENOENT);
1421
1422         /*
1423          * Hash within the chunk to find our entry.
1424          */
1425         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1426         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1427         h = zl.l_phys->l_hash[h];
1428         if (h == 0xffff)
1429                 return (ENOENT);
1430         zc = &ZAP_LEAF_CHUNK(&zl, h);
1431         while (zc->l_entry.le_hash != hash) {
1432                 if (zc->l_entry.le_next == 0xffff) {
1433                         zc = 0;
1434                         break;
1435                 }
1436                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1437         }
1438         if (fzap_name_equal(&zl, zc, name)) {
1439                 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > 8)
1440                         return (E2BIG);
1441                 *value = fzap_leaf_value(&zl, zc);
1442                 return (0);
1443         }
1444
1445         return (ENOENT);
1446 }
1447
1448 /*
1449  * Lookup a name in a zap object and return its value as a uint64_t.
1450  */
1451 static int
1452 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1453 {
1454         int rc;
1455         uint64_t zap_type;
1456         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1457
1458         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1459         if (rc)
1460                 return (rc);
1461
1462         zap_type = *(uint64_t *) zap_scratch;
1463         if (zap_type == ZBT_MICRO)
1464                 return mzap_lookup(dnode, name, value);
1465         else if (zap_type == ZBT_HEADER)
1466                 return fzap_lookup(spa, dnode, name, value);
1467         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1468         return (EIO);
1469 }
1470
1471 /*
1472  * List a microzap directory. Assumes that the zap scratch buffer contains
1473  * the directory contents.
1474  */
1475 static int
1476 mzap_list(const dnode_phys_t *dnode)
1477 {
1478         const mzap_phys_t *mz;
1479         const mzap_ent_phys_t *mze;
1480         size_t size;
1481         int chunks, i;
1482
1483         /*
1484          * Microzap objects use exactly one block. Read the whole
1485          * thing.
1486          */
1487         size = dnode->dn_datablkszsec * 512;
1488         mz = (const mzap_phys_t *) zap_scratch;
1489         chunks = size / MZAP_ENT_LEN - 1;
1490
1491         for (i = 0; i < chunks; i++) {
1492                 mze = &mz->mz_chunk[i];
1493                 if (mze->mze_name[0])
1494                         //printf("%-32s 0x%jx\n", mze->mze_name, (uintmax_t)mze->mze_value);
1495                         printf("%s\n", mze->mze_name);
1496         }
1497
1498         return (0);
1499 }
1500
1501 /*
1502  * List a fatzap directory. Assumes that the zap scratch buffer contains
1503  * the directory header.
1504  */
1505 static int
1506 fzap_list(const spa_t *spa, const dnode_phys_t *dnode)
1507 {
1508         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1509         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1510         fat_zap_t z;
1511         int i, j;
1512
1513         if (zh.zap_magic != ZAP_MAGIC)
1514                 return (EIO);
1515
1516         z.zap_block_shift = ilog2(bsize);
1517         z.zap_phys = (zap_phys_t *) zap_scratch;
1518
1519         /*
1520          * This assumes that the leaf blocks start at block 1. The
1521          * documentation isn't exactly clear on this.
1522          */
1523         zap_leaf_t zl;
1524         zl.l_bs = z.zap_block_shift;
1525         for (i = 0; i < zh.zap_num_leafs; i++) {
1526                 off_t off = (i + 1) << zl.l_bs;
1527                 char name[256], *p;
1528                 uint64_t value;
1529
1530                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1531                         return (EIO);
1532
1533                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1534
1535                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1536                         zap_leaf_chunk_t *zc, *nc;
1537                         int namelen;
1538
1539                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1540                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1541                                 continue;
1542                         namelen = zc->l_entry.le_name_numints;
1543                         if (namelen > sizeof(name))
1544                                 namelen = sizeof(name);
1545
1546                         /*
1547                          * Paste the name back together.
1548                          */
1549                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1550                         p = name;
1551                         while (namelen > 0) {
1552                                 int len;
1553                                 len = namelen;
1554                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1555                                         len = ZAP_LEAF_ARRAY_BYTES;
1556                                 memcpy(p, nc->l_array.la_array, len);
1557                                 p += len;
1558                                 namelen -= len;
1559                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1560                         }
1561
1562                         /*
1563                          * Assume the first eight bytes of the value are
1564                          * a uint64_t.
1565                          */
1566                         value = fzap_leaf_value(&zl, zc);
1567
1568                         //printf("%s 0x%jx\n", name, (uintmax_t)value);
1569                         printf("%s\n", name);
1570                 }
1571         }
1572
1573         return (0);
1574 }
1575
1576 /*
1577  * List a zap directory.
1578  */
1579 static int
1580 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1581 {
1582         uint64_t zap_type;
1583         size_t size = dnode->dn_datablkszsec * 512;
1584
1585         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1586                 return (EIO);
1587
1588         zap_type = *(uint64_t *) zap_scratch;
1589         if (zap_type == ZBT_MICRO)
1590                 return mzap_list(dnode);
1591         else
1592                 return fzap_list(spa, dnode);
1593 }
1594
1595 static int
1596 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1597 {
1598         off_t offset;
1599
1600         offset = objnum * sizeof(dnode_phys_t);
1601         return dnode_read(spa, &os->os_meta_dnode, offset,
1602                 dnode, sizeof(dnode_phys_t));
1603 }
1604
1605 static int
1606 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1607 {
1608         const mzap_phys_t *mz;
1609         const mzap_ent_phys_t *mze;
1610         size_t size;
1611         int chunks, i;
1612
1613         /*
1614          * Microzap objects use exactly one block. Read the whole
1615          * thing.
1616          */
1617         size = dnode->dn_datablkszsec * 512;
1618
1619         mz = (const mzap_phys_t *) zap_scratch;
1620         chunks = size / MZAP_ENT_LEN - 1;
1621
1622         for (i = 0; i < chunks; i++) {
1623                 mze = &mz->mz_chunk[i];
1624                 if (value == mze->mze_value) {
1625                         strcpy(name, mze->mze_name);
1626                         return (0);
1627                 }
1628         }
1629
1630         return (ENOENT);
1631 }
1632
1633 static void
1634 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1635 {
1636         size_t namelen;
1637         const zap_leaf_chunk_t *nc;
1638         char *p;
1639
1640         namelen = zc->l_entry.le_name_numints;
1641
1642         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1643         p = name;
1644         while (namelen > 0) {
1645                 size_t len;
1646                 len = namelen;
1647                 if (len > ZAP_LEAF_ARRAY_BYTES)
1648                         len = ZAP_LEAF_ARRAY_BYTES;
1649                 memcpy(p, nc->l_array.la_array, len);
1650                 p += len;
1651                 namelen -= len;
1652                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1653         }
1654
1655         *p = '\0';
1656 }
1657
1658 static int
1659 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1660 {
1661         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1662         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1663         fat_zap_t z;
1664         int i, j;
1665
1666         if (zh.zap_magic != ZAP_MAGIC)
1667                 return (EIO);
1668
1669         z.zap_block_shift = ilog2(bsize);
1670         z.zap_phys = (zap_phys_t *) zap_scratch;
1671
1672         /*
1673          * This assumes that the leaf blocks start at block 1. The
1674          * documentation isn't exactly clear on this.
1675          */
1676         zap_leaf_t zl;
1677         zl.l_bs = z.zap_block_shift;
1678         for (i = 0; i < zh.zap_num_leafs; i++) {
1679                 off_t off = (i + 1) << zl.l_bs;
1680
1681                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1682                         return (EIO);
1683
1684                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1685
1686                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1687                         zap_leaf_chunk_t *zc;
1688
1689                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1690                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1691                                 continue;
1692                         if (zc->l_entry.le_value_intlen != 8 ||
1693                             zc->l_entry.le_value_numints != 1)
1694                                 continue;
1695
1696                         if (fzap_leaf_value(&zl, zc) == value) {
1697                                 fzap_name_copy(&zl, zc, name);
1698                                 return (0);
1699                         }
1700                 }
1701         }
1702
1703         return (ENOENT);
1704 }
1705
1706 static int
1707 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1708 {
1709         int rc;
1710         uint64_t zap_type;
1711         size_t size = dnode->dn_datablkszsec * 512;
1712
1713         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1714         if (rc)
1715                 return (rc);
1716
1717         zap_type = *(uint64_t *) zap_scratch;
1718         if (zap_type == ZBT_MICRO)
1719                 return mzap_rlookup(spa, dnode, name, value);
1720         else
1721                 return fzap_rlookup(spa, dnode, name, value);
1722 }
1723
1724 static int
1725 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1726 {
1727         char name[256];
1728         char component[256];
1729         uint64_t dir_obj, parent_obj, child_dir_zapobj;
1730         dnode_phys_t child_dir_zap, dataset, dir, parent;
1731         dsl_dir_phys_t *dd;
1732         dsl_dataset_phys_t *ds;
1733         char *p;
1734         int len;
1735
1736         p = &name[sizeof(name) - 1];
1737         *p = '\0';
1738
1739         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1740                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1741                 return (EIO);
1742         }
1743         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1744         dir_obj = ds->ds_dir_obj;
1745
1746         for (;;) {
1747                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1748                         return (EIO);
1749                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1750
1751                 /* Actual loop condition. */
1752                 parent_obj  = dd->dd_parent_obj;
1753                 if (parent_obj == 0)
1754                         break;
1755
1756                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1757                         return (EIO);
1758                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1759                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1760                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1761                         return (EIO);
1762                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1763                         return (EIO);
1764
1765                 len = strlen(component);
1766                 p -= len;
1767                 memcpy(p, component, len);
1768                 --p;
1769                 *p = '/';
1770
1771                 /* Actual loop iteration. */
1772                 dir_obj = parent_obj;
1773         }
1774
1775         if (*p != '\0')
1776                 ++p;
1777         strcpy(result, p);
1778
1779         return (0);
1780 }
1781
1782 static int
1783 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1784 {
1785         char element[256];
1786         uint64_t dir_obj, child_dir_zapobj;
1787         dnode_phys_t child_dir_zap, dir;
1788         dsl_dir_phys_t *dd;
1789         const char *p, *q;
1790
1791         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1792                 return (EIO);
1793         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &dir_obj))
1794                 return (EIO);
1795
1796         p = name;
1797         for (;;) {
1798                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1799                         return (EIO);
1800                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1801
1802                 while (*p == '/')
1803                         p++;
1804                 /* Actual loop condition #1. */
1805                 if (*p == '\0')
1806                         break;
1807
1808                 q = strchr(p, '/');
1809                 if (q) {
1810                         memcpy(element, p, q - p);
1811                         element[q - p] = '\0';
1812                         p = q + 1;
1813                 } else {
1814                         strcpy(element, p);
1815                         p += strlen(p);
1816                 }
1817
1818                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1819                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1820                         return (EIO);
1821
1822                 /* Actual loop condition #2. */
1823                 if (zap_lookup(spa, &child_dir_zap, element, &dir_obj) != 0)
1824                         return (ENOENT);
1825         }
1826
1827         *objnum = dd->dd_head_dataset_obj;
1828         return (0);
1829 }
1830
1831 #ifndef BOOT2
1832 static int
1833 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1834 {
1835         uint64_t dir_obj, child_dir_zapobj;
1836         dnode_phys_t child_dir_zap, dir, dataset;
1837         dsl_dataset_phys_t *ds;
1838         dsl_dir_phys_t *dd;
1839
1840         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1841                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1842                 return (EIO);
1843         }
1844         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1845         dir_obj = ds->ds_dir_obj;
1846
1847         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
1848                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1849                 return (EIO);
1850         }
1851         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1852
1853         child_dir_zapobj = dd->dd_child_dir_zapobj;
1854         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
1855                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1856                 return (EIO);
1857         }
1858
1859         return (zap_list(spa, &child_dir_zap) != 0);
1860 }
1861 #endif
1862
1863 /*
1864  * Find the object set given the object number of its dataset object
1865  * and return its details in *objset
1866  */
1867 static int
1868 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1869 {
1870         dnode_phys_t dataset;
1871         dsl_dataset_phys_t *ds;
1872
1873         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1874                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1875                 return (EIO);
1876         }
1877
1878         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1879         if (zio_read(spa, &ds->ds_bp, objset)) {
1880                 printf("ZFS: can't read object set for dataset %ju\n",
1881                     (uintmax_t)objnum);
1882                 return (EIO);
1883         }
1884
1885         return (0);
1886 }
1887
1888 /*
1889  * Find the object set pointed to by the BOOTFS property or the root
1890  * dataset if there is none and return its details in *objset
1891  */
1892 static int
1893 zfs_get_root(const spa_t *spa, uint64_t *objid)
1894 {
1895         dnode_phys_t dir, propdir;
1896         uint64_t props, bootfs, root;
1897
1898         *objid = 0;
1899
1900         /*
1901          * Start with the MOS directory object.
1902          */
1903         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1904                 printf("ZFS: can't read MOS object directory\n");
1905                 return (EIO);
1906         }
1907
1908         /*
1909          * Lookup the pool_props and see if we can find a bootfs.
1910          */
1911         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1912              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1913              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1914              && bootfs != 0)
1915         {
1916                 *objid = bootfs;
1917                 return (0);
1918         }
1919         /*
1920          * Lookup the root dataset directory
1921          */
1922         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1923             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1924                 printf("ZFS: can't find root dsl_dir\n");
1925                 return (EIO);
1926         }
1927
1928         /*
1929          * Use the information from the dataset directory's bonus buffer
1930          * to find the dataset object and from that the object set itself.
1931          */
1932         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1933         *objid = dd->dd_head_dataset_obj;
1934         return (0);
1935 }
1936
1937 static int
1938 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
1939 {
1940
1941         mount->spa = spa;
1942
1943         /*
1944          * Find the root object set if not explicitly provided
1945          */
1946         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
1947                 printf("ZFS: can't find root filesystem\n");
1948                 return (EIO);
1949         }
1950
1951         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
1952                 printf("ZFS: can't open root filesystem\n");
1953                 return (EIO);
1954         }
1955
1956         mount->rootobj = rootobj;
1957
1958         return (0);
1959 }
1960
1961 static int
1962 zfs_spa_init(spa_t *spa)
1963 {
1964
1965         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1966                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
1967                 return (EIO);
1968         }
1969         if (spa->spa_mos.os_type != DMU_OST_META) {
1970                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
1971                 return (EIO);
1972         }
1973         return (0);
1974 }
1975
1976 static int
1977 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
1978 {
1979
1980         if (dn->dn_bonustype != DMU_OT_SA) {
1981                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
1982
1983                 sb->st_mode = zp->zp_mode;
1984                 sb->st_uid = zp->zp_uid;
1985                 sb->st_gid = zp->zp_gid;
1986                 sb->st_size = zp->zp_size;
1987         } else {
1988                 sa_hdr_phys_t *sahdrp;
1989                 int hdrsize;
1990                 size_t size = 0;
1991                 void *buf = NULL;
1992
1993                 if (dn->dn_bonuslen != 0)
1994                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
1995                 else {
1996                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
1997                                 blkptr_t *bp = &dn->dn_spill;
1998                                 int error;
1999
2000                                 size = BP_GET_LSIZE(bp);
2001                                 buf = zfs_alloc(size);
2002                                 error = zio_read(spa, bp, buf);
2003                                 if (error != 0) {
2004                                         zfs_free(buf, size);
2005                                         return (error);
2006                                 }
2007                                 sahdrp = buf;
2008                         } else {
2009                                 return (EIO);
2010                         }
2011                 }
2012                 hdrsize = SA_HDR_SIZE(sahdrp);
2013                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2014                     SA_MODE_OFFSET);
2015                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2016                     SA_UID_OFFSET);
2017                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2018                     SA_GID_OFFSET);
2019                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2020                     SA_SIZE_OFFSET);
2021                 if (buf != NULL)
2022                         zfs_free(buf, size);
2023         }
2024
2025         return (0);
2026 }
2027
2028 /*
2029  * Lookup a file and return its dnode.
2030  */
2031 static int
2032 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2033 {
2034         int rc;
2035         uint64_t objnum, rootnum, parentnum;
2036         const spa_t *spa;
2037         dnode_phys_t dn;
2038         const char *p, *q;
2039         char element[256];
2040         char path[1024];
2041         int symlinks_followed = 0;
2042         struct stat sb;
2043
2044         spa = mount->spa;
2045         if (mount->objset.os_type != DMU_OST_ZFS) {
2046                 printf("ZFS: unexpected object set type %ju\n",
2047                     (uintmax_t)mount->objset.os_type);
2048                 return (EIO);
2049         }
2050
2051         /*
2052          * Get the root directory dnode.
2053          */
2054         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2055         if (rc)
2056                 return (rc);
2057
2058         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
2059         if (rc)
2060                 return (rc);
2061
2062         rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn);
2063         if (rc)
2064                 return (rc);
2065
2066         objnum = rootnum;
2067         p = upath;
2068         while (p && *p) {
2069                 while (*p == '/')
2070                         p++;
2071                 if (!*p)
2072                         break;
2073                 q = strchr(p, '/');
2074                 if (q) {
2075                         memcpy(element, p, q - p);
2076                         element[q - p] = 0;
2077                         p = q;
2078                 } else {
2079                         strcpy(element, p);
2080                         p = 0;
2081                 }
2082
2083                 rc = zfs_dnode_stat(spa, &dn, &sb);
2084                 if (rc)
2085                         return (rc);
2086                 if (!S_ISDIR(sb.st_mode))
2087                         return (ENOTDIR);
2088
2089                 parentnum = objnum;
2090                 rc = zap_lookup(spa, &dn, element, &objnum);
2091                 if (rc)
2092                         return (rc);
2093                 objnum = ZFS_DIRENT_OBJ(objnum);
2094
2095                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2096                 if (rc)
2097                         return (rc);
2098
2099                 /*
2100                  * Check for symlink.
2101                  */
2102                 rc = zfs_dnode_stat(spa, &dn, &sb);
2103                 if (rc)
2104                         return (rc);
2105                 if (S_ISLNK(sb.st_mode)) {
2106                         if (symlinks_followed > 10)
2107                                 return (EMLINK);
2108                         symlinks_followed++;
2109
2110                         /*
2111                          * Read the link value and copy the tail of our
2112                          * current path onto the end.
2113                          */
2114                         if (p)
2115                                 strcpy(&path[sb.st_size], p);
2116                         else
2117                                 path[sb.st_size] = 0;
2118                         if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
2119                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
2120                                         sb.st_size);
2121                         } else {
2122                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
2123                                 if (rc)
2124                                         return (rc);
2125                         }
2126
2127                         /*
2128                          * Restart with the new path, starting either at
2129                          * the root or at the parent depending whether or
2130                          * not the link is relative.
2131                          */
2132                         p = path;
2133                         if (*p == '/')
2134                                 objnum = rootnum;
2135                         else
2136                                 objnum = parentnum;
2137                         objset_get_dnode(spa, &mount->objset, objnum, &dn);
2138                 }
2139         }
2140
2141         *dnode = dn;
2142         return (0);
2143 }