]> CyberLeo.Net >> Repos - FreeBSD/releng/8.0.git/blob - sys/boot/zfs/zfsimpl.c
Adjust to reflect 8.0-RELEASE.
[FreeBSD/releng/8.0.git] / sys / boot / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include "zfsimpl.h"
35 #include "zfssubr.c"
36
37 /*
38  * List of all vdevs, chained through v_alllink.
39  */
40 static vdev_list_t zfs_vdevs;
41
42 /*
43  * List of all pools, chained through spa_link.
44  */
45 static spa_list_t zfs_pools;
46
47 static uint64_t zfs_crc64_table[256];
48 static const dnode_phys_t *dnode_cache_obj = 0;
49 static uint64_t dnode_cache_bn;
50 static char *dnode_cache_buf;
51 static char *zap_scratch;
52 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
53
54 #define TEMP_SIZE       (1*SPA_MAXBLOCKSIZE)
55
56 static void
57 zfs_init(void)
58 {
59         STAILQ_INIT(&zfs_vdevs);
60         STAILQ_INIT(&zfs_pools);
61
62         zfs_temp_buf = malloc(TEMP_SIZE);
63         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
64         zfs_temp_ptr = zfs_temp_buf;
65         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
66         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
67
68         zfs_init_crc();
69 }
70
71 static char *
72 zfs_alloc_temp(size_t sz)
73 {
74         char *p;
75
76         if (zfs_temp_ptr + sz > zfs_temp_end) {
77                 printf("ZFS: out of temporary buffer space\n");
78                 for (;;) ;
79         }
80         p = zfs_temp_ptr;
81         zfs_temp_ptr += sz;
82
83         return (p);
84 }
85
86 static void
87 zfs_reset_temp(void)
88 {
89
90         zfs_temp_ptr = zfs_temp_buf;
91 }
92
/*
 * Decode one big-endian 32-bit integer from *xdr into *ip and advance
 * *xdr by four bytes.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *b = *xdr;
	uint32_t v;

	/*
	 * Assemble the value as unsigned.  An unsigned char promotes to
	 * int, so shifting a byte >= 0x80 left by 24 would shift into
	 * the sign bit, which is undefined behaviour.
	 */
	v = ((uint32_t)b[0] << 24)
		| ((uint32_t)b[1] << 16)
		| ((uint32_t)b[2] << 8)
		| ((uint32_t)b[3] << 0);
	*ip = (int)v;
	*xdr = b + 4;
	return (0);
}
103
/*
 * Decode one big-endian unsigned 32-bit integer from *xdr into *ip and
 * advance *xdr by four bytes.  Always returns 0.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	const unsigned char *b = *xdr;
	uint32_t v;

	/*
	 * Widen each byte to an unsigned type before shifting; shifting
	 * the promoted (signed) int left by 24 is undefined for bytes
	 * >= 0x80.
	 */
	v = ((uint32_t)b[0] << 24)
		| ((uint32_t)b[1] << 16)
		| ((uint32_t)b[2] << 8)
		| ((uint32_t)b[3] << 0);
	*ip = v;
	*xdr = b + 4;
	return (0);
}
114
115 static int
116 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
117 {
118         u_int hi, lo;
119
120         xdr_u_int(xdr, &hi);
121         xdr_u_int(xdr, &lo);
122         *lp = (((uint64_t) hi) << 32) | lo;
123         return (0);
124 }
125
126 static int
127 nvlist_find(const unsigned char *nvlist, const char *name, int type,
128             int* elementsp, void *valuep)
129 {
130         const unsigned char *p, *pair;
131         int junk;
132         int encoded_size, decoded_size;
133
134         p = nvlist;
135         xdr_int(&p, &junk);
136         xdr_int(&p, &junk);
137
138         pair = p;
139         xdr_int(&p, &encoded_size);
140         xdr_int(&p, &decoded_size);
141         while (encoded_size && decoded_size) {
142                 int namelen, pairtype, elements;
143                 const char *pairname;
144
145                 xdr_int(&p, &namelen);
146                 pairname = (const char*) p;
147                 p += roundup(namelen, 4);
148                 xdr_int(&p, &pairtype);
149
150                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
151                         xdr_int(&p, &elements);
152                         if (elementsp)
153                                 *elementsp = elements;
154                         if (type == DATA_TYPE_UINT64) {
155                                 xdr_uint64_t(&p, (uint64_t *) valuep);
156                                 return (0);
157                         } else if (type == DATA_TYPE_STRING) {
158                                 int len;
159                                 xdr_int(&p, &len);
160                                 (*(const char**) valuep) = (const char*) p;
161                                 return (0);
162                         } else if (type == DATA_TYPE_NVLIST
163                                    || type == DATA_TYPE_NVLIST_ARRAY) {
164                                 (*(const unsigned char**) valuep) =
165                                          (const unsigned char*) p;
166                                 return (0);
167                         } else {
168                                 return (EIO);
169                         }
170                 } else {
171                         /*
172                          * Not the pair we are looking for, skip to the next one.
173                          */
174                         p = pair + encoded_size;
175                 }
176
177                 pair = p;
178                 xdr_int(&p, &encoded_size);
179                 xdr_int(&p, &decoded_size);
180         }
181
182         return (EIO);
183 }
184
/*
 * Step over one nvlist of an nvlist array: skip its two header words
 * and every pair in it, and return the address just past the
 * terminating pair (the one whose encoded or decoded size is zero).
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cur, *start;
	int ignored;
	int enc, dec;

	cur = nvlist;
	xdr_int(&cur, &ignored);
	xdr_int(&cur, &ignored);

	for (;;) {
		start = cur;
		xdr_int(&cur, &enc);
		xdr_int(&cur, &dec);
		if (!enc || !dec)
			break;
		/* A pair's encoded size is measured from its first byte. */
		cur = start + enc;
	}

	return cur;
}
212
213 #ifdef TEST
214
215 static const unsigned char *
216 nvlist_print(const unsigned char *nvlist, unsigned int indent)
217 {
218         static const char* typenames[] = {
219                 "DATA_TYPE_UNKNOWN",
220                 "DATA_TYPE_BOOLEAN",
221                 "DATA_TYPE_BYTE",
222                 "DATA_TYPE_INT16",
223                 "DATA_TYPE_UINT16",
224                 "DATA_TYPE_INT32",
225                 "DATA_TYPE_UINT32",
226                 "DATA_TYPE_INT64",
227                 "DATA_TYPE_UINT64",
228                 "DATA_TYPE_STRING",
229                 "DATA_TYPE_BYTE_ARRAY",
230                 "DATA_TYPE_INT16_ARRAY",
231                 "DATA_TYPE_UINT16_ARRAY",
232                 "DATA_TYPE_INT32_ARRAY",
233                 "DATA_TYPE_UINT32_ARRAY",
234                 "DATA_TYPE_INT64_ARRAY",
235                 "DATA_TYPE_UINT64_ARRAY",
236                 "DATA_TYPE_STRING_ARRAY",
237                 "DATA_TYPE_HRTIME",
238                 "DATA_TYPE_NVLIST",
239                 "DATA_TYPE_NVLIST_ARRAY",
240                 "DATA_TYPE_BOOLEAN_VALUE",
241                 "DATA_TYPE_INT8",
242                 "DATA_TYPE_UINT8",
243                 "DATA_TYPE_BOOLEAN_ARRAY",
244                 "DATA_TYPE_INT8_ARRAY",
245                 "DATA_TYPE_UINT8_ARRAY"
246         };
247
248         unsigned int i, j;
249         const unsigned char *p, *pair;
250         int junk;
251         int encoded_size, decoded_size;
252
253         p = nvlist;
254         xdr_int(&p, &junk);
255         xdr_int(&p, &junk);
256
257         pair = p;
258         xdr_int(&p, &encoded_size);
259         xdr_int(&p, &decoded_size);
260         while (encoded_size && decoded_size) {
261                 int namelen, pairtype, elements;
262                 const char *pairname;
263
264                 xdr_int(&p, &namelen);
265                 pairname = (const char*) p;
266                 p += roundup(namelen, 4);
267                 xdr_int(&p, &pairtype);
268
269                 for (i = 0; i < indent; i++)
270                         printf(" ");
271                 printf("%s %s", typenames[pairtype], pairname);
272
273                 xdr_int(&p, &elements);
274                 switch (pairtype) {
275                 case DATA_TYPE_UINT64: {
276                         uint64_t val;
277                         xdr_uint64_t(&p, &val);
278                         printf(" = 0x%llx\n", val);
279                         break;
280                 }
281
282                 case DATA_TYPE_STRING: {
283                         int len;
284                         xdr_int(&p, &len);
285                         printf(" = \"%s\"\n", p);
286                         break;
287                 }
288
289                 case DATA_TYPE_NVLIST:
290                         printf("\n");
291                         nvlist_print(p, indent + 1);
292                         break;
293
294                 case DATA_TYPE_NVLIST_ARRAY:
295                         for (j = 0; j < elements; j++) {
296                                 printf("[%d]\n", j);
297                                 p = nvlist_print(p, indent + 1);
298                                 if (j != elements - 1) {
299                                         for (i = 0; i < indent; i++)
300                                                 printf(" ");
301                                         printf("%s %s", typenames[pairtype], pairname);
302                                 }
303                         }
304                         break;
305
306                 default:
307                         printf("\n");
308                 }
309
310                 p = pair + encoded_size;
311
312                 pair = p;
313                 xdr_int(&p, &encoded_size);
314                 xdr_int(&p, &decoded_size);
315         }
316
317         return p;
318 }
319
320 #endif
321
322 static int
323 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
324     off_t offset, size_t size)
325 {
326         size_t psize;
327         int rc;
328
329         if (bp) {
330                 psize = BP_GET_PSIZE(bp);
331         } else {
332                 psize = size;
333         }
334
335         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
336         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
337         if (rc)
338                 return (rc);
339         if (bp && zio_checksum_error(bp, buf))
340                 return (EIO);
341
342         return (0);
343 }
344
345 static int
346 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
347     off_t offset, size_t bytes)
348 {
349
350         return (vdev_read_phys(vdev, bp, buf,
351                 offset + VDEV_LABEL_START_SIZE, bytes));
352 }
353
354
355 static int
356 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
357     off_t offset, size_t bytes)
358 {
359         vdev_t *kid;
360         int rc;
361
362         rc = EIO;
363         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
364                 if (kid->v_state != VDEV_STATE_HEALTHY)
365                         continue;
366                 rc = kid->v_read(kid, bp, buf, offset, bytes);
367                 if (!rc)
368                         return (0);
369         }
370
371         return (rc);
372 }
373
374 static vdev_t *
375 vdev_find(uint64_t guid)
376 {
377         vdev_t *vdev;
378
379         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
380                 if (vdev->v_guid == guid)
381                         return (vdev);
382
383         return (0);
384 }
385
386 static vdev_t *
387 vdev_create(uint64_t guid, vdev_read_t *read)
388 {
389         vdev_t *vdev;
390
391         vdev = malloc(sizeof(vdev_t));
392         memset(vdev, 0, sizeof(vdev_t));
393         STAILQ_INIT(&vdev->v_children);
394         vdev->v_guid = guid;
395         vdev->v_state = VDEV_STATE_OFFLINE;
396         vdev->v_read = read;
397         vdev->v_phys_read = 0;
398         vdev->v_read_priv = 0;
399         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
400
401         return (vdev);
402 }
403
404 static int
405 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
406 {
407         int rc;
408         uint64_t guid, id, ashift, nparity;
409         const char *type;
410         const char *path;
411         vdev_t *vdev, *kid;
412         const unsigned char *kids;
413         int nkids, i;
414
415         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
416                         DATA_TYPE_UINT64, 0, &guid)
417             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
418                            DATA_TYPE_UINT64, 0, &id)
419             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
420                            DATA_TYPE_STRING, 0, &type)) {
421                 printf("ZFS: can't find vdev details\n");
422                 return (ENOENT);
423         }
424
425         /*
426          * Assume that if we've seen this vdev tree before, this one
427          * will be identical.
428          */
429         vdev = vdev_find(guid);
430         if (vdev) {
431                 if (vdevp)
432                         *vdevp = vdev;
433                 return (0);
434         }
435
436         if (strcmp(type, VDEV_TYPE_MIRROR)
437             && strcmp(type, VDEV_TYPE_DISK)
438             && strcmp(type, VDEV_TYPE_RAIDZ)) {
439                 printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
440                 return (EIO);
441         }
442
443         if (!strcmp(type, VDEV_TYPE_MIRROR))
444                 vdev = vdev_create(guid, vdev_mirror_read);
445         else if (!strcmp(type, VDEV_TYPE_RAIDZ))
446                 vdev = vdev_create(guid, vdev_raidz_read);
447         else
448                 vdev = vdev_create(guid, vdev_disk_read);
449
450         vdev->v_id = id;
451         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
452                 DATA_TYPE_UINT64, 0, &ashift) == 0)
453                 vdev->v_ashift = ashift;
454         else
455                 vdev->v_ashift = 0;
456         if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
457                 DATA_TYPE_UINT64, 0, &nparity) == 0)
458                 vdev->v_nparity = nparity;
459         else
460                 vdev->v_nparity = 0;
461         if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
462                         DATA_TYPE_STRING, 0, &path) == 0) {
463                 if (strlen(path) > 5
464                     && path[0] == '/'
465                     && path[1] == 'd'
466                     && path[2] == 'e'
467                     && path[3] == 'v'
468                     && path[4] == '/')
469                         path += 5;
470                 vdev->v_name = strdup(path);
471         } else {
472                 if (!strcmp(type, "raidz")) {
473                         if (vdev->v_nparity == 1)
474                                 vdev->v_name = "raidz1";
475                         else
476                                 vdev->v_name = "raidz2";
477                 } else {
478                         vdev->v_name = strdup(type);
479                 }
480         }
481         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
482                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
483         /*
484          * Its ok if we don't have any kids.
485          */
486         if (rc == 0) {
487                 vdev->v_nchildren = nkids;
488                 for (i = 0; i < nkids; i++) {
489                         rc = vdev_init_from_nvlist(kids, &kid);
490                         if (rc)
491                                 return (rc);
492                         STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
493                         kids = nvlist_next(kids);
494                 }
495         } else {
496                 vdev->v_nchildren = 0;
497         }
498
499         if (vdevp)
500                 *vdevp = vdev;
501         return (0);
502 }
503
504 static void
505 vdev_set_state(vdev_t *vdev)
506 {
507         vdev_t *kid;
508         int good_kids;
509         int bad_kids;
510
511         /*
512          * A mirror or raidz is healthy if all its kids are healthy. A
513          * mirror is degraded if any of its kids is healthy; a raidz
514          * is degraded if at most nparity kids are offline.
515          */
516         if (STAILQ_FIRST(&vdev->v_children)) {
517                 good_kids = 0;
518                 bad_kids = 0;
519                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
520                         if (kid->v_state == VDEV_STATE_HEALTHY)
521                                 good_kids++;
522                         else
523                                 bad_kids++;
524                 }
525                 if (bad_kids == 0) {
526                         vdev->v_state = VDEV_STATE_HEALTHY;
527                 } else {
528                         if (vdev->v_read == vdev_mirror_read) {
529                                 if (good_kids) {
530                                         vdev->v_state = VDEV_STATE_DEGRADED;
531                                 } else {
532                                         vdev->v_state = VDEV_STATE_OFFLINE;
533                                 }
534                         } else if (vdev->v_read == vdev_raidz_read) {
535                                 if (bad_kids > vdev->v_nparity) {
536                                         vdev->v_state = VDEV_STATE_OFFLINE;
537                                 } else {
538                                         vdev->v_state = VDEV_STATE_DEGRADED;
539                                 }
540                         }
541                 }
542         }
543 }
544
545 static spa_t *
546 spa_find_by_guid(uint64_t guid)
547 {
548         spa_t *spa;
549
550         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
551                 if (spa->spa_guid == guid)
552                         return (spa);
553
554         return (0);
555 }
556
557 #ifdef BOOT2
558
559 static spa_t *
560 spa_find_by_name(const char *name)
561 {
562         spa_t *spa;
563
564         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
565                 if (!strcmp(spa->spa_name, name))
566                         return (spa);
567
568         return (0);
569 }
570
571 #endif
572
573 static spa_t *
574 spa_create(uint64_t guid)
575 {
576         spa_t *spa;
577
578         spa = malloc(sizeof(spa_t));
579         memset(spa, 0, sizeof(spa_t));
580         STAILQ_INIT(&spa->spa_vdevs);
581         spa->spa_guid = guid;
582         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
583
584         return (spa);
585 }
586
587 static const char *
588 state_name(vdev_state_t state)
589 {
590         static const char* names[] = {
591                 "UNKNOWN",
592                 "CLOSED",
593                 "OFFLINE",
594                 "CANT_OPEN",
595                 "DEGRADED",
596                 "ONLINE"
597         };
598         return names[state];
599 }
600
#ifdef BOOT2

/* In BOOT2 builds pager_printf is simply printf. */
#define pager_printf printf

#else

/*
 * printf-style front end to pager_output(): format the message into a
 * local buffer and pass the result on.
 *
 * The buffer holds 80 bytes and vsprintf() does no bounds checking, so
 * a formatted message must never exceed 79 characters.
 */
static void
pager_printf(const char *fmt, ...)
{
	va_list ap;
	char text[80];

	va_start(ap, fmt);
	vsprintf(text, fmt, ap);
	va_end(ap);

	pager_output(text);
}

#endif
620
621 #define STATUS_FORMAT   "        %-16s %-10s\n"
622
623 static void
624 print_state(int indent, const char *name, vdev_state_t state)
625 {
626         int i;
627         char buf[512];
628
629         buf[0] = 0;
630         for (i = 0; i < indent; i++)
631                 strcat(buf, "  ");
632         strcat(buf, name);
633         pager_printf(STATUS_FORMAT, buf, state_name(state));
634         
635 }
636
637 static void
638 vdev_status(vdev_t *vdev, int indent)
639 {
640         vdev_t *kid;
641         print_state(indent, vdev->v_name, vdev->v_state);
642
643         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
644                 vdev_status(kid, indent + 1);
645         }
646 }
647
648 static void
649 spa_status(spa_t *spa)
650 {
651         vdev_t *vdev;
652         int good_kids, bad_kids, degraded_kids;
653         vdev_state_t state;
654
655         pager_printf("  pool: %s\n", spa->spa_name);
656         pager_printf("config:\n\n");
657         pager_printf(STATUS_FORMAT, "NAME", "STATE");
658
659         good_kids = 0;
660         degraded_kids = 0;
661         bad_kids = 0;
662         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
663                 if (vdev->v_state == VDEV_STATE_HEALTHY)
664                         good_kids++;
665                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
666                         degraded_kids++;
667                 else
668                         bad_kids++;
669         }
670
671         state = VDEV_STATE_CLOSED;
672         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
673                 state = VDEV_STATE_HEALTHY;
674         else if ((good_kids + degraded_kids) > 0)
675                 state = VDEV_STATE_DEGRADED;
676
677         print_state(0, spa->spa_name, state);
678         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
679                 vdev_status(vdev, 1);
680         }
681 }
682
683 static void
684 spa_all_status(void)
685 {
686         spa_t *spa;
687         int first = 1;
688
689         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
690                 if (!first)
691                         pager_printf("\n");
692                 first = 0;
693                 spa_status(spa);
694         }
695 }
696
697 static int
698 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
699 {
700         vdev_t vtmp;
701         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
702         spa_t *spa;
703         vdev_t *vdev, *top_vdev, *pool_vdev;
704         off_t off;
705         blkptr_t bp;
706         const unsigned char *nvlist;
707         uint64_t val;
708         uint64_t guid;
709         uint64_t pool_txg, pool_guid;
710         const char *pool_name;
711         const unsigned char *vdevs;
712         int i, rc;
713         char upbuf[1024];
714         const struct uberblock *up;
715
716         /*
717          * Load the vdev label and figure out which
718          * uberblock is most current.
719          */
720         memset(&vtmp, 0, sizeof(vtmp));
721         vtmp.v_phys_read = read;
722         vtmp.v_read_priv = read_priv;
723         off = offsetof(vdev_label_t, vl_vdev_phys);
724         BP_ZERO(&bp);
725         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
726         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
727         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
728         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
729         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
730         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
731                 return (EIO);
732
733         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
734                 return (EIO);
735         }
736
737         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
738
739         if (nvlist_find(nvlist,
740                         ZPOOL_CONFIG_VERSION,
741                         DATA_TYPE_UINT64, 0, &val)) {
742                 return (EIO);
743         }
744
745         if (val > SPA_VERSION) {
746                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
747                     (unsigned) val, (unsigned) SPA_VERSION);
748                 return (EIO);
749         }
750
751         if (nvlist_find(nvlist,
752                         ZPOOL_CONFIG_POOL_STATE,
753                         DATA_TYPE_UINT64, 0, &val)) {
754                 return (EIO);
755         }
756
757 #ifndef TEST
758         if (val != POOL_STATE_ACTIVE) {
759                 /*
760                  * Don't print a message here. If we happen to reboot
761                  * while where is an exported pool around, we don't
762                  * need a cascade of confusing messages during boot.
763                  */
764                 /*printf("ZFS: pool is not active\n");*/
765                 return (EIO);
766         }
767 #endif
768
769         if (nvlist_find(nvlist,
770                         ZPOOL_CONFIG_POOL_TXG,
771                         DATA_TYPE_UINT64, 0, &pool_txg)
772             || nvlist_find(nvlist,
773                            ZPOOL_CONFIG_POOL_GUID,
774                            DATA_TYPE_UINT64, 0, &pool_guid)
775             || nvlist_find(nvlist,
776                            ZPOOL_CONFIG_POOL_NAME,
777                            DATA_TYPE_STRING, 0, &pool_name)) {
778                 /*
779                  * Cache and spare devices end up here - just ignore
780                  * them.
781                  */
782                 /*printf("ZFS: can't find pool details\n");*/
783                 return (EIO);
784         }
785
786         /*
787          * Create the pool if this is the first time we've seen it.
788          */
789         spa = spa_find_by_guid(pool_guid);
790         if (!spa) {
791                 spa = spa_create(pool_guid);
792                 spa->spa_name = strdup(pool_name);
793         }
794         if (pool_txg > spa->spa_txg)
795                 spa->spa_txg = pool_txg;
796
797         /*
798          * Get the vdev tree and create our in-core copy of it.
799          * If we already have a healthy vdev with this guid, this must
800          * be some kind of alias (overlapping slices, dangerously dedicated
801          * disks etc).
802          */
803         if (nvlist_find(nvlist,
804                         ZPOOL_CONFIG_GUID,
805                         DATA_TYPE_UINT64, 0, &guid)) {
806                 return (EIO);
807         }
808         vdev = vdev_find(guid);
809         if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
810                 return (EIO);
811         }
812
813         if (nvlist_find(nvlist,
814                         ZPOOL_CONFIG_VDEV_TREE,
815                         DATA_TYPE_NVLIST, 0, &vdevs)) {
816                 return (EIO);
817         }
818         rc = vdev_init_from_nvlist(vdevs, &top_vdev);
819         if (rc)
820                 return (rc);
821
822         /*
823          * Add the toplevel vdev to the pool if its not already there.
824          */
825         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
826                 if (top_vdev == pool_vdev)
827                         break;
828         if (!pool_vdev && top_vdev)
829                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
830
831         /*
832          * We should already have created an incomplete vdev for this
833          * vdev. Find it and initialise it with our read proc.
834          */
835         vdev = vdev_find(guid);
836         if (vdev) {
837                 vdev->v_phys_read = read;
838                 vdev->v_read_priv = read_priv;
839                 vdev->v_state = VDEV_STATE_HEALTHY;
840         } else {
841                 printf("ZFS: inconsistent nvlist contents\n");
842                 return (EIO);
843         }
844
845         /*
846          * Re-evaluate top-level vdev state.
847          */
848         vdev_set_state(top_vdev);
849
850         /*
851          * Ok, we are happy with the pool so far. Lets find
852          * the best uberblock and then we can actually access
853          * the contents of the pool.
854          */
855         for (i = 0;
856              i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
857              i++) {
858                 off = offsetof(vdev_label_t, vl_uberblock);
859                 off += i << UBERBLOCK_SHIFT;
860                 BP_ZERO(&bp);
861                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
862                 BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
863                 BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
864                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
865                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
866                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
867                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
868                         continue;
869
870                 up = (const struct uberblock *) upbuf;
871                 if (up->ub_magic != UBERBLOCK_MAGIC)
872                         continue;
873                 if (up->ub_txg < spa->spa_txg)
874                         continue;
875                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
876                         spa->spa_uberblock = *up;
877                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
878                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
879                                 spa->spa_uberblock = *up;
880                 }
881         }
882
883         if (spap)
884                 *spap = spa;
885         return (0);
886 }
887
/*
 * Return the base-2 logarithm of n when n has exactly one bit set,
 * or -1 otherwise (this includes zero).
 */
static int
ilog2(int n)
{
	int v;

	/*
	 * Compare as unsigned: 1 << 31 overflows a signed int, which is
	 * undefined behaviour, whereas 1u << 31 is well defined.
	 */
	for (v = 0; v < 32; v++)
		if ((unsigned int)n == (1u << v))
			return v;
	return -1;
}
898
899 static int
900 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
901 {
902         int cpfunc = BP_GET_COMPRESS(bp);
903         size_t lsize = BP_GET_LSIZE(bp);
904         size_t psize = BP_GET_PSIZE(bp);
905         void *pbuf;
906         int i;
907
908         zfs_reset_temp();
909         if (cpfunc != ZIO_COMPRESS_OFF)
910                 pbuf = zfs_alloc_temp(psize);
911         else
912                 pbuf = buf;
913
914         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
915                 const dva_t *dva = &bp->blk_dva[i];
916                 vdev_t *vdev;
917                 int vdevid;
918                 off_t offset;
919
920                 if (!dva->dva_word[0] && !dva->dva_word[1])
921                         continue;
922
923                 vdevid = DVA_GET_VDEV(dva);
924                 offset = DVA_GET_OFFSET(dva);
925                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
926                         if (vdev->v_id == vdevid)
927                                 break;
928                 if (!vdev || !vdev->v_read)
929                         continue;
930                 if (vdev->v_read(vdev, bp, pbuf, offset, psize))
931                         continue;
932
933                 if (cpfunc != ZIO_COMPRESS_OFF) {
934                         if (zio_decompress_data(cpfunc, pbuf, psize,
935                                 buf, lsize))
936                                 return (EIO);
937                 }
938
939                 return (0);
940         }
941         printf("ZFS: i/o error - all block copies unavailable\n");
942
943         return (EIO);
944 }
945
946 static int
947 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
948 {
949         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
950         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
951         int nlevels = dnode->dn_nlevels;
952         int i, rc;
953
954         /*
955          * Note: bsize may not be a power of two here so we need to do an
956          * actual divide rather than a bitshift.
957          */
958         while (buflen > 0) {
959                 uint64_t bn = offset / bsize;
960                 int boff = offset % bsize;
961                 int ibn;
962                 const blkptr_t *indbp;
963                 blkptr_t bp;
964
965                 if (bn > dnode->dn_maxblkid)
966                         return (EIO);
967
968                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
969                         goto cached;
970
971                 indbp = dnode->dn_blkptr;
972                 for (i = 0; i < nlevels; i++) {
973                         /*
974                          * Copy the bp from the indirect array so that
975                          * we can re-use the scratch buffer for multi-level
976                          * objects.
977                          */
978                         ibn = bn >> ((nlevels - i - 1) * ibshift);
979                         ibn &= ((1 << ibshift) - 1);
980                         bp = indbp[ibn];
981                         rc = zio_read(spa, &bp, dnode_cache_buf);
982                         if (rc)
983                                 return (rc);
984                         indbp = (const blkptr_t *) dnode_cache_buf;
985                 }
986                 dnode_cache_obj = dnode;
987                 dnode_cache_bn = bn;
988         cached:
989
990                 /*
991                  * The buffer contains our data block. Copy what we
992                  * need from it and loop.
993                  */ 
994                 i = bsize - boff;
995                 if (i > buflen) i = buflen;
996                 memcpy(buf, &dnode_cache_buf[boff], i);
997                 buf = ((char*) buf) + i;
998                 offset += i;
999                 buflen -= i;
1000         }
1001
1002         return (0);
1003 }
1004
1005 /*
1006  * Lookup a value in a microzap directory. Assumes that the zap
1007  * scratch buffer contains the directory contents.
1008  */
1009 static int
1010 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1011 {
1012         const mzap_phys_t *mz;
1013         const mzap_ent_phys_t *mze;
1014         size_t size;
1015         int chunks, i;
1016
1017         /*
1018          * Microzap objects use exactly one block. Read the whole
1019          * thing.
1020          */
1021         size = dnode->dn_datablkszsec * 512;
1022
1023         mz = (const mzap_phys_t *) zap_scratch;
1024         chunks = size / MZAP_ENT_LEN - 1;
1025
1026         for (i = 0; i < chunks; i++) {
1027                 mze = &mz->mz_chunk[i];
1028                 if (!strcmp(mze->mze_name, name)) {
1029                         *value = mze->mze_value;
1030                         return (0);
1031                 }
1032         }
1033
1034         return (ENOENT);
1035 }
1036
1037 /*
1038  * Compare a name with a zap leaf entry. Return non-zero if the name
1039  * matches.
1040  */
1041 static int
1042 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1043 {
1044         size_t namelen;
1045         const zap_leaf_chunk_t *nc;
1046         const char *p;
1047
1048         namelen = zc->l_entry.le_name_length;
1049                         
1050         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1051         p = name;
1052         while (namelen > 0) {
1053                 size_t len;
1054                 len = namelen;
1055                 if (len > ZAP_LEAF_ARRAY_BYTES)
1056                         len = ZAP_LEAF_ARRAY_BYTES;
1057                 if (memcmp(p, nc->l_array.la_array, len))
1058                         return (0);
1059                 p += len;
1060                 namelen -= len;
1061                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1062         }
1063
1064         return 1;
1065 }
1066
1067 /*
1068  * Extract a uint64_t value from a zap leaf entry.
1069  */
1070 static uint64_t
1071 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1072 {
1073         const zap_leaf_chunk_t *vc;
1074         int i;
1075         uint64_t value;
1076         const uint8_t *p;
1077
1078         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1079         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1080                 value = (value << 8) | p[i];
1081         }
1082
1083         return value;
1084 }
1085
1086 /*
1087  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1088  * buffer contains the directory header.
1089  */
1090 static int
1091 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1092 {
1093         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1094         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1095         fat_zap_t z;
1096         uint64_t *ptrtbl;
1097         uint64_t hash;
1098         int rc;
1099
1100         if (zh.zap_magic != ZAP_MAGIC)
1101                 return (EIO);
1102
1103         z.zap_block_shift = ilog2(bsize);
1104         z.zap_phys = (zap_phys_t *) zap_scratch;
1105
1106         /*
1107          * Figure out where the pointer table is and read it in if necessary.
1108          */
1109         if (zh.zap_ptrtbl.zt_blk) {
1110                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1111                                zap_scratch, bsize);
1112                 if (rc)
1113                         return (rc);
1114                 ptrtbl = (uint64_t *) zap_scratch;
1115         } else {
1116                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1117         }
1118
1119         hash = zap_hash(zh.zap_salt, name);
1120
1121         zap_leaf_t zl;
1122         zl.l_bs = z.zap_block_shift;
1123
1124         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1125         zap_leaf_chunk_t *zc;
1126
1127         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1128         if (rc)
1129                 return (rc);
1130
1131         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1132
1133         /*
1134          * Make sure this chunk matches our hash.
1135          */
1136         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1137             && zl.l_phys->l_hdr.lh_prefix
1138             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1139                 return (ENOENT);
1140
1141         /*
1142          * Hash within the chunk to find our entry.
1143          */
1144         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1145         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1146         h = zl.l_phys->l_hash[h];
1147         if (h == 0xffff)
1148                 return (ENOENT);
1149         zc = &ZAP_LEAF_CHUNK(&zl, h);
1150         while (zc->l_entry.le_hash != hash) {
1151                 if (zc->l_entry.le_next == 0xffff) {
1152                         zc = 0;
1153                         break;
1154                 }
1155                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1156         }
1157         if (fzap_name_equal(&zl, zc, name)) {
1158                 *value = fzap_leaf_value(&zl, zc);
1159                 return (0);
1160         }
1161
1162         return (ENOENT);
1163 }
1164
1165 /*
1166  * Lookup a name in a zap object and return its value as a uint64_t.
1167  */
1168 static int
1169 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1170 {
1171         int rc;
1172         uint64_t zap_type;
1173         size_t size = dnode->dn_datablkszsec * 512;
1174
1175         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1176         if (rc)
1177                 return (rc);
1178
1179         zap_type = *(uint64_t *) zap_scratch;
1180         if (zap_type == ZBT_MICRO)
1181                 return mzap_lookup(spa, dnode, name, value);
1182         else
1183                 return fzap_lookup(spa, dnode, name, value);
1184 }
1185
1186 #ifdef BOOT2
1187
1188 /*
1189  * List a microzap directory. Assumes that the zap scratch buffer contains
1190  * the directory contents.
1191  */
1192 static int
1193 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
1194 {
1195         const mzap_phys_t *mz;
1196         const mzap_ent_phys_t *mze;
1197         size_t size;
1198         int chunks, i;
1199
1200         /*
1201          * Microzap objects use exactly one block. Read the whole
1202          * thing.
1203          */
1204         size = dnode->dn_datablkszsec * 512;
1205         mz = (const mzap_phys_t *) zap_scratch;
1206         chunks = size / MZAP_ENT_LEN - 1;
1207
1208         for (i = 0; i < chunks; i++) {
1209                 mze = &mz->mz_chunk[i];
1210                 if (mze->mze_name[0])
1211                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
1212                         printf("%s\n", mze->mze_name);
1213         }
1214
1215         return (0);
1216 }
1217
1218 /*
1219  * List a fatzap directory. Assumes that the zap scratch buffer contains
1220  * the directory header.
1221  */
1222 static int
1223 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
1224 {
1225         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1226         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1227         fat_zap_t z;
1228         int i, j;
1229
1230         if (zh.zap_magic != ZAP_MAGIC)
1231                 return (EIO);
1232
1233         z.zap_block_shift = ilog2(bsize);
1234         z.zap_phys = (zap_phys_t *) zap_scratch;
1235
1236         /*
1237          * This assumes that the leaf blocks start at block 1. The
1238          * documentation isn't exactly clear on this.
1239          */
1240         zap_leaf_t zl;
1241         zl.l_bs = z.zap_block_shift;
1242         for (i = 0; i < zh.zap_num_leafs; i++) {
1243                 off_t off = (i + 1) << zl.l_bs;
1244                 char name[256], *p;
1245                 uint64_t value;
1246
1247                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1248                         return (EIO);
1249
1250                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1251
1252                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1253                         zap_leaf_chunk_t *zc, *nc;
1254                         int namelen;
1255
1256                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1257                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1258                                 continue;
1259                         namelen = zc->l_entry.le_name_length;
1260                         if (namelen > sizeof(name))
1261                                 namelen = sizeof(name);
1262                         
1263                         /*
1264                          * Paste the name back together.
1265                          */
1266                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1267                         p = name;
1268                         while (namelen > 0) {
1269                                 int len;
1270                                 len = namelen;
1271                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1272                                         len = ZAP_LEAF_ARRAY_BYTES;
1273                                 memcpy(p, nc->l_array.la_array, len);
1274                                 p += len;
1275                                 namelen -= len;
1276                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1277                         }
1278
1279                         /*
1280                          * Assume the first eight bytes of the value are
1281                          * a uint64_t.
1282                          */
1283                         value = fzap_leaf_value(&zl, zc);
1284
1285                         printf("%-32s 0x%llx\n", name, value);
1286                 }
1287         }
1288
1289         return (0);
1290 }
1291
1292 /*
1293  * List a zap directory.
1294  */
1295 static int
1296 zap_list(spa_t *spa, const dnode_phys_t *dnode)
1297 {
1298         uint64_t zap_type;
1299         size_t size = dnode->dn_datablkszsec * 512;
1300
1301         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1302                 return (EIO);
1303
1304         zap_type = *(uint64_t *) zap_scratch;
1305         if (zap_type == ZBT_MICRO)
1306                 return mzap_list(spa, dnode);
1307         else
1308                 return fzap_list(spa, dnode);
1309 }
1310
1311 #endif
1312
1313 static int
1314 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1315 {
1316         off_t offset;
1317
1318         offset = objnum * sizeof(dnode_phys_t);
1319         return dnode_read(spa, &os->os_meta_dnode, offset,
1320                 dnode, sizeof(dnode_phys_t));
1321 }
1322
1323 /*
1324  * Find the object set given the object number of its dataset object
1325  * and return its details in *objset
1326  */
1327 static int
1328 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1329 {
1330         dnode_phys_t dataset;
1331         dsl_dataset_phys_t *ds;
1332
1333         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1334                 printf("ZFS: can't find dataset %lld\n", objnum);
1335                 return (EIO);
1336         }
1337
1338         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1339         if (zio_read(spa, &ds->ds_bp, objset)) {
1340                 printf("ZFS: can't read object set for dataset %lld\n", objnum);
1341                 return (EIO);
1342         }
1343
1344         return (0);
1345 }
1346
1347 /*
1348  * Find the object set pointed to by the BOOTFS property or the root
1349  * dataset if there is none and return its details in *objset
1350  */
1351 static int
1352 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
1353 {
1354         dnode_phys_t dir, propdir;
1355         uint64_t props, bootfs, root;
1356
1357         /*
1358          * Start with the MOS directory object.
1359          */
1360         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1361                 printf("ZFS: can't read MOS object directory\n");
1362                 return (EIO);
1363         }
1364
1365         /*
1366          * Lookup the pool_props and see if we can find a bootfs.
1367          */
1368         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1369              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1370              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0)
1371                 return zfs_mount_dataset(spa, bootfs, objset);
1372
1373         /*
1374          * Lookup the root dataset directory
1375          */
1376         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1377             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1378                 printf("ZFS: can't find root dsl_dir\n");
1379                 return (EIO);
1380         }
1381
1382         /*
1383          * Use the information from the dataset directory's bonus buffer
1384          * to find the dataset object and from that the object set itself.
1385          */
1386         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1387         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
1388 }
1389
1390 static int
1391 zfs_mount_pool(spa_t *spa)
1392 {
1393         /*
1394          * Find the MOS and work our way in from there.
1395          */
1396         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1397                 printf("ZFS: can't read MOS\n");
1398                 return (EIO);
1399         }
1400
1401         /*
1402          * Find the root object set
1403          */
1404         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
1405                 printf("Can't find root filesystem - giving up\n");
1406                 return (EIO);
1407         }
1408
1409         return (0);
1410 }
1411
1412 /*
1413  * Lookup a file and return its dnode.
1414  */
1415 static int
1416 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
1417 {
1418         int rc;
1419         uint64_t objnum, rootnum, parentnum;
1420         dnode_phys_t dn;
1421         const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
1422         const char *p, *q;
1423         char element[256];
1424         char path[1024];
1425         int symlinks_followed = 0;
1426
1427         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
1428                 printf("ZFS: unexpected object set type %lld\n",
1429                        spa->spa_root_objset.os_type);
1430                 return (EIO);
1431         }
1432
1433         /*
1434          * Get the root directory dnode.
1435          */
1436         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
1437         if (rc)
1438                 return (rc);
1439
1440         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
1441         if (rc)
1442                 return (rc);
1443
1444         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
1445         if (rc)
1446                 return (rc);
1447
1448         objnum = rootnum;
1449         p = upath;
1450         while (p && *p) {
1451                 while (*p == '/')
1452                         p++;
1453                 if (!*p)
1454                         break;
1455                 q = strchr(p, '/');
1456                 if (q) {
1457                         memcpy(element, p, q - p);
1458                         element[q - p] = 0;
1459                         p = q;
1460                 } else {
1461                         strcpy(element, p);
1462                         p = 0;
1463                 }
1464
1465                 if ((zp->zp_mode >> 12) != 0x4) {
1466                         return (ENOTDIR);
1467                 }
1468
1469                 parentnum = objnum;
1470                 rc = zap_lookup(spa, &dn, element, &objnum);
1471                 if (rc)
1472                         return (rc);
1473                 objnum = ZFS_DIRENT_OBJ(objnum);
1474
1475                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1476                 if (rc)
1477                         return (rc);
1478
1479                 /*
1480                  * Check for symlink.
1481                  */
1482                 if ((zp->zp_mode >> 12) == 0xa) {
1483                         if (symlinks_followed > 10)
1484                                 return (EMLINK);
1485                         symlinks_followed++;
1486
1487                         /*
1488                          * Read the link value and copy the tail of our
1489                          * current path onto the end.
1490                          */
1491                         if (p)
1492                                 strcpy(&path[zp->zp_size], p);
1493                         else
1494                                 path[zp->zp_size] = 0;
1495                         if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
1496                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
1497                                         zp->zp_size);
1498                         } else {
1499                                 rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
1500                                 if (rc)
1501                                         return (rc);
1502                         }
1503
1504                         /*
1505                          * Restart with the new path, starting either at
1506                          * the root or at the parent depending whether or
1507                          * not the link is relative.
1508                          */
1509                         p = path;
1510                         if (*p == '/')
1511                                 objnum = rootnum;
1512                         else
1513                                 objnum = parentnum;
1514                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1515                 }
1516         }
1517
1518         *dnode = dn;
1519         return (0);
1520 }