]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - stand/zfs/zfsimpl.c
Re-sync loader.mk and ficl.mk to where they should be
[FreeBSD/FreeBSD.git] / stand / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/stat.h>
35 #include <sys/stdint.h>
36
37 #include "zfsimpl.h"
38 #include "zfssubr.c"
39
40
41 struct zfsmount {
42         const spa_t     *spa;
43         objset_phys_t   objset;
44         uint64_t        rootobj;
45 };
46
47 /*
48  * List of all vdevs, chained through v_alllink.
49  */
50 static vdev_list_t zfs_vdevs;
51
/*
 * Names of the ZFS features this reader supports for read.
 * The table is terminated by a NULL entry.
 */
static const char *features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.zfsonlinux:large_dnode",
	NULL
};
66
67 /*
68  * List of all pools, chained through spa_link.
69  */
70 static spa_list_t zfs_pools;
71
72 static const dnode_phys_t *dnode_cache_obj;
73 static uint64_t dnode_cache_bn;
74 static char *dnode_cache_buf;
75 static char *zap_scratch;
76 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
77
78 #define TEMP_SIZE       (1024 * 1024)
79
80 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
81 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
82 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
83 static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
84     const char *name, uint64_t integer_size, uint64_t num_integers,
85     void *value);
86
87 static void
88 zfs_init(void)
89 {
90         STAILQ_INIT(&zfs_vdevs);
91         STAILQ_INIT(&zfs_pools);
92
93         zfs_temp_buf = malloc(TEMP_SIZE);
94         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
95         zfs_temp_ptr = zfs_temp_buf;
96         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
97         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
98
99         zfs_init_crc();
100 }
101
102 static void *
103 zfs_alloc(size_t size)
104 {
105         char *ptr;
106
107         if (zfs_temp_ptr + size > zfs_temp_end) {
108                 printf("ZFS: out of temporary buffer space\n");
109                 for (;;) ;
110         }
111         ptr = zfs_temp_ptr;
112         zfs_temp_ptr += size;
113
114         return (ptr);
115 }
116
117 static void
118 zfs_free(void *ptr, size_t size)
119 {
120
121         zfs_temp_ptr -= size;
122         if (zfs_temp_ptr != ptr) {
123                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
124                 for (;;) ;
125         }
126 }
127
/*
 * Decode one big-endian 32-bit integer at *xdr into *ip and advance
 * *xdr by four bytes.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *b = *xdr;
	uint32_t v;

	/*
	 * Assemble the value as unsigned: shifting a promoted int left
	 * into the sign bit is undefined behaviour.
	 */
	v = ((uint32_t)b[0] << 24)
		| ((uint32_t)b[1] << 16)
		| ((uint32_t)b[2] << 8)
		| ((uint32_t)b[3] << 0);
	*ip = (int)v;
	(*xdr) += 4;
	return (0);
}
138
139 static int
140 xdr_u_int(const unsigned char **xdr, u_int *ip)
141 {
142         *ip = ((*xdr)[0] << 24)
143                 | ((*xdr)[1] << 16)
144                 | ((*xdr)[2] << 8)
145                 | ((*xdr)[3] << 0);
146         (*xdr) += 4;
147         return (0);
148 }
149
150 static int
151 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
152 {
153         u_int hi, lo;
154
155         xdr_u_int(xdr, &hi);
156         xdr_u_int(xdr, &lo);
157         *lp = (((uint64_t) hi) << 32) | lo;
158         return (0);
159 }
160
161 static int
162 nvlist_find(const unsigned char *nvlist, const char *name, int type,
163             int* elementsp, void *valuep)
164 {
165         const unsigned char *p, *pair;
166         int junk;
167         int encoded_size, decoded_size;
168
169         p = nvlist;
170         xdr_int(&p, &junk);
171         xdr_int(&p, &junk);
172
173         pair = p;
174         xdr_int(&p, &encoded_size);
175         xdr_int(&p, &decoded_size);
176         while (encoded_size && decoded_size) {
177                 int namelen, pairtype, elements;
178                 const char *pairname;
179
180                 xdr_int(&p, &namelen);
181                 pairname = (const char*) p;
182                 p += roundup(namelen, 4);
183                 xdr_int(&p, &pairtype);
184
185                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
186                         xdr_int(&p, &elements);
187                         if (elementsp)
188                                 *elementsp = elements;
189                         if (type == DATA_TYPE_UINT64) {
190                                 xdr_uint64_t(&p, (uint64_t *) valuep);
191                                 return (0);
192                         } else if (type == DATA_TYPE_STRING) {
193                                 int len;
194                                 xdr_int(&p, &len);
195                                 (*(const char**) valuep) = (const char*) p;
196                                 return (0);
197                         } else if (type == DATA_TYPE_NVLIST
198                                    || type == DATA_TYPE_NVLIST_ARRAY) {
199                                 (*(const unsigned char**) valuep) =
200                                          (const unsigned char*) p;
201                                 return (0);
202                         } else {
203                                 return (EIO);
204                         }
205                 } else {
206                         /*
207                          * Not the pair we are looking for, skip to the next one.
208                          */
209                         p = pair + encoded_size;
210                 }
211
212                 pair = p;
213                 xdr_int(&p, &encoded_size);
214                 xdr_int(&p, &decoded_size);
215         }
216
217         return (EIO);
218 }
219
220 static int
221 nvlist_check_features_for_read(const unsigned char *nvlist)
222 {
223         const unsigned char *p, *pair;
224         int junk;
225         int encoded_size, decoded_size;
226         int rc;
227
228         rc = 0;
229
230         p = nvlist;
231         xdr_int(&p, &junk);
232         xdr_int(&p, &junk);
233
234         pair = p;
235         xdr_int(&p, &encoded_size);
236         xdr_int(&p, &decoded_size);
237         while (encoded_size && decoded_size) {
238                 int namelen, pairtype;
239                 const char *pairname;
240                 int i, found;
241
242                 found = 0;
243
244                 xdr_int(&p, &namelen);
245                 pairname = (const char*) p;
246                 p += roundup(namelen, 4);
247                 xdr_int(&p, &pairtype);
248
249                 for (i = 0; features_for_read[i] != NULL; i++) {
250                         if (!memcmp(pairname, features_for_read[i], namelen)) {
251                                 found = 1;
252                                 break;
253                         }
254                 }
255
256                 if (!found) {
257                         printf("ZFS: unsupported feature: %s\n", pairname);
258                         rc = EIO;
259                 }
260
261                 p = pair + encoded_size;
262
263                 pair = p;
264                 xdr_int(&p, &encoded_size);
265                 xdr_int(&p, &decoded_size);
266         }
267
268         return (rc);
269 }
270
/*
 * Step over one encoded nvlist and return the position reached after
 * its terminating pair header has been read; nvlist arrays are walked
 * by calling this repeatedly.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int header_word;
	int enc_len, dec_len;

	/* Two header words come before the first pair. */
	cursor = nvlist;
	xdr_int(&cursor, &header_word);
	xdr_int(&cursor, &header_word);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		/* A zero size terminates the list. */
		if (enc_len == 0 || dec_len == 0)
			break;
		cursor = pair_start + enc_len;
	}

	return (cursor);
}
298
#ifdef TEST

/*
 * Debugging aid: print every pair of an XDR-encoded nvlist, one per
 * line, indented by `indent' spaces.  Values are shown for uint64 and
 * string pairs; nested nvlists and nvlist arrays are printed
 * recursively one level deeper.
 *
 * Returns the position reached after the list's terminating pair
 * header has been read.
 */
static const unsigned char *
nvlist_print(const unsigned char *nvlist, unsigned int indent)
{
	static const char* typenames[] = {
		"DATA_TYPE_UNKNOWN",
		"DATA_TYPE_BOOLEAN",
		"DATA_TYPE_BYTE",
		"DATA_TYPE_INT16",
		"DATA_TYPE_UINT16",
		"DATA_TYPE_INT32",
		"DATA_TYPE_UINT32",
		"DATA_TYPE_INT64",
		"DATA_TYPE_UINT64",
		"DATA_TYPE_STRING",
		"DATA_TYPE_BYTE_ARRAY",
		"DATA_TYPE_INT16_ARRAY",
		"DATA_TYPE_UINT16_ARRAY",
		"DATA_TYPE_INT32_ARRAY",
		"DATA_TYPE_UINT32_ARRAY",
		"DATA_TYPE_INT64_ARRAY",
		"DATA_TYPE_UINT64_ARRAY",
		"DATA_TYPE_STRING_ARRAY",
		"DATA_TYPE_HRTIME",
		"DATA_TYPE_NVLIST",
		"DATA_TYPE_NVLIST_ARRAY",
		"DATA_TYPE_BOOLEAN_VALUE",
		"DATA_TYPE_INT8",
		"DATA_TYPE_UINT8",
		"DATA_TYPE_BOOLEAN_ARRAY",
		"DATA_TYPE_INT8_ARRAY",
		"DATA_TYPE_UINT8_ARRAY"
	};
	const unsigned int ntypenames =
	    sizeof(typenames) / sizeof(typenames[0]);

	unsigned int i;
	int j;
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	/* Skip the two header words in front of the first pair. */
	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype, elements;
		const char *pairname;
		const char *type_label;

		xdr_int(&p, &namelen);
		pairname = (const char*) p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		/*
		 * The type comes straight from the encoded data, so do
		 * not index the table with it unchecked.
		 */
		if (pairtype >= 0 && (unsigned int)pairtype < ntypenames)
			type_label = typenames[pairtype];
		else
			type_label = typenames[0];

		for (i = 0; i < indent; i++)
			printf(" ");
		printf("%s %s", type_label, pairname);

		xdr_int(&p, &elements);
		switch (pairtype) {
		case DATA_TYPE_UINT64: {
			uint64_t val;
			xdr_uint64_t(&p, &val);
			printf(" = 0x%jx\n", (uintmax_t)val);
			break;
		}

		case DATA_TYPE_STRING: {
			int len;
			xdr_int(&p, &len);
			printf(" = \"%s\"\n", p);
			break;
		}

		case DATA_TYPE_NVLIST:
			printf("\n");
			nvlist_print(p, indent + 1);
			break;

		case DATA_TYPE_NVLIST_ARRAY:
			/*
			 * j is signed like `elements' so that a negative
			 * count yields no iterations instead of a huge
			 * unsigned bound.
			 */
			for (j = 0; j < elements; j++) {
				printf("[%d]\n", j);
				p = nvlist_print(p, indent + 1);
				if (j != elements - 1) {
					for (i = 0; i < indent; i++)
						printf(" ");
					printf("%s %s", type_label, pairname);
				}
			}
			break;

		default:
			printf("\n");
		}

		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return p;
}

#endif
407
408 static int
409 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
410     off_t offset, size_t size)
411 {
412         size_t psize;
413         int rc;
414
415         if (!vdev->v_phys_read)
416                 return (EIO);
417
418         if (bp) {
419                 psize = BP_GET_PSIZE(bp);
420         } else {
421                 psize = size;
422         }
423
424         /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
425         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
426         if (rc)
427                 return (rc);
428         if (bp && zio_checksum_verify(vdev->spa, bp, buf))
429                 return (EIO);
430
431         return (0);
432 }
433
434 static int
435 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
436     off_t offset, size_t bytes)
437 {
438
439         return (vdev_read_phys(vdev, bp, buf,
440                 offset + VDEV_LABEL_START_SIZE, bytes));
441 }
442
443
444 static int
445 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
446     off_t offset, size_t bytes)
447 {
448         vdev_t *kid;
449         int rc;
450
451         rc = EIO;
452         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
453                 if (kid->v_state != VDEV_STATE_HEALTHY)
454                         continue;
455                 rc = kid->v_read(kid, bp, buf, offset, bytes);
456                 if (!rc)
457                         return (0);
458         }
459
460         return (rc);
461 }
462
463 static int
464 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
465     off_t offset, size_t bytes)
466 {
467         vdev_t *kid;
468
469         /*
470          * Here we should have two kids:
471          * First one which is the one we are replacing and we can trust
472          * only this one to have valid data, but it might not be present.
473          * Second one is that one we are replacing with. It is most likely
474          * healthy, but we can't trust it has needed data, so we won't use it.
475          */
476         kid = STAILQ_FIRST(&vdev->v_children);
477         if (kid == NULL)
478                 return (EIO);
479         if (kid->v_state != VDEV_STATE_HEALTHY)
480                 return (EIO);
481         return (kid->v_read(kid, bp, buf, offset, bytes));
482 }
483
484 static vdev_t *
485 vdev_find(uint64_t guid)
486 {
487         vdev_t *vdev;
488
489         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
490                 if (vdev->v_guid == guid)
491                         return (vdev);
492
493         return (0);
494 }
495
496 static vdev_t *
497 vdev_create(uint64_t guid, vdev_read_t *_read)
498 {
499         vdev_t *vdev;
500
501         vdev = malloc(sizeof(vdev_t));
502         memset(vdev, 0, sizeof(vdev_t));
503         STAILQ_INIT(&vdev->v_children);
504         vdev->v_guid = guid;
505         vdev->v_state = VDEV_STATE_OFFLINE;
506         vdev->v_read = _read;
507         vdev->v_phys_read = 0;
508         vdev->v_read_priv = 0;
509         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
510
511         return (vdev);
512 }
513
514 static int
515 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
516     vdev_t **vdevp, int is_newer)
517 {
518         int rc;
519         uint64_t guid, id, ashift, nparity;
520         const char *type;
521         const char *path;
522         vdev_t *vdev, *kid;
523         const unsigned char *kids;
524         int nkids, i, is_new;
525         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
526
527         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
528             NULL, &guid)
529             || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
530             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
531             NULL, &type)) {
532                 printf("ZFS: can't find vdev details\n");
533                 return (ENOENT);
534         }
535
536         if (strcmp(type, VDEV_TYPE_MIRROR)
537             && strcmp(type, VDEV_TYPE_DISK)
538 #ifdef ZFS_TEST
539             && strcmp(type, VDEV_TYPE_FILE)
540 #endif
541             && strcmp(type, VDEV_TYPE_RAIDZ)
542             && strcmp(type, VDEV_TYPE_REPLACING)) {
543                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
544                 return (EIO);
545         }
546
547         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
548
549         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
550                         &is_offline);
551         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
552                         &is_removed);
553         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
554                         &is_faulted);
555         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
556                         &is_degraded);
557         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
558                         &isnt_present);
559
560         vdev = vdev_find(guid);
561         if (!vdev) {
562                 is_new = 1;
563
564                 if (!strcmp(type, VDEV_TYPE_MIRROR))
565                         vdev = vdev_create(guid, vdev_mirror_read);
566                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
567                         vdev = vdev_create(guid, vdev_raidz_read);
568                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
569                         vdev = vdev_create(guid, vdev_replacing_read);
570                 else
571                         vdev = vdev_create(guid, vdev_disk_read);
572
573                 vdev->v_id = id;
574                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
575                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
576                         DATA_TYPE_UINT64, NULL, &ashift) == 0) {
577                         vdev->v_ashift = ashift;
578                 } else {
579                         vdev->v_ashift = 0;
580                 }
581                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
582                         DATA_TYPE_UINT64, NULL, &nparity) == 0) {
583                         vdev->v_nparity = nparity;
584                 } else {
585                         vdev->v_nparity = 0;
586                 }
587                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
588                                 DATA_TYPE_STRING, NULL, &path) == 0) {
589                         if (strncmp(path, "/dev/", 5) == 0)
590                                 path += 5;
591                         vdev->v_name = strdup(path);
592                 } else {
593                         if (!strcmp(type, "raidz")) {
594                                 if (vdev->v_nparity == 1)
595                                         vdev->v_name = "raidz1";
596                                 else if (vdev->v_nparity == 2)
597                                         vdev->v_name = "raidz2";
598                                 else if (vdev->v_nparity == 3)
599                                         vdev->v_name = "raidz3";
600                                 else {
601                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
602                                         return (EIO);
603                                 }
604                         } else {
605                                 vdev->v_name = strdup(type);
606                         }
607                 }
608         } else {
609                 is_new = 0;
610         }
611
612         if (is_new || is_newer) {
613                 /*
614                  * This is either new vdev or we've already seen this vdev,
615                  * but from an older vdev label, so let's refresh its state
616                  * from the newer label.
617                  */
618                 if (is_offline)
619                         vdev->v_state = VDEV_STATE_OFFLINE;
620                 else if (is_removed)
621                         vdev->v_state = VDEV_STATE_REMOVED;
622                 else if (is_faulted)
623                         vdev->v_state = VDEV_STATE_FAULTED;
624                 else if (is_degraded)
625                         vdev->v_state = VDEV_STATE_DEGRADED;
626                 else if (isnt_present)
627                         vdev->v_state = VDEV_STATE_CANT_OPEN;
628         }
629
630         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
631             &nkids, &kids);
632         /*
633          * Its ok if we don't have any kids.
634          */
635         if (rc == 0) {
636                 vdev->v_nchildren = nkids;
637                 for (i = 0; i < nkids; i++) {
638                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
639                         if (rc)
640                                 return (rc);
641                         if (is_new)
642                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
643                                                    v_childlink);
644                         kids = nvlist_next(kids);
645                 }
646         } else {
647                 vdev->v_nchildren = 0;
648         }
649
650         if (vdevp)
651                 *vdevp = vdev;
652         return (0);
653 }
654
655 static void
656 vdev_set_state(vdev_t *vdev)
657 {
658         vdev_t *kid;
659         int good_kids;
660         int bad_kids;
661
662         /*
663          * A mirror or raidz is healthy if all its kids are healthy. A
664          * mirror is degraded if any of its kids is healthy; a raidz
665          * is degraded if at most nparity kids are offline.
666          */
667         if (STAILQ_FIRST(&vdev->v_children)) {
668                 good_kids = 0;
669                 bad_kids = 0;
670                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
671                         if (kid->v_state == VDEV_STATE_HEALTHY)
672                                 good_kids++;
673                         else
674                                 bad_kids++;
675                 }
676                 if (bad_kids == 0) {
677                         vdev->v_state = VDEV_STATE_HEALTHY;
678                 } else {
679                         if (vdev->v_read == vdev_mirror_read) {
680                                 if (good_kids) {
681                                         vdev->v_state = VDEV_STATE_DEGRADED;
682                                 } else {
683                                         vdev->v_state = VDEV_STATE_OFFLINE;
684                                 }
685                         } else if (vdev->v_read == vdev_raidz_read) {
686                                 if (bad_kids > vdev->v_nparity) {
687                                         vdev->v_state = VDEV_STATE_OFFLINE;
688                                 } else {
689                                         vdev->v_state = VDEV_STATE_DEGRADED;
690                                 }
691                         }
692                 }
693         }
694 }
695
696 static spa_t *
697 spa_find_by_guid(uint64_t guid)
698 {
699         spa_t *spa;
700
701         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
702                 if (spa->spa_guid == guid)
703                         return (spa);
704
705         return (0);
706 }
707
708 static spa_t *
709 spa_find_by_name(const char *name)
710 {
711         spa_t *spa;
712
713         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
714                 if (!strcmp(spa->spa_name, name))
715                         return (spa);
716
717         return (0);
718 }
719
720 #ifdef BOOT2
721 static spa_t *
722 spa_get_primary(void)
723 {
724
725         return (STAILQ_FIRST(&zfs_pools));
726 }
727
728 static vdev_t *
729 spa_get_primary_vdev(const spa_t *spa)
730 {
731         vdev_t *vdev;
732         vdev_t *kid;
733
734         if (spa == NULL)
735                 spa = spa_get_primary();
736         if (spa == NULL)
737                 return (NULL);
738         vdev = STAILQ_FIRST(&spa->spa_vdevs);
739         if (vdev == NULL)
740                 return (NULL);
741         for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
742              kid = STAILQ_FIRST(&vdev->v_children))
743                 vdev = kid;
744         return (vdev);
745 }
746 #endif
747
748 static spa_t *
749 spa_create(uint64_t guid, const char *name)
750 {
751         spa_t *spa;
752
753         if ((spa = malloc(sizeof(spa_t))) == NULL)
754                 return (NULL);
755         memset(spa, 0, sizeof(spa_t));
756         if ((spa->spa_name = strdup(name)) == NULL) {
757                 free(spa);
758                 return (NULL);
759         }
760         STAILQ_INIT(&spa->spa_vdevs);
761         spa->spa_guid = guid;
762         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
763
764         return (spa);
765 }
766
767 static const char *
768 state_name(vdev_state_t state)
769 {
770         static const char* names[] = {
771                 "UNKNOWN",
772                 "CLOSED",
773                 "OFFLINE",
774                 "REMOVED",
775                 "CANT_OPEN",
776                 "FAULTED",
777                 "DEGRADED",
778                 "ONLINE"
779         };
780         return names[state];
781 }
782
783 #ifdef BOOT2
784
785 #define pager_printf printf
786
787 #else
788
/*
 * printf()-style front end to pager_output(): format into a local line
 * buffer and return whatever pager_output() returns.
 *
 * The line is limited to the size of the buffer; longer output is
 * truncated rather than written past the end of the stack buffer.
 */
static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);

	return (pager_output(line));
}
801
802 #endif
803
804 #define STATUS_FORMAT   "        %s %s\n"
805
806 static int
807 print_state(int indent, const char *name, vdev_state_t state)
808 {
809         char buf[512];
810         int i;
811
812         buf[0] = 0;
813         for (i = 0; i < indent; i++)
814                 strcat(buf, "  ");
815         strcat(buf, name);
816
817         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
818 }
819
820 static int
821 vdev_status(vdev_t *vdev, int indent)
822 {
823         vdev_t *kid;
824         int ret;
825         ret = print_state(indent, vdev->v_name, vdev->v_state);
826         if (ret != 0)
827                 return (ret);
828
829         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
830                 ret = vdev_status(kid, indent + 1);
831                 if (ret != 0)
832                         return (ret);
833         }
834         return (ret);
835 }
836
837 static int
838 spa_status(spa_t *spa)
839 {
840         static char bootfs[ZFS_MAXNAMELEN];
841         uint64_t rootid;
842         vdev_t *vdev;
843         int good_kids, bad_kids, degraded_kids, ret;
844         vdev_state_t state;
845
846         ret = pager_printf("  pool: %s\n", spa->spa_name);
847         if (ret != 0)
848                 return (ret);
849
850         if (zfs_get_root(spa, &rootid) == 0 &&
851             zfs_rlookup(spa, rootid, bootfs) == 0) {
852                 if (bootfs[0] == '\0')
853                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
854                 else
855                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
856                             bootfs);
857                 if (ret != 0)
858                         return (ret);
859         }
860         ret = pager_printf("config:\n\n");
861         if (ret != 0)
862                 return (ret);
863         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
864         if (ret != 0)
865                 return (ret);
866
867         good_kids = 0;
868         degraded_kids = 0;
869         bad_kids = 0;
870         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
871                 if (vdev->v_state == VDEV_STATE_HEALTHY)
872                         good_kids++;
873                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
874                         degraded_kids++;
875                 else
876                         bad_kids++;
877         }
878
879         state = VDEV_STATE_CLOSED;
880         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
881                 state = VDEV_STATE_HEALTHY;
882         else if ((good_kids + degraded_kids) > 0)
883                 state = VDEV_STATE_DEGRADED;
884
885         ret = print_state(0, spa->spa_name, state);
886         if (ret != 0)
887                 return (ret);
888         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
889                 ret = vdev_status(vdev, 1);
890                 if (ret != 0)
891                         return (ret);
892         }
893         return (ret);
894 }
895
896 static int
897 spa_all_status(void)
898 {
899         spa_t *spa;
900         int first = 1, ret = 0;
901
902         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
903                 if (!first) {
904                         ret = pager_printf("\n");
905                         if (ret != 0)
906                                 return (ret);
907                 }
908                 first = 0;
909                 ret = spa_status(spa);
910                 if (ret != 0)
911                         return (ret);
912         }
913         return (ret);
914 }
915
916 uint64_t
917 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
918 {
919         uint64_t label_offset;
920
921         if (l < VDEV_LABELS / 2)
922                 label_offset = 0;
923         else
924                 label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
925
926         return (offset + l * sizeof (vdev_label_t) + label_offset);
927 }
928
929 static int
930 vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
931 {
932         vdev_t vtmp;
933         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
934         vdev_phys_t *tmp_label;
935         spa_t *spa;
936         vdev_t *vdev, *top_vdev, *pool_vdev;
937         off_t off;
938         blkptr_t bp;
939         const unsigned char *nvlist = NULL;
940         uint64_t val;
941         uint64_t guid;
942         uint64_t best_txg = 0;
943         uint64_t pool_txg, pool_guid;
944         uint64_t psize;
945         const char *pool_name;
946         const unsigned char *vdevs;
947         const unsigned char *features;
948         int i, l, rc, is_newer;
949         char *upbuf;
950         const struct uberblock *up;
951
952         /*
953          * Load the vdev label and figure out which
954          * uberblock is most current.
955          */
956         memset(&vtmp, 0, sizeof(vtmp));
957         vtmp.v_phys_read = _read;
958         vtmp.v_read_priv = read_priv;
959         psize = P2ALIGN(ldi_get_size(read_priv),
960             (uint64_t)sizeof (vdev_label_t));
961
962         /* Test for minimum pool size. */
963         if (psize < SPA_MINDEVSIZE)
964                 return (EIO);
965
966         tmp_label = zfs_alloc(sizeof(vdev_phys_t));
967
968         for (l = 0; l < VDEV_LABELS; l++) {
969                 off = vdev_label_offset(psize, l,
970                     offsetof(vdev_label_t, vl_vdev_phys));
971
972                 BP_ZERO(&bp);
973                 BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
974                 BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
975                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
976                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
977                 DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
978                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
979
980                 if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
981                         continue;
982
983                 if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
984                         continue;
985
986                 nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
987                 if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
988                     DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
989                         continue;
990
991                 if (best_txg <= pool_txg) {
992                         best_txg = pool_txg;
993                         memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
994                 }
995         }
996
997         zfs_free(tmp_label, sizeof (vdev_phys_t));
998
999         if (best_txg == 0)
1000                 return (EIO);
1001
1002         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
1003                 return (EIO);
1004
1005         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
1006
1007         if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
1008             NULL, &val) != 0) {
1009                 return (EIO);
1010         }
1011
1012         if (!SPA_VERSION_IS_SUPPORTED(val)) {
1013                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1014                     (unsigned) val, (unsigned) SPA_VERSION);
1015                 return (EIO);
1016         }
1017
1018         /* Check ZFS features for read */
1019         if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1020             DATA_TYPE_NVLIST, NULL, &features) == 0 &&
1021             nvlist_check_features_for_read(features) != 0) {
1022                 return (EIO);
1023         }
1024
1025         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
1026             NULL, &val) != 0) {
1027                 return (EIO);
1028         }
1029
1030         if (val == POOL_STATE_DESTROYED) {
1031                 /* We don't boot only from destroyed pools. */
1032                 return (EIO);
1033         }
1034
1035         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
1036             NULL, &pool_txg) != 0 ||
1037             nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1038             NULL, &pool_guid) != 0 ||
1039             nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
1040             NULL, &pool_name) != 0) {
1041                 /*
1042                  * Cache and spare devices end up here - just ignore
1043                  * them.
1044                  */
1045                 /*printf("ZFS: can't find pool details\n");*/
1046                 return (EIO);
1047         }
1048
1049         if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
1050             NULL, &val) == 0 && val != 0) {
1051                 return (EIO);
1052         }
1053
1054         /*
1055          * Create the pool if this is the first time we've seen it.
1056          */
1057         spa = spa_find_by_guid(pool_guid);
1058         if (spa == NULL) {
1059                 spa = spa_create(pool_guid, pool_name);
1060                 if (spa == NULL)
1061                         return (ENOMEM);
1062         }
1063         if (pool_txg > spa->spa_txg) {
1064                 spa->spa_txg = pool_txg;
1065                 is_newer = 1;
1066         } else {
1067                 is_newer = 0;
1068         }
1069
1070         /*
1071          * Get the vdev tree and create our in-core copy of it.
1072          * If we already have a vdev with this guid, this must
1073          * be some kind of alias (overlapping slices, dangerously dedicated
1074          * disks etc).
1075          */
1076         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1077             NULL, &guid) != 0) {
1078                 return (EIO);
1079         }
1080         vdev = vdev_find(guid);
1081         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
1082                 return (EIO);
1083
1084         if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1085             NULL, &vdevs)) {
1086                 return (EIO);
1087         }
1088
1089         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1090         if (rc != 0)
1091                 return (rc);
1092
1093         /*
1094          * Add the toplevel vdev to the pool if its not already there.
1095          */
1096         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1097                 if (top_vdev == pool_vdev)
1098                         break;
1099         if (!pool_vdev && top_vdev) {
1100                 top_vdev->spa = spa;
1101                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1102         }
1103
1104         /*
1105          * We should already have created an incomplete vdev for this
1106          * vdev. Find it and initialise it with our read proc.
1107          */
1108         vdev = vdev_find(guid);
1109         if (vdev) {
1110                 vdev->v_phys_read = _read;
1111                 vdev->v_read_priv = read_priv;
1112                 vdev->v_state = VDEV_STATE_HEALTHY;
1113         } else {
1114                 printf("ZFS: inconsistent nvlist contents\n");
1115                 return (EIO);
1116         }
1117
1118         /*
1119          * Re-evaluate top-level vdev state.
1120          */
1121         vdev_set_state(top_vdev);
1122
1123         /*
1124          * Ok, we are happy with the pool so far. Lets find
1125          * the best uberblock and then we can actually access
1126          * the contents of the pool.
1127          */
1128         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1129         up = (const struct uberblock *)upbuf;
1130         for (l = 0; l < VDEV_LABELS; l++) {
1131                 for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
1132                         off = vdev_label_offset(psize, l,
1133                             VDEV_UBERBLOCK_OFFSET(vdev, i));
1134                         BP_ZERO(&bp);
1135                         DVA_SET_OFFSET(&bp.blk_dva[0], off);
1136                         BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1137                         BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1138                         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1139                         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1140                         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1141
1142                         if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1143                                 continue;
1144
1145                         if (up->ub_magic != UBERBLOCK_MAGIC)
1146                                 continue;
1147                         if (up->ub_txg < spa->spa_txg)
1148                                 continue;
1149                         if (up->ub_txg > spa->spa_uberblock.ub_txg ||
1150                             (up->ub_txg == spa->spa_uberblock.ub_txg &&
1151                             up->ub_timestamp >
1152                             spa->spa_uberblock.ub_timestamp)) {
1153                                 spa->spa_uberblock = *up;
1154                         }
1155                 }
1156         }
1157         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1158
1159         vdev->spa = spa;
1160         if (spap != NULL)
1161                 *spap = spa;
1162         return (0);
1163 }
1164
/*
 * Return the base-2 logarithm of n when n is an exact power of two, or -1
 * when it is not (this includes 0).
 *
 * The comparison is done in unsigned arithmetic: shifting a signed 1 into
 * the sign bit (1 << 31) is undefined behaviour, whereas 1U << 31 is well
 * defined.
 */
static int
ilog2(int n)
{
        unsigned int bits = (unsigned int)n;
        int v;

        for (v = 0; v < 32; v++) {
                if (bits == (1U << v))
                        return (v);
        }
        return (-1);
}
1175
1176 static int
1177 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1178 {
1179         blkptr_t gbh_bp;
1180         zio_gbh_phys_t zio_gb;
1181         char *pbuf;
1182         int i;
1183
1184         /* Artificial BP for gang block header. */
1185         gbh_bp = *bp;
1186         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1187         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1188         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1189         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1190         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1191                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1192
1193         /* Read gang header block using the artificial BP. */
1194         if (zio_read(spa, &gbh_bp, &zio_gb))
1195                 return (EIO);
1196
1197         pbuf = buf;
1198         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1199                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1200
1201                 if (BP_IS_HOLE(gbp))
1202                         continue;
1203                 if (zio_read(spa, gbp, pbuf))
1204                         return (EIO);
1205                 pbuf += BP_GET_PSIZE(gbp);
1206         }
1207
1208         if (zio_checksum_verify(spa, bp, buf))
1209                 return (EIO);
1210         return (0);
1211 }
1212
1213 static int
1214 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1215 {
1216         int cpfunc = BP_GET_COMPRESS(bp);
1217         uint64_t align, size;
1218         void *pbuf;
1219         int i, error;
1220
1221         /*
1222          * Process data embedded in block pointer
1223          */
1224         if (BP_IS_EMBEDDED(bp)) {
1225                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1226
1227                 size = BPE_GET_PSIZE(bp);
1228                 ASSERT(size <= BPE_PAYLOAD_SIZE);
1229
1230                 if (cpfunc != ZIO_COMPRESS_OFF)
1231                         pbuf = zfs_alloc(size);
1232                 else
1233                         pbuf = buf;
1234
1235                 decode_embedded_bp_compressed(bp, pbuf);
1236                 error = 0;
1237
1238                 if (cpfunc != ZIO_COMPRESS_OFF) {
1239                         error = zio_decompress_data(cpfunc, pbuf,
1240                             size, buf, BP_GET_LSIZE(bp));
1241                         zfs_free(pbuf, size);
1242                 }
1243                 if (error != 0)
1244                         printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1245                             error);
1246                 return (error);
1247         }
1248
1249         error = EIO;
1250
1251         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1252                 const dva_t *dva = &bp->blk_dva[i];
1253                 vdev_t *vdev;
1254                 int vdevid;
1255                 off_t offset;
1256
1257                 if (!dva->dva_word[0] && !dva->dva_word[1])
1258                         continue;
1259
1260                 vdevid = DVA_GET_VDEV(dva);
1261                 offset = DVA_GET_OFFSET(dva);
1262                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1263                         if (vdev->v_id == vdevid)
1264                                 break;
1265                 }
1266                 if (!vdev || !vdev->v_read)
1267                         continue;
1268
1269                 size = BP_GET_PSIZE(bp);
1270                 if (vdev->v_read == vdev_raidz_read) {
1271                         align = 1ULL << vdev->v_top->v_ashift;
1272                         if (P2PHASE(size, align) != 0)
1273                                 size = P2ROUNDUP(size, align);
1274                 }
1275                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1276                         pbuf = zfs_alloc(size);
1277                 else
1278                         pbuf = buf;
1279
1280                 if (DVA_GET_GANG(dva))
1281                         error = zio_read_gang(spa, bp, pbuf);
1282                 else
1283                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1284                 if (error == 0) {
1285                         if (cpfunc != ZIO_COMPRESS_OFF)
1286                                 error = zio_decompress_data(cpfunc, pbuf,
1287                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1288                         else if (size != BP_GET_PSIZE(bp))
1289                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1290                 }
1291                 if (buf != pbuf)
1292                         zfs_free(pbuf, size);
1293                 if (error == 0)
1294                         break;
1295         }
1296         if (error != 0)
1297                 printf("ZFS: i/o error - all block copies unavailable\n");
1298         return (error);
1299 }
1300
1301 static int
1302 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1303 {
1304         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1305         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1306         int nlevels = dnode->dn_nlevels;
1307         int i, rc;
1308
1309         if (bsize > SPA_MAXBLOCKSIZE) {
1310                 printf("ZFS: I/O error - blocks larger than %llu are not "
1311                     "supported\n", SPA_MAXBLOCKSIZE);
1312                 return (EIO);
1313         }
1314
1315         /*
1316          * Note: bsize may not be a power of two here so we need to do an
1317          * actual divide rather than a bitshift.
1318          */
1319         while (buflen > 0) {
1320                 uint64_t bn = offset / bsize;
1321                 int boff = offset % bsize;
1322                 int ibn;
1323                 const blkptr_t *indbp;
1324                 blkptr_t bp;
1325
1326                 if (bn > dnode->dn_maxblkid)
1327                         return (EIO);
1328
1329                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1330                         goto cached;
1331
1332                 indbp = dnode->dn_blkptr;
1333                 for (i = 0; i < nlevels; i++) {
1334                         /*
1335                          * Copy the bp from the indirect array so that
1336                          * we can re-use the scratch buffer for multi-level
1337                          * objects.
1338                          */
1339                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1340                         ibn &= ((1 << ibshift) - 1);
1341                         bp = indbp[ibn];
1342                         if (BP_IS_HOLE(&bp)) {
1343                                 memset(dnode_cache_buf, 0, bsize);
1344                                 break;
1345                         }
1346                         rc = zio_read(spa, &bp, dnode_cache_buf);
1347                         if (rc)
1348                                 return (rc);
1349                         indbp = (const blkptr_t *) dnode_cache_buf;
1350                 }
1351                 dnode_cache_obj = dnode;
1352                 dnode_cache_bn = bn;
1353         cached:
1354
1355                 /*
1356                  * The buffer contains our data block. Copy what we
1357                  * need from it and loop.
1358                  */ 
1359                 i = bsize - boff;
1360                 if (i > buflen) i = buflen;
1361                 memcpy(buf, &dnode_cache_buf[boff], i);
1362                 buf = ((char*) buf) + i;
1363                 offset += i;
1364                 buflen -= i;
1365         }
1366
1367         return (0);
1368 }
1369
1370 /*
1371  * Lookup a value in a microzap directory. Assumes that the zap
1372  * scratch buffer contains the directory contents.
1373  */
1374 static int
1375 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1376 {
1377         const mzap_phys_t *mz;
1378         const mzap_ent_phys_t *mze;
1379         size_t size;
1380         int chunks, i;
1381
1382         /*
1383          * Microzap objects use exactly one block. Read the whole
1384          * thing.
1385          */
1386         size = dnode->dn_datablkszsec * 512;
1387
1388         mz = (const mzap_phys_t *) zap_scratch;
1389         chunks = size / MZAP_ENT_LEN - 1;
1390
1391         for (i = 0; i < chunks; i++) {
1392                 mze = &mz->mz_chunk[i];
1393                 if (!strcmp(mze->mze_name, name)) {
1394                         *value = mze->mze_value;
1395                         return (0);
1396                 }
1397         }
1398
1399         return (ENOENT);
1400 }
1401
1402 /*
1403  * Compare a name with a zap leaf entry. Return non-zero if the name
1404  * matches.
1405  */
1406 static int
1407 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1408 {
1409         size_t namelen;
1410         const zap_leaf_chunk_t *nc;
1411         const char *p;
1412
1413         namelen = zc->l_entry.le_name_numints;
1414                         
1415         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1416         p = name;
1417         while (namelen > 0) {
1418                 size_t len;
1419                 len = namelen;
1420                 if (len > ZAP_LEAF_ARRAY_BYTES)
1421                         len = ZAP_LEAF_ARRAY_BYTES;
1422                 if (memcmp(p, nc->l_array.la_array, len))
1423                         return (0);
1424                 p += len;
1425                 namelen -= len;
1426                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1427         }
1428
1429         return 1;
1430 }
1431
1432 /*
1433  * Extract a uint64_t value from a zap leaf entry.
1434  */
1435 static uint64_t
1436 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1437 {
1438         const zap_leaf_chunk_t *vc;
1439         int i;
1440         uint64_t value;
1441         const uint8_t *p;
1442
1443         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1444         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1445                 value = (value << 8) | p[i];
1446         }
1447
1448         return value;
1449 }
1450
/*
 * Store `value', truncated to `len' bytes, at `addr'.  Widths of 1, 2, 4
 * and 8 bytes are supported; any other width stores nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
        if (len == 1)
                *(uint8_t *)addr = value;
        else if (len == 2)
                *(uint16_t *)addr = value;
        else if (len == 4)
                *(uint32_t *)addr = value;
        else if (len == 8)
                *(uint64_t *)addr = value;
}
1469
1470 /*
1471  * Extract a array from a zap leaf entry.
1472  */
1473 static void
1474 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
1475     uint64_t integer_size, uint64_t num_integers, void *buf)
1476 {
1477         uint64_t array_int_len = zc->l_entry.le_value_intlen;
1478         uint64_t value = 0;
1479         uint64_t *u64 = buf;
1480         char *p = buf;
1481         int len = MIN(zc->l_entry.le_value_numints, num_integers);
1482         int chunk = zc->l_entry.le_value_chunk;
1483         int byten = 0;
1484
1485         if (integer_size == 8 && len == 1) {
1486                 *u64 = fzap_leaf_value(zl, zc);
1487                 return;
1488         }
1489
1490         while (len > 0) {
1491                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
1492                 int i;
1493
1494                 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
1495                 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
1496                         value = (value << 8) | la->la_array[i];
1497                         byten++;
1498                         if (byten == array_int_len) {
1499                                 stv(integer_size, p, value);
1500                                 byten = 0;
1501                                 len--;
1502                                 if (len == 0)
1503                                         return;
1504                                 p += integer_size;
1505                         }
1506                 }
1507                 chunk = la->la_next;
1508         }
1509 }
1510
1511 /*
1512  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1513  * buffer contains the directory header.
1514  */
1515 static int
1516 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1517     uint64_t integer_size, uint64_t num_integers, void *value)
1518 {
1519         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1520         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1521         fat_zap_t z;
1522         uint64_t *ptrtbl;
1523         uint64_t hash;
1524         int rc;
1525
1526         if (zh.zap_magic != ZAP_MAGIC)
1527                 return (EIO);
1528
1529         z.zap_block_shift = ilog2(bsize);
1530         z.zap_phys = (zap_phys_t *) zap_scratch;
1531
1532         /*
1533          * Figure out where the pointer table is and read it in if necessary.
1534          */
1535         if (zh.zap_ptrtbl.zt_blk) {
1536                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1537                                zap_scratch, bsize);
1538                 if (rc)
1539                         return (rc);
1540                 ptrtbl = (uint64_t *) zap_scratch;
1541         } else {
1542                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1543         }
1544
1545         hash = zap_hash(zh.zap_salt, name);
1546
1547         zap_leaf_t zl;
1548         zl.l_bs = z.zap_block_shift;
1549
1550         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1551         zap_leaf_chunk_t *zc;
1552
1553         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1554         if (rc)
1555                 return (rc);
1556
1557         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1558
1559         /*
1560          * Make sure this chunk matches our hash.
1561          */
1562         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1563             && zl.l_phys->l_hdr.lh_prefix
1564             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1565                 return (ENOENT);
1566
1567         /*
1568          * Hash within the chunk to find our entry.
1569          */
1570         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1571         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1572         h = zl.l_phys->l_hash[h];
1573         if (h == 0xffff)
1574                 return (ENOENT);
1575         zc = &ZAP_LEAF_CHUNK(&zl, h);
1576         while (zc->l_entry.le_hash != hash) {
1577                 if (zc->l_entry.le_next == 0xffff) {
1578                         zc = NULL;
1579                         break;
1580                 }
1581                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1582         }
1583         if (fzap_name_equal(&zl, zc, name)) {
1584                 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
1585                     integer_size * num_integers)
1586                         return (E2BIG);
1587                 fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
1588                 return (0);
1589         }
1590
1591         return (ENOENT);
1592 }
1593
1594 /*
1595  * Lookup a name in a zap object and return its value as a uint64_t.
1596  */
1597 static int
1598 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1599     uint64_t integer_size, uint64_t num_integers, void *value)
1600 {
1601         int rc;
1602         uint64_t zap_type;
1603         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1604
1605         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1606         if (rc)
1607                 return (rc);
1608
1609         zap_type = *(uint64_t *) zap_scratch;
1610         if (zap_type == ZBT_MICRO)
1611                 return mzap_lookup(dnode, name, value);
1612         else if (zap_type == ZBT_HEADER) {
1613                 return fzap_lookup(spa, dnode, name, integer_size,
1614                     num_integers, value);
1615         }
1616         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1617         return (EIO);
1618 }
1619
1620 /*
1621  * List a microzap directory. Assumes that the zap scratch buffer contains
1622  * the directory contents.
1623  */
1624 static int
1625 mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1626 {
1627         const mzap_phys_t *mz;
1628         const mzap_ent_phys_t *mze;
1629         size_t size;
1630         int chunks, i, rc;
1631
1632         /*
1633          * Microzap objects use exactly one block. Read the whole
1634          * thing.
1635          */
1636         size = dnode->dn_datablkszsec * 512;
1637         mz = (const mzap_phys_t *) zap_scratch;
1638         chunks = size / MZAP_ENT_LEN - 1;
1639
1640         for (i = 0; i < chunks; i++) {
1641                 mze = &mz->mz_chunk[i];
1642                 if (mze->mze_name[0]) {
1643                         rc = callback(mze->mze_name, mze->mze_value);
1644                         if (rc != 0)
1645                                 return (rc);
1646                 }
1647         }
1648
1649         return (0);
1650 }
1651
1652 /*
1653  * List a fatzap directory. Assumes that the zap scratch buffer contains
1654  * the directory header.
1655  */
1656 static int
1657 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1658 {
1659         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1660         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1661         fat_zap_t z;
1662         int i, j, rc;
1663
1664         if (zh.zap_magic != ZAP_MAGIC)
1665                 return (EIO);
1666
1667         z.zap_block_shift = ilog2(bsize);
1668         z.zap_phys = (zap_phys_t *) zap_scratch;
1669
1670         /*
1671          * This assumes that the leaf blocks start at block 1. The
1672          * documentation isn't exactly clear on this.
1673          */
1674         zap_leaf_t zl;
1675         zl.l_bs = z.zap_block_shift;
1676         for (i = 0; i < zh.zap_num_leafs; i++) {
1677                 off_t off = (i + 1) << zl.l_bs;
1678                 char name[256], *p;
1679                 uint64_t value;
1680
1681                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1682                         return (EIO);
1683
1684                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1685
1686                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1687                         zap_leaf_chunk_t *zc, *nc;
1688                         int namelen;
1689
1690                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1691                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1692                                 continue;
1693                         namelen = zc->l_entry.le_name_numints;
1694                         if (namelen > sizeof(name))
1695                                 namelen = sizeof(name);
1696
1697                         /*
1698                          * Paste the name back together.
1699                          */
1700                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1701                         p = name;
1702                         while (namelen > 0) {
1703                                 int len;
1704                                 len = namelen;
1705                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1706                                         len = ZAP_LEAF_ARRAY_BYTES;
1707                                 memcpy(p, nc->l_array.la_array, len);
1708                                 p += len;
1709                                 namelen -= len;
1710                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1711                         }
1712
1713                         /*
1714                          * Assume the first eight bytes of the value are
1715                          * a uint64_t.
1716                          */
1717                         value = fzap_leaf_value(&zl, zc);
1718
1719                         //printf("%s 0x%jx\n", name, (uintmax_t)value);
1720                         rc = callback((const char *)name, value);
1721                         if (rc != 0)
1722                                 return (rc);
1723                 }
1724         }
1725
1726         return (0);
1727 }
1728
1729 static int zfs_printf(const char *name, uint64_t value __unused)
1730 {
1731
1732         printf("%s\n", name);
1733
1734         return (0);
1735 }
1736
1737 /*
1738  * List a zap directory.
1739  */
1740 static int
1741 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1742 {
1743         uint64_t zap_type;
1744         size_t size = dnode->dn_datablkszsec * 512;
1745
1746         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1747                 return (EIO);
1748
1749         zap_type = *(uint64_t *) zap_scratch;
1750         if (zap_type == ZBT_MICRO)
1751                 return mzap_list(dnode, zfs_printf);
1752         else
1753                 return fzap_list(spa, dnode, zfs_printf);
1754 }
1755
1756 static int
1757 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1758 {
1759         off_t offset;
1760
1761         offset = objnum * sizeof(dnode_phys_t);
1762         return dnode_read(spa, &os->os_meta_dnode, offset,
1763                 dnode, sizeof(dnode_phys_t));
1764 }
1765
1766 static int
1767 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1768 {
1769         const mzap_phys_t *mz;
1770         const mzap_ent_phys_t *mze;
1771         size_t size;
1772         int chunks, i;
1773
1774         /*
1775          * Microzap objects use exactly one block. Read the whole
1776          * thing.
1777          */
1778         size = dnode->dn_datablkszsec * 512;
1779
1780         mz = (const mzap_phys_t *) zap_scratch;
1781         chunks = size / MZAP_ENT_LEN - 1;
1782
1783         for (i = 0; i < chunks; i++) {
1784                 mze = &mz->mz_chunk[i];
1785                 if (value == mze->mze_value) {
1786                         strcpy(name, mze->mze_name);
1787                         return (0);
1788                 }
1789         }
1790
1791         return (ENOENT);
1792 }
1793
1794 static void
1795 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1796 {
1797         size_t namelen;
1798         const zap_leaf_chunk_t *nc;
1799         char *p;
1800
1801         namelen = zc->l_entry.le_name_numints;
1802
1803         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1804         p = name;
1805         while (namelen > 0) {
1806                 size_t len;
1807                 len = namelen;
1808                 if (len > ZAP_LEAF_ARRAY_BYTES)
1809                         len = ZAP_LEAF_ARRAY_BYTES;
1810                 memcpy(p, nc->l_array.la_array, len);
1811                 p += len;
1812                 namelen -= len;
1813                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1814         }
1815
1816         *p = '\0';
1817 }
1818
1819 static int
1820 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1821 {
1822         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1823         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1824         fat_zap_t z;
1825         int i, j;
1826
1827         if (zh.zap_magic != ZAP_MAGIC)
1828                 return (EIO);
1829
1830         z.zap_block_shift = ilog2(bsize);
1831         z.zap_phys = (zap_phys_t *) zap_scratch;
1832
1833         /*
1834          * This assumes that the leaf blocks start at block 1. The
1835          * documentation isn't exactly clear on this.
1836          */
1837         zap_leaf_t zl;
1838         zl.l_bs = z.zap_block_shift;
1839         for (i = 0; i < zh.zap_num_leafs; i++) {
1840                 off_t off = (i + 1) << zl.l_bs;
1841
1842                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1843                         return (EIO);
1844
1845                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1846
1847                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1848                         zap_leaf_chunk_t *zc;
1849
1850                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1851                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1852                                 continue;
1853                         if (zc->l_entry.le_value_intlen != 8 ||
1854                             zc->l_entry.le_value_numints != 1)
1855                                 continue;
1856
1857                         if (fzap_leaf_value(&zl, zc) == value) {
1858                                 fzap_name_copy(&zl, zc, name);
1859                                 return (0);
1860                         }
1861                 }
1862         }
1863
1864         return (ENOENT);
1865 }
1866
1867 static int
1868 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1869 {
1870         int rc;
1871         uint64_t zap_type;
1872         size_t size = dnode->dn_datablkszsec * 512;
1873
1874         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1875         if (rc)
1876                 return (rc);
1877
1878         zap_type = *(uint64_t *) zap_scratch;
1879         if (zap_type == ZBT_MICRO)
1880                 return mzap_rlookup(spa, dnode, name, value);
1881         else
1882                 return fzap_rlookup(spa, dnode, name, value);
1883 }
1884
1885 static int
1886 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1887 {
1888         char name[256];
1889         char component[256];
1890         uint64_t dir_obj, parent_obj, child_dir_zapobj;
1891         dnode_phys_t child_dir_zap, dataset, dir, parent;
1892         dsl_dir_phys_t *dd;
1893         dsl_dataset_phys_t *ds;
1894         char *p;
1895         int len;
1896
1897         p = &name[sizeof(name) - 1];
1898         *p = '\0';
1899
1900         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1901                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1902                 return (EIO);
1903         }
1904         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1905         dir_obj = ds->ds_dir_obj;
1906
1907         for (;;) {
1908                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1909                         return (EIO);
1910                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1911
1912                 /* Actual loop condition. */
1913                 parent_obj  = dd->dd_parent_obj;
1914                 if (parent_obj == 0)
1915                         break;
1916
1917                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1918                         return (EIO);
1919                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1920                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1921                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1922                         return (EIO);
1923                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1924                         return (EIO);
1925
1926                 len = strlen(component);
1927                 p -= len;
1928                 memcpy(p, component, len);
1929                 --p;
1930                 *p = '/';
1931
1932                 /* Actual loop iteration. */
1933                 dir_obj = parent_obj;
1934         }
1935
1936         if (*p != '\0')
1937                 ++p;
1938         strcpy(result, p);
1939
1940         return (0);
1941 }
1942
1943 static int
1944 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1945 {
1946         char element[256];
1947         uint64_t dir_obj, child_dir_zapobj;
1948         dnode_phys_t child_dir_zap, dir;
1949         dsl_dir_phys_t *dd;
1950         const char *p, *q;
1951
1952         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1953                 return (EIO);
1954         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
1955             1, &dir_obj))
1956                 return (EIO);
1957
1958         p = name;
1959         for (;;) {
1960                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1961                         return (EIO);
1962                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1963
1964                 while (*p == '/')
1965                         p++;
1966                 /* Actual loop condition #1. */
1967                 if (*p == '\0')
1968                         break;
1969
1970                 q = strchr(p, '/');
1971                 if (q) {
1972                         memcpy(element, p, q - p);
1973                         element[q - p] = '\0';
1974                         p = q + 1;
1975                 } else {
1976                         strcpy(element, p);
1977                         p += strlen(p);
1978                 }
1979
1980                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1981                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1982                         return (EIO);
1983
1984                 /* Actual loop condition #2. */
1985                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
1986                     1, &dir_obj) != 0)
1987                         return (ENOENT);
1988         }
1989
1990         *objnum = dd->dd_head_dataset_obj;
1991         return (0);
1992 }
1993
1994 #ifndef BOOT2
1995 static int
1996 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1997 {
1998         uint64_t dir_obj, child_dir_zapobj;
1999         dnode_phys_t child_dir_zap, dir, dataset;
2000         dsl_dataset_phys_t *ds;
2001         dsl_dir_phys_t *dd;
2002
2003         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2004                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2005                 return (EIO);
2006         }
2007         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2008         dir_obj = ds->ds_dir_obj;
2009
2010         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
2011                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2012                 return (EIO);
2013         }
2014         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2015
2016         child_dir_zapobj = dd->dd_child_dir_zapobj;
2017         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
2018                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2019                 return (EIO);
2020         }
2021
2022         return (zap_list(spa, &child_dir_zap) != 0);
2023 }
2024
2025 int
2026 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t))
2027 {
2028         uint64_t dir_obj, child_dir_zapobj, zap_type;
2029         dnode_phys_t child_dir_zap, dir, dataset;
2030         dsl_dataset_phys_t *ds;
2031         dsl_dir_phys_t *dd;
2032         int err;
2033
2034         err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
2035         if (err != 0) {
2036                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2037                 return (err);
2038         }
2039         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2040         dir_obj = ds->ds_dir_obj;
2041
2042         err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
2043         if (err != 0) {
2044                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2045                 return (err);
2046         }
2047         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2048
2049         child_dir_zapobj = dd->dd_child_dir_zapobj;
2050         err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
2051         if (err != 0) {
2052                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2053                 return (err);
2054         }
2055
2056         err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
2057         if (err != 0)
2058                 return (err);
2059
2060         zap_type = *(uint64_t *) zap_scratch;
2061         if (zap_type == ZBT_MICRO)
2062                 return mzap_list(&child_dir_zap, callback);
2063         else
2064                 return fzap_list(spa, &child_dir_zap, callback);
2065 }
2066 #endif
2067
2068 /*
2069  * Find the object set given the object number of its dataset object
2070  * and return its details in *objset
2071  */
2072 static int
2073 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2074 {
2075         dnode_phys_t dataset;
2076         dsl_dataset_phys_t *ds;
2077
2078         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2079                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2080                 return (EIO);
2081         }
2082
2083         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2084         if (zio_read(spa, &ds->ds_bp, objset)) {
2085                 printf("ZFS: can't read object set for dataset %ju\n",
2086                     (uintmax_t)objnum);
2087                 return (EIO);
2088         }
2089
2090         return (0);
2091 }
2092
2093 /*
2094  * Find the object set pointed to by the BOOTFS property or the root
2095  * dataset if there is none and return its details in *objset
2096  */
2097 static int
2098 zfs_get_root(const spa_t *spa, uint64_t *objid)
2099 {
2100         dnode_phys_t dir, propdir;
2101         uint64_t props, bootfs, root;
2102
2103         *objid = 0;
2104
2105         /*
2106          * Start with the MOS directory object.
2107          */
2108         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2109                 printf("ZFS: can't read MOS object directory\n");
2110                 return (EIO);
2111         }
2112
2113         /*
2114          * Lookup the pool_props and see if we can find a bootfs.
2115          */
2116         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0
2117              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
2118              && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0
2119              && bootfs != 0)
2120         {
2121                 *objid = bootfs;
2122                 return (0);
2123         }
2124         /*
2125          * Lookup the root dataset directory
2126          */
2127         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root)
2128             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2129                 printf("ZFS: can't find root dsl_dir\n");
2130                 return (EIO);
2131         }
2132
2133         /*
2134          * Use the information from the dataset directory's bonus buffer
2135          * to find the dataset object and from that the object set itself.
2136          */
2137         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
2138         *objid = dd->dd_head_dataset_obj;
2139         return (0);
2140 }
2141
2142 static int
2143 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
2144 {
2145
2146         mount->spa = spa;
2147
2148         /*
2149          * Find the root object set if not explicitly provided
2150          */
2151         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
2152                 printf("ZFS: can't find root filesystem\n");
2153                 return (EIO);
2154         }
2155
2156         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2157                 printf("ZFS: can't open root filesystem\n");
2158                 return (EIO);
2159         }
2160
2161         mount->rootobj = rootobj;
2162
2163         return (0);
2164 }
2165
2166 /*
2167  * callback function for feature name checks.
2168  */
2169 static int
2170 check_feature(const char *name, uint64_t value)
2171 {
2172         int i;
2173
2174         if (value == 0)
2175                 return (0);
2176         if (name[0] == '\0')
2177                 return (0);
2178
2179         for (i = 0; features_for_read[i] != NULL; i++) {
2180                 if (strcmp(name, features_for_read[i]) == 0)
2181                         return (0);
2182         }
2183         printf("ZFS: unsupported feature: %s\n", name);
2184         return (EIO);
2185 }
2186
2187 /*
2188  * Checks whether the MOS features that are active are supported.
2189  */
2190 static int
2191 check_mos_features(const spa_t *spa)
2192 {
2193         dnode_phys_t dir;
2194         uint64_t objnum, zap_type;
2195         size_t size;
2196         int rc;
2197
2198         if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
2199             &dir)) != 0)
2200                 return (rc);
2201         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
2202             sizeof (objnum), 1, &objnum)) != 0) {
2203                 /*
2204                  * It is older pool without features. As we have already
2205                  * tested the label, just return without raising the error.
2206                  */
2207                 return (0);
2208         }
2209
2210         if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
2211                 return (rc);
2212
2213         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
2214                 return (EIO);
2215
2216         size = dir.dn_datablkszsec * 512;
2217         if (dnode_read(spa, &dir, 0, zap_scratch, size))
2218                 return (EIO);
2219
2220         zap_type = *(uint64_t *) zap_scratch;
2221         if (zap_type == ZBT_MICRO)
2222                 rc = mzap_list(&dir, check_feature);
2223         else
2224                 rc = fzap_list(spa, &dir, check_feature);
2225
2226         return (rc);
2227 }
2228
2229 static int
2230 zfs_spa_init(spa_t *spa)
2231 {
2232         dnode_phys_t dir;
2233         int rc;
2234
2235         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2236                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2237                 return (EIO);
2238         }
2239         if (spa->spa_mos.os_type != DMU_OST_META) {
2240                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2241                 return (EIO);
2242         }
2243
2244         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2245             &dir)) {
2246                 printf("ZFS: failed to read pool %s directory object\n",
2247                     spa->spa_name);
2248                 return (EIO);
2249         }
2250         /* this is allowed to fail, older pools do not have salt */
2251         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
2252             sizeof (spa->spa_cksum_salt.zcs_bytes),
2253             spa->spa_cksum_salt.zcs_bytes);
2254
2255         rc = check_mos_features(spa);
2256         if (rc != 0) {
2257                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
2258         }
2259
2260         return (rc);
2261 }
2262
2263 static int
2264 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2265 {
2266
2267         if (dn->dn_bonustype != DMU_OT_SA) {
2268                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2269
2270                 sb->st_mode = zp->zp_mode;
2271                 sb->st_uid = zp->zp_uid;
2272                 sb->st_gid = zp->zp_gid;
2273                 sb->st_size = zp->zp_size;
2274         } else {
2275                 sa_hdr_phys_t *sahdrp;
2276                 int hdrsize;
2277                 size_t size = 0;
2278                 void *buf = NULL;
2279
2280                 if (dn->dn_bonuslen != 0)
2281                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2282                 else {
2283                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2284                                 blkptr_t *bp = DN_SPILL_BLKPTR(dn);
2285                                 int error;
2286
2287                                 size = BP_GET_LSIZE(bp);
2288                                 buf = zfs_alloc(size);
2289                                 error = zio_read(spa, bp, buf);
2290                                 if (error != 0) {
2291                                         zfs_free(buf, size);
2292                                         return (error);
2293                                 }
2294                                 sahdrp = buf;
2295                         } else {
2296                                 return (EIO);
2297                         }
2298                 }
2299                 hdrsize = SA_HDR_SIZE(sahdrp);
2300                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2301                     SA_MODE_OFFSET);
2302                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2303                     SA_UID_OFFSET);
2304                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2305                     SA_GID_OFFSET);
2306                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2307                     SA_SIZE_OFFSET);
2308                 if (buf != NULL)
2309                         zfs_free(buf, size);
2310         }
2311
2312         return (0);
2313 }
2314
2315 static int
2316 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
2317 {
2318         int rc = 0;
2319
2320         if (dn->dn_bonustype == DMU_OT_SA) {
2321                 sa_hdr_phys_t *sahdrp = NULL;
2322                 size_t size = 0;
2323                 void *buf = NULL;
2324                 int hdrsize;
2325                 char *p;
2326
2327                 if (dn->dn_bonuslen != 0)
2328                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2329                 else {
2330                         blkptr_t *bp;
2331
2332                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
2333                                 return (EIO);
2334                         bp = DN_SPILL_BLKPTR(dn);
2335
2336                         size = BP_GET_LSIZE(bp);
2337                         buf = zfs_alloc(size);
2338                         rc = zio_read(spa, bp, buf);
2339                         if (rc != 0) {
2340                                 zfs_free(buf, size);
2341                                 return (rc);
2342                         }
2343                         sahdrp = buf;
2344                 }
2345                 hdrsize = SA_HDR_SIZE(sahdrp);
2346                 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
2347                 memcpy(path, p, psize);
2348                 if (buf != NULL)
2349                         zfs_free(buf, size);
2350                 return (0);
2351         }
2352         /*
2353          * Second test is purely to silence bogus compiler
2354          * warning about accessing past the end of dn_bonus.
2355          */
2356         if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
2357             sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
2358                 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
2359         } else {
2360                 rc = dnode_read(spa, dn, 0, path, psize);
2361         }
2362         return (rc);
2363 }
2364
/*
 * One element of the object number list kept by zfs_lookup() while it
 * walks a path.
 */
struct obj_list {
	uint64_t		objnum;	/* object number of a visited node */
	STAILQ_ENTRY(obj_list)	entry;	/* singly-linked tail queue linkage */
};
2369
2370 /*
2371  * Lookup a file and return its dnode.
2372  */
2373 static int
2374 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2375 {
2376         int rc;
2377         uint64_t objnum;
2378         const spa_t *spa;
2379         dnode_phys_t dn;
2380         const char *p, *q;
2381         char element[256];
2382         char path[1024];
2383         int symlinks_followed = 0;
2384         struct stat sb;
2385         struct obj_list *entry, *tentry;
2386         STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
2387
2388         spa = mount->spa;
2389         if (mount->objset.os_type != DMU_OST_ZFS) {
2390                 printf("ZFS: unexpected object set type %ju\n",
2391                     (uintmax_t)mount->objset.os_type);
2392                 return (EIO);
2393         }
2394
2395         if ((entry = malloc(sizeof(struct obj_list))) == NULL)
2396                 return (ENOMEM);
2397
2398         /*
2399          * Get the root directory dnode.
2400          */
2401         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2402         if (rc) {
2403                 free(entry);
2404                 return (rc);
2405         }
2406
2407         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
2408         if (rc) {
2409                 free(entry);
2410                 return (rc);
2411         }
2412         entry->objnum = objnum;
2413         STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2414
2415         rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2416         if (rc != 0)
2417                 goto done;
2418
2419         p = upath;
2420         while (p && *p) {
2421                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2422                 if (rc != 0)
2423                         goto done;
2424
2425                 while (*p == '/')
2426                         p++;
2427                 if (*p == '\0')
2428                         break;
2429                 q = p;
2430                 while (*q != '\0' && *q != '/')
2431                         q++;
2432
2433                 /* skip dot */
2434                 if (p + 1 == q && p[0] == '.') {
2435                         p++;
2436                         continue;
2437                 }
2438                 /* double dot */
2439                 if (p + 2 == q && p[0] == '.' && p[1] == '.') {
2440                         p += 2;
2441                         if (STAILQ_FIRST(&on_cache) ==
2442                             STAILQ_LAST(&on_cache, obj_list, entry)) {
2443                                 rc = ENOENT;
2444                                 goto done;
2445                         }
2446                         entry = STAILQ_FIRST(&on_cache);
2447                         STAILQ_REMOVE_HEAD(&on_cache, entry);
2448                         free(entry);
2449                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
2450                         continue;
2451                 }
2452                 if (q - p + 1 > sizeof(element)) {
2453                         rc = ENAMETOOLONG;
2454                         goto done;
2455                 }
2456                 memcpy(element, p, q - p);
2457                 element[q - p] = 0;
2458                 p = q;
2459
2460                 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
2461                         goto done;
2462                 if (!S_ISDIR(sb.st_mode)) {
2463                         rc = ENOTDIR;
2464                         goto done;
2465                 }
2466
2467                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
2468                 if (rc)
2469                         goto done;
2470                 objnum = ZFS_DIRENT_OBJ(objnum);
2471
2472                 if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
2473                         rc = ENOMEM;
2474                         goto done;
2475                 }
2476                 entry->objnum = objnum;
2477                 STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2478                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2479                 if (rc)
2480                         goto done;
2481
2482                 /*
2483                  * Check for symlink.
2484                  */
2485                 rc = zfs_dnode_stat(spa, &dn, &sb);
2486                 if (rc)
2487                         goto done;
2488                 if (S_ISLNK(sb.st_mode)) {
2489                         if (symlinks_followed > 10) {
2490                                 rc = EMLINK;
2491                                 goto done;
2492                         }
2493                         symlinks_followed++;
2494
2495                         /*
2496                          * Read the link value and copy the tail of our
2497                          * current path onto the end.
2498                          */
2499                         if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
2500                                 rc = ENAMETOOLONG;
2501                                 goto done;
2502                         }
2503                         strcpy(&path[sb.st_size], p);
2504
2505                         rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
2506                         if (rc != 0)
2507                                 goto done;
2508
2509                         /*
2510                          * Restart with the new path, starting either at
2511                          * the root or at the parent depending whether or
2512                          * not the link is relative.
2513                          */
2514                         p = path;
2515                         if (*p == '/') {
2516                                 while (STAILQ_FIRST(&on_cache) !=
2517                                     STAILQ_LAST(&on_cache, obj_list, entry)) {
2518                                         entry = STAILQ_FIRST(&on_cache);
2519                                         STAILQ_REMOVE_HEAD(&on_cache, entry);
2520                                         free(entry);
2521                                 }
2522                         } else {
2523                                 entry = STAILQ_FIRST(&on_cache);
2524                                 STAILQ_REMOVE_HEAD(&on_cache, entry);
2525                                 free(entry);
2526                         }
2527                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
2528                 }
2529         }
2530
2531         *dnode = dn;
2532 done:
2533         STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
2534                 free(entry);
2535         return (rc);
2536 }