]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - stand/libsa/zfs/zfsimpl.c
MFHead @348740
[FreeBSD/FreeBSD.git] / stand / libsa / zfs / zfsimpl.c
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31  *      Stand-alone ZFS file reader.
32  */
33
34 #include <sys/endian.h>
35 #include <sys/stat.h>
36 #include <sys/stdint.h>
37
38 #include "zfsimpl.h"
39 #include "zfssubr.c"
40
41
42 struct zfsmount {
43         const spa_t     *spa;
44         objset_phys_t   objset;
45         uint64_t        rootobj;
46 };
47 static struct zfsmount zfsmount __unused;
48
49 /*
50  * List of all vdevs, chained through v_alllink.
51  */
52 static vdev_list_t zfs_vdevs;
53
/*
 * Names of the pool features this reader accepts.  The table is
 * NULL-terminated; nvlist_check_features_for_read() walks it until it
 * reaches the NULL entry.
 */
static const char *features_for_read[] = {
	/* org.illumos / org.open-zfs / org.zfsonlinux / com.* name spaces */
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.zfsonlinux:large_dnode",
	"com.joyent:multi_vdev_crash_dump",
	NULL	/* end of table */
};
69
70 /*
71  * List of all pools, chained through spa_link.
72  */
73 static spa_list_t zfs_pools;
74
75 static const dnode_phys_t *dnode_cache_obj;
76 static uint64_t dnode_cache_bn;
77 static char *dnode_cache_buf;
78 static char *zap_scratch;
79 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
80
81 #define TEMP_SIZE       (1024 * 1024)
82
83 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
84 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
85 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
86 static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
87     const char *name, uint64_t integer_size, uint64_t num_integers,
88     void *value);
89
90 static void
91 zfs_init(void)
92 {
93         STAILQ_INIT(&zfs_vdevs);
94         STAILQ_INIT(&zfs_pools);
95
96         zfs_temp_buf = malloc(TEMP_SIZE);
97         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
98         zfs_temp_ptr = zfs_temp_buf;
99         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
100         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
101
102         zfs_init_crc();
103 }
104
105 static void *
106 zfs_alloc(size_t size)
107 {
108         char *ptr;
109
110         if (zfs_temp_ptr + size > zfs_temp_end) {
111                 panic("ZFS: out of temporary buffer space");
112         }
113         ptr = zfs_temp_ptr;
114         zfs_temp_ptr += size;
115
116         return (ptr);
117 }
118
119 static void
120 zfs_free(void *ptr, size_t size)
121 {
122
123         zfs_temp_ptr -= size;
124         if (zfs_temp_ptr != ptr) {
125                 panic("ZFS: zfs_alloc()/zfs_free() mismatch");
126         }
127 }
128
/*
 * Decode one big-endian 32-bit value at *xdr into *ip and step *xdr past
 * the four bytes consumed.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *cursor = *xdr;

	*ip = be32dec(cursor);
	*xdr = cursor + 4;
	return (0);
}
136
137 static int
138 xdr_u_int(const unsigned char **xdr, u_int *ip)
139 {
140         *ip = be32dec(*xdr);
141         (*xdr) += 4;
142         return (0);
143 }
144
145 static int
146 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
147 {
148         u_int hi, lo;
149
150         xdr_u_int(xdr, &hi);
151         xdr_u_int(xdr, &lo);
152         *lp = (((uint64_t) hi) << 32) | lo;
153         return (0);
154 }
155
156 static int
157 nvlist_find(const unsigned char *nvlist, const char *name, int type,
158             int* elementsp, void *valuep)
159 {
160         const unsigned char *p, *pair;
161         int junk;
162         int encoded_size, decoded_size;
163
164         p = nvlist;
165         xdr_int(&p, &junk);
166         xdr_int(&p, &junk);
167
168         pair = p;
169         xdr_int(&p, &encoded_size);
170         xdr_int(&p, &decoded_size);
171         while (encoded_size && decoded_size) {
172                 int namelen, pairtype, elements;
173                 const char *pairname;
174
175                 xdr_int(&p, &namelen);
176                 pairname = (const char*) p;
177                 p += roundup(namelen, 4);
178                 xdr_int(&p, &pairtype);
179
180                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
181                         xdr_int(&p, &elements);
182                         if (elementsp)
183                                 *elementsp = elements;
184                         if (type == DATA_TYPE_UINT64) {
185                                 xdr_uint64_t(&p, (uint64_t *) valuep);
186                                 return (0);
187                         } else if (type == DATA_TYPE_STRING) {
188                                 int len;
189                                 xdr_int(&p, &len);
190                                 (*(const char**) valuep) = (const char*) p;
191                                 return (0);
192                         } else if (type == DATA_TYPE_NVLIST
193                                    || type == DATA_TYPE_NVLIST_ARRAY) {
194                                 (*(const unsigned char**) valuep) =
195                                          (const unsigned char*) p;
196                                 return (0);
197                         } else {
198                                 return (EIO);
199                         }
200                 } else {
201                         /*
202                          * Not the pair we are looking for, skip to the next one.
203                          */
204                         p = pair + encoded_size;
205                 }
206
207                 pair = p;
208                 xdr_int(&p, &encoded_size);
209                 xdr_int(&p, &decoded_size);
210         }
211
212         return (EIO);
213 }
214
215 static int
216 nvlist_check_features_for_read(const unsigned char *nvlist)
217 {
218         const unsigned char *p, *pair;
219         int junk;
220         int encoded_size, decoded_size;
221         int rc;
222
223         rc = 0;
224
225         p = nvlist;
226         xdr_int(&p, &junk);
227         xdr_int(&p, &junk);
228
229         pair = p;
230         xdr_int(&p, &encoded_size);
231         xdr_int(&p, &decoded_size);
232         while (encoded_size && decoded_size) {
233                 int namelen, pairtype;
234                 const char *pairname;
235                 int i, found;
236
237                 found = 0;
238
239                 xdr_int(&p, &namelen);
240                 pairname = (const char*) p;
241                 p += roundup(namelen, 4);
242                 xdr_int(&p, &pairtype);
243
244                 for (i = 0; features_for_read[i] != NULL; i++) {
245                         if (!memcmp(pairname, features_for_read[i], namelen)) {
246                                 found = 1;
247                                 break;
248                         }
249                 }
250
251                 if (!found) {
252                         printf("ZFS: unsupported feature: %s\n", pairname);
253                         rc = EIO;
254                 }
255
256                 p = pair + encoded_size;
257
258                 pair = p;
259                 xdr_int(&p, &encoded_size);
260                 xdr_int(&p, &decoded_size);
261         }
262
263         return (rc);
264 }
265
/*
 * Step over one encoded nvlist and return the address just past it,
 * which is where the next nvlist of an nvlist array starts.
 *
 * After the two ignored header words, pairs are skipped by their encoded
 * size until a pair whose encoded or decoded size is zero has been read.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int ignored;
	int enc_len, dec_len;

	cursor = nvlist;
	xdr_int(&cursor, &ignored);
	xdr_int(&cursor, &ignored);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		cursor = pair_start + enc_len;
	}

	return (cursor);
}
293
294 #ifdef TEST
295
296 static const unsigned char *
297 nvlist_print(const unsigned char *nvlist, unsigned int indent)
298 {
299         static const char* typenames[] = {
300                 "DATA_TYPE_UNKNOWN",
301                 "DATA_TYPE_BOOLEAN",
302                 "DATA_TYPE_BYTE",
303                 "DATA_TYPE_INT16",
304                 "DATA_TYPE_UINT16",
305                 "DATA_TYPE_INT32",
306                 "DATA_TYPE_UINT32",
307                 "DATA_TYPE_INT64",
308                 "DATA_TYPE_UINT64",
309                 "DATA_TYPE_STRING",
310                 "DATA_TYPE_BYTE_ARRAY",
311                 "DATA_TYPE_INT16_ARRAY",
312                 "DATA_TYPE_UINT16_ARRAY",
313                 "DATA_TYPE_INT32_ARRAY",
314                 "DATA_TYPE_UINT32_ARRAY",
315                 "DATA_TYPE_INT64_ARRAY",
316                 "DATA_TYPE_UINT64_ARRAY",
317                 "DATA_TYPE_STRING_ARRAY",
318                 "DATA_TYPE_HRTIME",
319                 "DATA_TYPE_NVLIST",
320                 "DATA_TYPE_NVLIST_ARRAY",
321                 "DATA_TYPE_BOOLEAN_VALUE",
322                 "DATA_TYPE_INT8",
323                 "DATA_TYPE_UINT8",
324                 "DATA_TYPE_BOOLEAN_ARRAY",
325                 "DATA_TYPE_INT8_ARRAY",
326                 "DATA_TYPE_UINT8_ARRAY"
327         };
328
329         unsigned int i, j;
330         const unsigned char *p, *pair;
331         int junk;
332         int encoded_size, decoded_size;
333
334         p = nvlist;
335         xdr_int(&p, &junk);
336         xdr_int(&p, &junk);
337
338         pair = p;
339         xdr_int(&p, &encoded_size);
340         xdr_int(&p, &decoded_size);
341         while (encoded_size && decoded_size) {
342                 int namelen, pairtype, elements;
343                 const char *pairname;
344
345                 xdr_int(&p, &namelen);
346                 pairname = (const char*) p;
347                 p += roundup(namelen, 4);
348                 xdr_int(&p, &pairtype);
349
350                 for (i = 0; i < indent; i++)
351                         printf(" ");
352                 printf("%s %s", typenames[pairtype], pairname);
353
354                 xdr_int(&p, &elements);
355                 switch (pairtype) {
356                 case DATA_TYPE_UINT64: {
357                         uint64_t val;
358                         xdr_uint64_t(&p, &val);
359                         printf(" = 0x%jx\n", (uintmax_t)val);
360                         break;
361                 }
362
363                 case DATA_TYPE_STRING: {
364                         int len;
365                         xdr_int(&p, &len);
366                         printf(" = \"%s\"\n", p);
367                         break;
368                 }
369
370                 case DATA_TYPE_NVLIST:
371                         printf("\n");
372                         nvlist_print(p, indent + 1);
373                         break;
374
375                 case DATA_TYPE_NVLIST_ARRAY:
376                         for (j = 0; j < elements; j++) {
377                                 printf("[%d]\n", j);
378                                 p = nvlist_print(p, indent + 1);
379                                 if (j != elements - 1) {
380                                         for (i = 0; i < indent; i++)
381                                                 printf(" ");
382                                         printf("%s %s", typenames[pairtype], pairname);
383                                 }
384                         }
385                         break;
386
387                 default:
388                         printf("\n");
389                 }
390
391                 p = pair + encoded_size;
392
393                 pair = p;
394                 xdr_int(&p, &encoded_size);
395                 xdr_int(&p, &decoded_size);
396         }
397
398         return p;
399 }
400
401 #endif
402
403 static int
404 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
405     off_t offset, size_t size)
406 {
407         size_t psize;
408         int rc;
409
410         if (!vdev->v_phys_read)
411                 return (EIO);
412
413         if (bp) {
414                 psize = BP_GET_PSIZE(bp);
415         } else {
416                 psize = size;
417         }
418
419         /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
420         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
421         if (rc)
422                 return (rc);
423         if (bp && zio_checksum_verify(vdev->spa, bp, buf))
424                 return (EIO);
425
426         return (0);
427 }
428
429 static int
430 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
431     off_t offset, size_t bytes)
432 {
433
434         return (vdev_read_phys(vdev, bp, buf,
435                 offset + VDEV_LABEL_START_SIZE, bytes));
436 }
437
438
439 static int
440 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
441     off_t offset, size_t bytes)
442 {
443         vdev_t *kid;
444         int rc;
445
446         rc = EIO;
447         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
448                 if (kid->v_state != VDEV_STATE_HEALTHY)
449                         continue;
450                 rc = kid->v_read(kid, bp, buf, offset, bytes);
451                 if (!rc)
452                         return (0);
453         }
454
455         return (rc);
456 }
457
458 static int
459 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
460     off_t offset, size_t bytes)
461 {
462         vdev_t *kid;
463
464         /*
465          * Here we should have two kids:
466          * First one which is the one we are replacing and we can trust
467          * only this one to have valid data, but it might not be present.
468          * Second one is that one we are replacing with. It is most likely
469          * healthy, but we can't trust it has needed data, so we won't use it.
470          */
471         kid = STAILQ_FIRST(&vdev->v_children);
472         if (kid == NULL)
473                 return (EIO);
474         if (kid->v_state != VDEV_STATE_HEALTHY)
475                 return (EIO);
476         return (kid->v_read(kid, bp, buf, offset, bytes));
477 }
478
479 static vdev_t *
480 vdev_find(uint64_t guid)
481 {
482         vdev_t *vdev;
483
484         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
485                 if (vdev->v_guid == guid)
486                         return (vdev);
487
488         return (0);
489 }
490
491 static vdev_t *
492 vdev_create(uint64_t guid, vdev_read_t *_read)
493 {
494         vdev_t *vdev;
495
496         vdev = malloc(sizeof(vdev_t));
497         memset(vdev, 0, sizeof(vdev_t));
498         STAILQ_INIT(&vdev->v_children);
499         vdev->v_guid = guid;
500         vdev->v_state = VDEV_STATE_OFFLINE;
501         vdev->v_read = _read;
502         vdev->v_phys_read = 0;
503         vdev->v_read_priv = 0;
504         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
505
506         return (vdev);
507 }
508
509 static int
510 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
511     vdev_t **vdevp, int is_newer)
512 {
513         int rc;
514         uint64_t guid, id, ashift, nparity;
515         const char *type;
516         const char *path;
517         vdev_t *vdev, *kid;
518         const unsigned char *kids;
519         int nkids, i, is_new;
520         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
521
522         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
523             NULL, &guid)
524             || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
525             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
526             NULL, &type)) {
527                 printf("ZFS: can't find vdev details\n");
528                 return (ENOENT);
529         }
530
531         if (strcmp(type, VDEV_TYPE_MIRROR)
532             && strcmp(type, VDEV_TYPE_DISK)
533 #ifdef ZFS_TEST
534             && strcmp(type, VDEV_TYPE_FILE)
535 #endif
536             && strcmp(type, VDEV_TYPE_RAIDZ)
537             && strcmp(type, VDEV_TYPE_REPLACING)) {
538                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
539                 return (EIO);
540         }
541
542         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
543
544         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
545                         &is_offline);
546         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
547                         &is_removed);
548         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
549                         &is_faulted);
550         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
551                         &is_degraded);
552         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
553                         &isnt_present);
554
555         vdev = vdev_find(guid);
556         if (!vdev) {
557                 is_new = 1;
558
559                 if (!strcmp(type, VDEV_TYPE_MIRROR))
560                         vdev = vdev_create(guid, vdev_mirror_read);
561                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
562                         vdev = vdev_create(guid, vdev_raidz_read);
563                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
564                         vdev = vdev_create(guid, vdev_replacing_read);
565                 else
566                         vdev = vdev_create(guid, vdev_disk_read);
567
568                 vdev->v_id = id;
569                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
570                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
571                         DATA_TYPE_UINT64, NULL, &ashift) == 0) {
572                         vdev->v_ashift = ashift;
573                 } else {
574                         vdev->v_ashift = 0;
575                 }
576                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
577                         DATA_TYPE_UINT64, NULL, &nparity) == 0) {
578                         vdev->v_nparity = nparity;
579                 } else {
580                         vdev->v_nparity = 0;
581                 }
582                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
583                                 DATA_TYPE_STRING, NULL, &path) == 0) {
584                         if (strncmp(path, "/dev/", 5) == 0)
585                                 path += 5;
586                         vdev->v_name = strdup(path);
587                 } else {
588                         if (!strcmp(type, "raidz")) {
589                                 if (vdev->v_nparity == 1)
590                                         vdev->v_name = "raidz1";
591                                 else if (vdev->v_nparity == 2)
592                                         vdev->v_name = "raidz2";
593                                 else if (vdev->v_nparity == 3)
594                                         vdev->v_name = "raidz3";
595                                 else {
596                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
597                                         return (EIO);
598                                 }
599                         } else {
600                                 vdev->v_name = strdup(type);
601                         }
602                 }
603         } else {
604                 is_new = 0;
605         }
606
607         if (is_new || is_newer) {
608                 /*
609                  * This is either new vdev or we've already seen this vdev,
610                  * but from an older vdev label, so let's refresh its state
611                  * from the newer label.
612                  */
613                 if (is_offline)
614                         vdev->v_state = VDEV_STATE_OFFLINE;
615                 else if (is_removed)
616                         vdev->v_state = VDEV_STATE_REMOVED;
617                 else if (is_faulted)
618                         vdev->v_state = VDEV_STATE_FAULTED;
619                 else if (is_degraded)
620                         vdev->v_state = VDEV_STATE_DEGRADED;
621                 else if (isnt_present)
622                         vdev->v_state = VDEV_STATE_CANT_OPEN;
623         }
624
625         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
626             &nkids, &kids);
627         /*
628          * Its ok if we don't have any kids.
629          */
630         if (rc == 0) {
631                 vdev->v_nchildren = nkids;
632                 for (i = 0; i < nkids; i++) {
633                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
634                         if (rc)
635                                 return (rc);
636                         if (is_new)
637                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
638                                                    v_childlink);
639                         kids = nvlist_next(kids);
640                 }
641         } else {
642                 vdev->v_nchildren = 0;
643         }
644
645         if (vdevp)
646                 *vdevp = vdev;
647         return (0);
648 }
649
650 static void
651 vdev_set_state(vdev_t *vdev)
652 {
653         vdev_t *kid;
654         int good_kids;
655         int bad_kids;
656
657         /*
658          * A mirror or raidz is healthy if all its kids are healthy. A
659          * mirror is degraded if any of its kids is healthy; a raidz
660          * is degraded if at most nparity kids are offline.
661          */
662         if (STAILQ_FIRST(&vdev->v_children)) {
663                 good_kids = 0;
664                 bad_kids = 0;
665                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
666                         if (kid->v_state == VDEV_STATE_HEALTHY)
667                                 good_kids++;
668                         else
669                                 bad_kids++;
670                 }
671                 if (bad_kids == 0) {
672                         vdev->v_state = VDEV_STATE_HEALTHY;
673                 } else {
674                         if (vdev->v_read == vdev_mirror_read) {
675                                 if (good_kids) {
676                                         vdev->v_state = VDEV_STATE_DEGRADED;
677                                 } else {
678                                         vdev->v_state = VDEV_STATE_OFFLINE;
679                                 }
680                         } else if (vdev->v_read == vdev_raidz_read) {
681                                 if (bad_kids > vdev->v_nparity) {
682                                         vdev->v_state = VDEV_STATE_OFFLINE;
683                                 } else {
684                                         vdev->v_state = VDEV_STATE_DEGRADED;
685                                 }
686                         }
687                 }
688         }
689 }
690
691 static spa_t *
692 spa_find_by_guid(uint64_t guid)
693 {
694         spa_t *spa;
695
696         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
697                 if (spa->spa_guid == guid)
698                         return (spa);
699
700         return (0);
701 }
702
703 static spa_t *
704 spa_find_by_name(const char *name)
705 {
706         spa_t *spa;
707
708         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
709                 if (!strcmp(spa->spa_name, name))
710                         return (spa);
711
712         return (0);
713 }
714
715 #ifdef BOOT2
716 static spa_t *
717 spa_get_primary(void)
718 {
719
720         return (STAILQ_FIRST(&zfs_pools));
721 }
722
723 static vdev_t *
724 spa_get_primary_vdev(const spa_t *spa)
725 {
726         vdev_t *vdev;
727         vdev_t *kid;
728
729         if (spa == NULL)
730                 spa = spa_get_primary();
731         if (spa == NULL)
732                 return (NULL);
733         vdev = STAILQ_FIRST(&spa->spa_vdevs);
734         if (vdev == NULL)
735                 return (NULL);
736         for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
737              kid = STAILQ_FIRST(&vdev->v_children))
738                 vdev = kid;
739         return (vdev);
740 }
741 #endif
742
743 static spa_t *
744 spa_create(uint64_t guid, const char *name)
745 {
746         spa_t *spa;
747
748         if ((spa = calloc(1, sizeof(spa_t))) == NULL)
749                 return (NULL);
750         if ((spa->spa_name = strdup(name)) == NULL) {
751                 free(spa);
752                 return (NULL);
753         }
754         STAILQ_INIT(&spa->spa_vdevs);
755         spa->spa_guid = guid;
756         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
757
758         return (spa);
759 }
760
761 static const char *
762 state_name(vdev_state_t state)
763 {
764         static const char* names[] = {
765                 "UNKNOWN",
766                 "CLOSED",
767                 "OFFLINE",
768                 "REMOVED",
769                 "CANT_OPEN",
770                 "FAULTED",
771                 "DEGRADED",
772                 "ONLINE"
773         };
774         return names[state];
775 }
776
777 #ifdef BOOT2
778
779 #define pager_printf printf
780
781 #else
782
/*
 * printf()-style front end for pager_output().
 *
 * The text is formatted into an 80-byte line buffer; anything that does
 * not fit is truncated instead of overrunning the buffer.  Returns
 * whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	/* Bounded: pool and vdev names can be longer than the buffer. */
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);

	return (pager_output(line));
}
795
796 #endif
797
798 #define STATUS_FORMAT   "        %s %s\n"
799
800 static int
801 print_state(int indent, const char *name, vdev_state_t state)
802 {
803         char buf[512];
804         int i;
805
806         buf[0] = 0;
807         for (i = 0; i < indent; i++)
808                 strcat(buf, "  ");
809         strcat(buf, name);
810
811         return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
812 }
813
814 static int
815 vdev_status(vdev_t *vdev, int indent)
816 {
817         vdev_t *kid;
818         int ret;
819         ret = print_state(indent, vdev->v_name, vdev->v_state);
820         if (ret != 0)
821                 return (ret);
822
823         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
824                 ret = vdev_status(kid, indent + 1);
825                 if (ret != 0)
826                         return (ret);
827         }
828         return (ret);
829 }
830
831 static int
832 spa_status(spa_t *spa)
833 {
834         static char bootfs[ZFS_MAXNAMELEN];
835         uint64_t rootid;
836         vdev_t *vdev;
837         int good_kids, bad_kids, degraded_kids, ret;
838         vdev_state_t state;
839
840         ret = pager_printf("  pool: %s\n", spa->spa_name);
841         if (ret != 0)
842                 return (ret);
843
844         if (zfs_get_root(spa, &rootid) == 0 &&
845             zfs_rlookup(spa, rootid, bootfs) == 0) {
846                 if (bootfs[0] == '\0')
847                         ret = pager_printf("bootfs: %s\n", spa->spa_name);
848                 else
849                         ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
850                             bootfs);
851                 if (ret != 0)
852                         return (ret);
853         }
854         ret = pager_printf("config:\n\n");
855         if (ret != 0)
856                 return (ret);
857         ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
858         if (ret != 0)
859                 return (ret);
860
861         good_kids = 0;
862         degraded_kids = 0;
863         bad_kids = 0;
864         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
865                 if (vdev->v_state == VDEV_STATE_HEALTHY)
866                         good_kids++;
867                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
868                         degraded_kids++;
869                 else
870                         bad_kids++;
871         }
872
873         state = VDEV_STATE_CLOSED;
874         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
875                 state = VDEV_STATE_HEALTHY;
876         else if ((good_kids + degraded_kids) > 0)
877                 state = VDEV_STATE_DEGRADED;
878
879         ret = print_state(0, spa->spa_name, state);
880         if (ret != 0)
881                 return (ret);
882         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
883                 ret = vdev_status(vdev, 1);
884                 if (ret != 0)
885                         return (ret);
886         }
887         return (ret);
888 }
889
890 static int
891 spa_all_status(void)
892 {
893         spa_t *spa;
894         int first = 1, ret = 0;
895
896         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
897                 if (!first) {
898                         ret = pager_printf("\n");
899                         if (ret != 0)
900                                 return (ret);
901                 }
902                 first = 0;
903                 ret = spa_status(spa);
904                 if (ret != 0)
905                         return (ret);
906         }
907         return (ret);
908 }
909
910 static uint64_t
911 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
912 {
913         uint64_t label_offset;
914
915         if (l < VDEV_LABELS / 2)
916                 label_offset = 0;
917         else
918                 label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
919
920         return (offset + l * sizeof (vdev_label_t) + label_offset);
921 }
922
923 static int
924 vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
925 {
926         vdev_t vtmp;
927         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
928         vdev_phys_t *tmp_label;
929         spa_t *spa;
930         vdev_t *vdev, *top_vdev, *pool_vdev;
931         off_t off;
932         blkptr_t bp;
933         const unsigned char *nvlist = NULL;
934         uint64_t val;
935         uint64_t guid;
936         uint64_t best_txg = 0;
937         uint64_t pool_txg, pool_guid;
938         uint64_t psize;
939         const char *pool_name;
940         const unsigned char *vdevs;
941         const unsigned char *features;
942         int i, l, rc, is_newer;
943         char *upbuf;
944         const struct uberblock *up;
945
946         /*
947          * Load the vdev label and figure out which
948          * uberblock is most current.
949          */
950         memset(&vtmp, 0, sizeof(vtmp));
951         vtmp.v_phys_read = _read;
952         vtmp.v_read_priv = read_priv;
953         psize = P2ALIGN(ldi_get_size(read_priv),
954             (uint64_t)sizeof (vdev_label_t));
955
956         /* Test for minimum pool size. */
957         if (psize < SPA_MINDEVSIZE)
958                 return (EIO);
959
960         tmp_label = zfs_alloc(sizeof(vdev_phys_t));
961
962         for (l = 0; l < VDEV_LABELS; l++) {
963                 off = vdev_label_offset(psize, l,
964                     offsetof(vdev_label_t, vl_vdev_phys));
965
966                 BP_ZERO(&bp);
967                 BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
968                 BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
969                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
970                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
971                 DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
972                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
973
974                 if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
975                         continue;
976
977                 if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
978                         continue;
979
980                 nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
981                 if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
982                     DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
983                         continue;
984
985                 if (best_txg <= pool_txg) {
986                         best_txg = pool_txg;
987                         memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
988                 }
989         }
990
991         zfs_free(tmp_label, sizeof (vdev_phys_t));
992
993         if (best_txg == 0)
994                 return (EIO);
995
996         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
997                 return (EIO);
998
999         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
1000
1001         if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
1002             NULL, &val) != 0) {
1003                 return (EIO);
1004         }
1005
1006         if (!SPA_VERSION_IS_SUPPORTED(val)) {
1007                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1008                     (unsigned) val, (unsigned) SPA_VERSION);
1009                 return (EIO);
1010         }
1011
1012         /* Check ZFS features for read */
1013         if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1014             DATA_TYPE_NVLIST, NULL, &features) == 0 &&
1015             nvlist_check_features_for_read(features) != 0) {
1016                 return (EIO);
1017         }
1018
1019         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
1020             NULL, &val) != 0) {
1021                 return (EIO);
1022         }
1023
1024         if (val == POOL_STATE_DESTROYED) {
1025                 /* We don't boot only from destroyed pools. */
1026                 return (EIO);
1027         }
1028
1029         if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
1030             NULL, &pool_txg) != 0 ||
1031             nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1032             NULL, &pool_guid) != 0 ||
1033             nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
1034             NULL, &pool_name) != 0) {
1035                 /*
1036                  * Cache and spare devices end up here - just ignore
1037                  * them.
1038                  */
1039                 /*printf("ZFS: can't find pool details\n");*/
1040                 return (EIO);
1041         }
1042
1043         if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
1044             NULL, &val) == 0 && val != 0) {
1045                 return (EIO);
1046         }
1047
1048         /*
1049          * Create the pool if this is the first time we've seen it.
1050          */
1051         spa = spa_find_by_guid(pool_guid);
1052         if (spa == NULL) {
1053                 spa = spa_create(pool_guid, pool_name);
1054                 if (spa == NULL)
1055                         return (ENOMEM);
1056         }
1057         if (pool_txg > spa->spa_txg) {
1058                 spa->spa_txg = pool_txg;
1059                 is_newer = 1;
1060         } else {
1061                 is_newer = 0;
1062         }
1063
1064         /*
1065          * Get the vdev tree and create our in-core copy of it.
1066          * If we already have a vdev with this guid, this must
1067          * be some kind of alias (overlapping slices, dangerously dedicated
1068          * disks etc).
1069          */
1070         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1071             NULL, &guid) != 0) {
1072                 return (EIO);
1073         }
1074         vdev = vdev_find(guid);
1075         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
1076                 return (EIO);
1077
1078         if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1079             NULL, &vdevs)) {
1080                 return (EIO);
1081         }
1082
1083         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1084         if (rc != 0)
1085                 return (rc);
1086
1087         /*
1088          * Add the toplevel vdev to the pool if its not already there.
1089          */
1090         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1091                 if (top_vdev == pool_vdev)
1092                         break;
1093         if (!pool_vdev && top_vdev) {
1094                 top_vdev->spa = spa;
1095                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1096         }
1097
1098         /*
1099          * We should already have created an incomplete vdev for this
1100          * vdev. Find it and initialise it with our read proc.
1101          */
1102         vdev = vdev_find(guid);
1103         if (vdev) {
1104                 vdev->v_phys_read = _read;
1105                 vdev->v_read_priv = read_priv;
1106                 vdev->v_state = VDEV_STATE_HEALTHY;
1107         } else {
1108                 printf("ZFS: inconsistent nvlist contents\n");
1109                 return (EIO);
1110         }
1111
1112         /*
1113          * Re-evaluate top-level vdev state.
1114          */
1115         vdev_set_state(top_vdev);
1116
1117         /*
1118          * Ok, we are happy with the pool so far. Lets find
1119          * the best uberblock and then we can actually access
1120          * the contents of the pool.
1121          */
1122         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1123         up = (const struct uberblock *)upbuf;
1124         for (l = 0; l < VDEV_LABELS; l++) {
1125                 for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
1126                         off = vdev_label_offset(psize, l,
1127                             VDEV_UBERBLOCK_OFFSET(vdev, i));
1128                         BP_ZERO(&bp);
1129                         DVA_SET_OFFSET(&bp.blk_dva[0], off);
1130                         BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1131                         BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1132                         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1133                         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1134                         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1135
1136                         if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1137                                 continue;
1138
1139                         if (up->ub_magic != UBERBLOCK_MAGIC)
1140                                 continue;
1141                         if (up->ub_txg < spa->spa_txg)
1142                                 continue;
1143                         if (up->ub_txg > spa->spa_uberblock.ub_txg ||
1144                             (up->ub_txg == spa->spa_uberblock.ub_txg &&
1145                             up->ub_timestamp >
1146                             spa->spa_uberblock.ub_timestamp)) {
1147                                 spa->spa_uberblock = *up;
1148                         }
1149                 }
1150         }
1151         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1152
1153         vdev->spa = spa;
1154         if (spap != NULL)
1155                 *spap = spa;
1156         return (0);
1157 }
1158
/*
 * Return the base-2 logarithm of n when n has exactly one of its low
 * 32 bits set, or -1 otherwise (including n == 0).
 *
 * The comparison is done in unsigned arithmetic: shifting a signed 1
 * left by 31 overflows int, which is undefined behaviour.
 */
static int
ilog2(int n)
{
        int v;

        for (v = 0; v < 32; v++)
                if ((unsigned int)n == (1U << v))
                        return (v);
        return (-1);
}
1169
1170 static int
1171 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1172 {
1173         blkptr_t gbh_bp;
1174         zio_gbh_phys_t zio_gb;
1175         char *pbuf;
1176         int i;
1177
1178         /* Artificial BP for gang block header. */
1179         gbh_bp = *bp;
1180         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1181         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1182         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1183         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1184         for (i = 0; i < SPA_DVAS_PER_BP; i++)
1185                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1186
1187         /* Read gang header block using the artificial BP. */
1188         if (zio_read(spa, &gbh_bp, &zio_gb))
1189                 return (EIO);
1190
1191         pbuf = buf;
1192         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1193                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1194
1195                 if (BP_IS_HOLE(gbp))
1196                         continue;
1197                 if (zio_read(spa, gbp, pbuf))
1198                         return (EIO);
1199                 pbuf += BP_GET_PSIZE(gbp);
1200         }
1201
1202         if (zio_checksum_verify(spa, bp, buf))
1203                 return (EIO);
1204         return (0);
1205 }
1206
1207 static int
1208 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1209 {
1210         int cpfunc = BP_GET_COMPRESS(bp);
1211         uint64_t align, size;
1212         void *pbuf;
1213         int i, error;
1214
1215         /*
1216          * Process data embedded in block pointer
1217          */
1218         if (BP_IS_EMBEDDED(bp)) {
1219                 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1220
1221                 size = BPE_GET_PSIZE(bp);
1222                 ASSERT(size <= BPE_PAYLOAD_SIZE);
1223
1224                 if (cpfunc != ZIO_COMPRESS_OFF)
1225                         pbuf = zfs_alloc(size);
1226                 else
1227                         pbuf = buf;
1228
1229                 decode_embedded_bp_compressed(bp, pbuf);
1230                 error = 0;
1231
1232                 if (cpfunc != ZIO_COMPRESS_OFF) {
1233                         error = zio_decompress_data(cpfunc, pbuf,
1234                             size, buf, BP_GET_LSIZE(bp));
1235                         zfs_free(pbuf, size);
1236                 }
1237                 if (error != 0)
1238                         printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1239                             error);
1240                 return (error);
1241         }
1242
1243         error = EIO;
1244
1245         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1246                 const dva_t *dva = &bp->blk_dva[i];
1247                 vdev_t *vdev;
1248                 int vdevid;
1249                 off_t offset;
1250
1251                 if (!dva->dva_word[0] && !dva->dva_word[1])
1252                         continue;
1253
1254                 vdevid = DVA_GET_VDEV(dva);
1255                 offset = DVA_GET_OFFSET(dva);
1256                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1257                         if (vdev->v_id == vdevid)
1258                                 break;
1259                 }
1260                 if (!vdev || !vdev->v_read)
1261                         continue;
1262
1263                 size = BP_GET_PSIZE(bp);
1264                 if (vdev->v_read == vdev_raidz_read) {
1265                         align = 1ULL << vdev->v_top->v_ashift;
1266                         if (P2PHASE(size, align) != 0)
1267                                 size = P2ROUNDUP(size, align);
1268                 }
1269                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1270                         pbuf = zfs_alloc(size);
1271                 else
1272                         pbuf = buf;
1273
1274                 if (DVA_GET_GANG(dva))
1275                         error = zio_read_gang(spa, bp, pbuf);
1276                 else
1277                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
1278                 if (error == 0) {
1279                         if (cpfunc != ZIO_COMPRESS_OFF)
1280                                 error = zio_decompress_data(cpfunc, pbuf,
1281                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1282                         else if (size != BP_GET_PSIZE(bp))
1283                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1284                 }
1285                 if (buf != pbuf)
1286                         zfs_free(pbuf, size);
1287                 if (error == 0)
1288                         break;
1289         }
1290         if (error != 0)
1291                 printf("ZFS: i/o error - all block copies unavailable\n");
1292         return (error);
1293 }
1294
1295 static int
1296 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1297 {
1298         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1299         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1300         int nlevels = dnode->dn_nlevels;
1301         int i, rc;
1302
1303         if (bsize > SPA_MAXBLOCKSIZE) {
1304                 printf("ZFS: I/O error - blocks larger than %llu are not "
1305                     "supported\n", SPA_MAXBLOCKSIZE);
1306                 return (EIO);
1307         }
1308
1309         /*
1310          * Note: bsize may not be a power of two here so we need to do an
1311          * actual divide rather than a bitshift.
1312          */
1313         while (buflen > 0) {
1314                 uint64_t bn = offset / bsize;
1315                 int boff = offset % bsize;
1316                 int ibn;
1317                 const blkptr_t *indbp;
1318                 blkptr_t bp;
1319
1320                 if (bn > dnode->dn_maxblkid)
1321                         return (EIO);
1322
1323                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1324                         goto cached;
1325
1326                 indbp = dnode->dn_blkptr;
1327                 for (i = 0; i < nlevels; i++) {
1328                         /*
1329                          * Copy the bp from the indirect array so that
1330                          * we can re-use the scratch buffer for multi-level
1331                          * objects.
1332                          */
1333                         ibn = bn >> ((nlevels - i - 1) * ibshift);
1334                         ibn &= ((1 << ibshift) - 1);
1335                         bp = indbp[ibn];
1336                         if (BP_IS_HOLE(&bp)) {
1337                                 memset(dnode_cache_buf, 0, bsize);
1338                                 break;
1339                         }
1340                         rc = zio_read(spa, &bp, dnode_cache_buf);
1341                         if (rc)
1342                                 return (rc);
1343                         indbp = (const blkptr_t *) dnode_cache_buf;
1344                 }
1345                 dnode_cache_obj = dnode;
1346                 dnode_cache_bn = bn;
1347         cached:
1348
1349                 /*
1350                  * The buffer contains our data block. Copy what we
1351                  * need from it and loop.
1352                  */ 
1353                 i = bsize - boff;
1354                 if (i > buflen) i = buflen;
1355                 memcpy(buf, &dnode_cache_buf[boff], i);
1356                 buf = ((char*) buf) + i;
1357                 offset += i;
1358                 buflen -= i;
1359         }
1360
1361         return (0);
1362 }
1363
1364 /*
1365  * Lookup a value in a microzap directory. Assumes that the zap
1366  * scratch buffer contains the directory contents.
1367  */
1368 static int
1369 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1370 {
1371         const mzap_phys_t *mz;
1372         const mzap_ent_phys_t *mze;
1373         size_t size;
1374         int chunks, i;
1375
1376         /*
1377          * Microzap objects use exactly one block. Read the whole
1378          * thing.
1379          */
1380         size = dnode->dn_datablkszsec * 512;
1381
1382         mz = (const mzap_phys_t *) zap_scratch;
1383         chunks = size / MZAP_ENT_LEN - 1;
1384
1385         for (i = 0; i < chunks; i++) {
1386                 mze = &mz->mz_chunk[i];
1387                 if (!strcmp(mze->mze_name, name)) {
1388                         *value = mze->mze_value;
1389                         return (0);
1390                 }
1391         }
1392
1393         return (ENOENT);
1394 }
1395
1396 /*
1397  * Compare a name with a zap leaf entry. Return non-zero if the name
1398  * matches.
1399  */
1400 static int
1401 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1402 {
1403         size_t namelen;
1404         const zap_leaf_chunk_t *nc;
1405         const char *p;
1406
1407         namelen = zc->l_entry.le_name_numints;
1408                         
1409         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1410         p = name;
1411         while (namelen > 0) {
1412                 size_t len;
1413                 len = namelen;
1414                 if (len > ZAP_LEAF_ARRAY_BYTES)
1415                         len = ZAP_LEAF_ARRAY_BYTES;
1416                 if (memcmp(p, nc->l_array.la_array, len))
1417                         return (0);
1418                 p += len;
1419                 namelen -= len;
1420                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1421         }
1422
1423         return 1;
1424 }
1425
1426 /*
1427  * Extract a uint64_t value from a zap leaf entry.
1428  */
1429 static uint64_t
1430 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1431 {
1432         const zap_leaf_chunk_t *vc;
1433         int i;
1434         uint64_t value;
1435         const uint8_t *p;
1436
1437         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1438         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1439                 value = (value << 8) | p[i];
1440         }
1441
1442         return value;
1443 }
1444
/*
 * Store value at addr as an unsigned integer that is len bytes wide.
 * Widths other than 1, 2, 4 and 8 store nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
        if (len == 1)
                *(uint8_t *)addr = value;
        else if (len == 2)
                *(uint16_t *)addr = value;
        else if (len == 4)
                *(uint32_t *)addr = value;
        else if (len == 8)
                *(uint64_t *)addr = value;
}
1463
1464 /*
1465  * Extract a array from a zap leaf entry.
1466  */
1467 static void
1468 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
1469     uint64_t integer_size, uint64_t num_integers, void *buf)
1470 {
1471         uint64_t array_int_len = zc->l_entry.le_value_intlen;
1472         uint64_t value = 0;
1473         uint64_t *u64 = buf;
1474         char *p = buf;
1475         int len = MIN(zc->l_entry.le_value_numints, num_integers);
1476         int chunk = zc->l_entry.le_value_chunk;
1477         int byten = 0;
1478
1479         if (integer_size == 8 && len == 1) {
1480                 *u64 = fzap_leaf_value(zl, zc);
1481                 return;
1482         }
1483
1484         while (len > 0) {
1485                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
1486                 int i;
1487
1488                 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
1489                 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
1490                         value = (value << 8) | la->la_array[i];
1491                         byten++;
1492                         if (byten == array_int_len) {
1493                                 stv(integer_size, p, value);
1494                                 byten = 0;
1495                                 len--;
1496                                 if (len == 0)
1497                                         return;
1498                                 p += integer_size;
1499                         }
1500                 }
1501                 chunk = la->la_next;
1502         }
1503 }
1504
1505 /*
1506  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1507  * buffer contains the directory header.
1508  */
1509 static int
1510 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1511     uint64_t integer_size, uint64_t num_integers, void *value)
1512 {
1513         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1514         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1515         fat_zap_t z;
1516         uint64_t *ptrtbl;
1517         uint64_t hash;
1518         int rc;
1519
1520         if (zh.zap_magic != ZAP_MAGIC)
1521                 return (EIO);
1522
1523         z.zap_block_shift = ilog2(bsize);
1524         z.zap_phys = (zap_phys_t *) zap_scratch;
1525
1526         /*
1527          * Figure out where the pointer table is and read it in if necessary.
1528          */
1529         if (zh.zap_ptrtbl.zt_blk) {
1530                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1531                                zap_scratch, bsize);
1532                 if (rc)
1533                         return (rc);
1534                 ptrtbl = (uint64_t *) zap_scratch;
1535         } else {
1536                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1537         }
1538
1539         hash = zap_hash(zh.zap_salt, name);
1540
1541         zap_leaf_t zl;
1542         zl.l_bs = z.zap_block_shift;
1543
1544         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1545         zap_leaf_chunk_t *zc;
1546
1547         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1548         if (rc)
1549                 return (rc);
1550
1551         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1552
1553         /*
1554          * Make sure this chunk matches our hash.
1555          */
1556         if (zl.l_phys->l_hdr.lh_prefix_len > 0
1557             && zl.l_phys->l_hdr.lh_prefix
1558             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1559                 return (ENOENT);
1560
1561         /*
1562          * Hash within the chunk to find our entry.
1563          */
1564         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1565         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1566         h = zl.l_phys->l_hash[h];
1567         if (h == 0xffff)
1568                 return (ENOENT);
1569         zc = &ZAP_LEAF_CHUNK(&zl, h);
1570         while (zc->l_entry.le_hash != hash) {
1571                 if (zc->l_entry.le_next == 0xffff) {
1572                         zc = NULL;
1573                         break;
1574                 }
1575                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1576         }
1577         if (fzap_name_equal(&zl, zc, name)) {
1578                 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
1579                     integer_size * num_integers)
1580                         return (E2BIG);
1581                 fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
1582                 return (0);
1583         }
1584
1585         return (ENOENT);
1586 }
1587
1588 /*
1589  * Lookup a name in a zap object and return its value as a uint64_t.
1590  */
1591 static int
1592 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1593     uint64_t integer_size, uint64_t num_integers, void *value)
1594 {
1595         int rc;
1596         uint64_t zap_type;
1597         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1598
1599         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1600         if (rc)
1601                 return (rc);
1602
1603         zap_type = *(uint64_t *) zap_scratch;
1604         if (zap_type == ZBT_MICRO)
1605                 return mzap_lookup(dnode, name, value);
1606         else if (zap_type == ZBT_HEADER) {
1607                 return fzap_lookup(spa, dnode, name, integer_size,
1608                     num_integers, value);
1609         }
1610         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1611         return (EIO);
1612 }
1613
1614 /*
1615  * List a microzap directory. Assumes that the zap scratch buffer contains
1616  * the directory contents.
1617  */
1618 static int
1619 mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1620 {
1621         const mzap_phys_t *mz;
1622         const mzap_ent_phys_t *mze;
1623         size_t size;
1624         int chunks, i, rc;
1625
1626         /*
1627          * Microzap objects use exactly one block. Read the whole
1628          * thing.
1629          */
1630         size = dnode->dn_datablkszsec * 512;
1631         mz = (const mzap_phys_t *) zap_scratch;
1632         chunks = size / MZAP_ENT_LEN - 1;
1633
1634         for (i = 0; i < chunks; i++) {
1635                 mze = &mz->mz_chunk[i];
1636                 if (mze->mze_name[0]) {
1637                         rc = callback(mze->mze_name, mze->mze_value);
1638                         if (rc != 0)
1639                                 return (rc);
1640                 }
1641         }
1642
1643         return (0);
1644 }
1645
1646 /*
1647  * List a fatzap directory. Assumes that the zap scratch buffer contains
1648  * the directory header.
1649  */
1650 static int
1651 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1652 {
1653         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1654         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1655         fat_zap_t z;
1656         int i, j, rc;
1657
1658         if (zh.zap_magic != ZAP_MAGIC)
1659                 return (EIO);
1660
1661         z.zap_block_shift = ilog2(bsize);
1662         z.zap_phys = (zap_phys_t *) zap_scratch;
1663
1664         /*
1665          * This assumes that the leaf blocks start at block 1. The
1666          * documentation isn't exactly clear on this.
1667          */
1668         zap_leaf_t zl;
1669         zl.l_bs = z.zap_block_shift;
1670         for (i = 0; i < zh.zap_num_leafs; i++) {
1671                 off_t off = (i + 1) << zl.l_bs;
1672                 char name[256], *p;
1673                 uint64_t value;
1674
1675                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1676                         return (EIO);
1677
1678                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1679
1680                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1681                         zap_leaf_chunk_t *zc, *nc;
1682                         int namelen;
1683
1684                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1685                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1686                                 continue;
1687                         namelen = zc->l_entry.le_name_numints;
1688                         if (namelen > sizeof(name))
1689                                 namelen = sizeof(name);
1690
1691                         /*
1692                          * Paste the name back together.
1693                          */
1694                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1695                         p = name;
1696                         while (namelen > 0) {
1697                                 int len;
1698                                 len = namelen;
1699                                 if (len > ZAP_LEAF_ARRAY_BYTES)
1700                                         len = ZAP_LEAF_ARRAY_BYTES;
1701                                 memcpy(p, nc->l_array.la_array, len);
1702                                 p += len;
1703                                 namelen -= len;
1704                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1705                         }
1706
1707                         /*
1708                          * Assume the first eight bytes of the value are
1709                          * a uint64_t.
1710                          */
1711                         value = fzap_leaf_value(&zl, zc);
1712
1713                         //printf("%s 0x%jx\n", name, (uintmax_t)value);
1714                         rc = callback((const char *)name, value);
1715                         if (rc != 0)
1716                                 return (rc);
1717                 }
1718         }
1719
1720         return (0);
1721 }
1722
1723 static int zfs_printf(const char *name, uint64_t value __unused)
1724 {
1725
1726         printf("%s\n", name);
1727
1728         return (0);
1729 }
1730
1731 /*
1732  * List a zap directory.
1733  */
1734 static int
1735 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1736 {
1737         uint64_t zap_type;
1738         size_t size = dnode->dn_datablkszsec * 512;
1739
1740         if (dnode_read(spa, dnode, 0, zap_scratch, size))
1741                 return (EIO);
1742
1743         zap_type = *(uint64_t *) zap_scratch;
1744         if (zap_type == ZBT_MICRO)
1745                 return mzap_list(dnode, zfs_printf);
1746         else
1747                 return fzap_list(spa, dnode, zfs_printf);
1748 }
1749
1750 static int
1751 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1752 {
1753         off_t offset;
1754
1755         offset = objnum * sizeof(dnode_phys_t);
1756         return dnode_read(spa, &os->os_meta_dnode, offset,
1757                 dnode, sizeof(dnode_phys_t));
1758 }
1759
1760 static int
1761 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1762 {
1763         const mzap_phys_t *mz;
1764         const mzap_ent_phys_t *mze;
1765         size_t size;
1766         int chunks, i;
1767
1768         /*
1769          * Microzap objects use exactly one block. Read the whole
1770          * thing.
1771          */
1772         size = dnode->dn_datablkszsec * 512;
1773
1774         mz = (const mzap_phys_t *) zap_scratch;
1775         chunks = size / MZAP_ENT_LEN - 1;
1776
1777         for (i = 0; i < chunks; i++) {
1778                 mze = &mz->mz_chunk[i];
1779                 if (value == mze->mze_value) {
1780                         strcpy(name, mze->mze_name);
1781                         return (0);
1782                 }
1783         }
1784
1785         return (ENOENT);
1786 }
1787
1788 static void
1789 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1790 {
1791         size_t namelen;
1792         const zap_leaf_chunk_t *nc;
1793         char *p;
1794
1795         namelen = zc->l_entry.le_name_numints;
1796
1797         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1798         p = name;
1799         while (namelen > 0) {
1800                 size_t len;
1801                 len = namelen;
1802                 if (len > ZAP_LEAF_ARRAY_BYTES)
1803                         len = ZAP_LEAF_ARRAY_BYTES;
1804                 memcpy(p, nc->l_array.la_array, len);
1805                 p += len;
1806                 namelen -= len;
1807                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1808         }
1809
1810         *p = '\0';
1811 }
1812
1813 static int
1814 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1815 {
1816         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1817         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1818         fat_zap_t z;
1819         int i, j;
1820
1821         if (zh.zap_magic != ZAP_MAGIC)
1822                 return (EIO);
1823
1824         z.zap_block_shift = ilog2(bsize);
1825         z.zap_phys = (zap_phys_t *) zap_scratch;
1826
1827         /*
1828          * This assumes that the leaf blocks start at block 1. The
1829          * documentation isn't exactly clear on this.
1830          */
1831         zap_leaf_t zl;
1832         zl.l_bs = z.zap_block_shift;
1833         for (i = 0; i < zh.zap_num_leafs; i++) {
1834                 off_t off = (i + 1) << zl.l_bs;
1835
1836                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1837                         return (EIO);
1838
1839                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1840
1841                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1842                         zap_leaf_chunk_t *zc;
1843
1844                         zc = &ZAP_LEAF_CHUNK(&zl, j);
1845                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1846                                 continue;
1847                         if (zc->l_entry.le_value_intlen != 8 ||
1848                             zc->l_entry.le_value_numints != 1)
1849                                 continue;
1850
1851                         if (fzap_leaf_value(&zl, zc) == value) {
1852                                 fzap_name_copy(&zl, zc, name);
1853                                 return (0);
1854                         }
1855                 }
1856         }
1857
1858         return (ENOENT);
1859 }
1860
1861 static int
1862 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1863 {
1864         int rc;
1865         uint64_t zap_type;
1866         size_t size = dnode->dn_datablkszsec * 512;
1867
1868         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1869         if (rc)
1870                 return (rc);
1871
1872         zap_type = *(uint64_t *) zap_scratch;
1873         if (zap_type == ZBT_MICRO)
1874                 return mzap_rlookup(spa, dnode, name, value);
1875         else
1876                 return fzap_rlookup(spa, dnode, name, value);
1877 }
1878
1879 static int
1880 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1881 {
1882         char name[256];
1883         char component[256];
1884         uint64_t dir_obj, parent_obj, child_dir_zapobj;
1885         dnode_phys_t child_dir_zap, dataset, dir, parent;
1886         dsl_dir_phys_t *dd;
1887         dsl_dataset_phys_t *ds;
1888         char *p;
1889         int len;
1890
1891         p = &name[sizeof(name) - 1];
1892         *p = '\0';
1893
1894         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1895                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1896                 return (EIO);
1897         }
1898         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1899         dir_obj = ds->ds_dir_obj;
1900
1901         for (;;) {
1902                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1903                         return (EIO);
1904                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1905
1906                 /* Actual loop condition. */
1907                 parent_obj  = dd->dd_parent_obj;
1908                 if (parent_obj == 0)
1909                         break;
1910
1911                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1912                         return (EIO);
1913                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1914                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1915                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1916                         return (EIO);
1917                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1918                         return (EIO);
1919
1920                 len = strlen(component);
1921                 p -= len;
1922                 memcpy(p, component, len);
1923                 --p;
1924                 *p = '/';
1925
1926                 /* Actual loop iteration. */
1927                 dir_obj = parent_obj;
1928         }
1929
1930         if (*p != '\0')
1931                 ++p;
1932         strcpy(result, p);
1933
1934         return (0);
1935 }
1936
1937 static int
1938 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1939 {
1940         char element[256];
1941         uint64_t dir_obj, child_dir_zapobj;
1942         dnode_phys_t child_dir_zap, dir;
1943         dsl_dir_phys_t *dd;
1944         const char *p, *q;
1945
1946         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1947                 return (EIO);
1948         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
1949             1, &dir_obj))
1950                 return (EIO);
1951
1952         p = name;
1953         for (;;) {
1954                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1955                         return (EIO);
1956                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1957
1958                 while (*p == '/')
1959                         p++;
1960                 /* Actual loop condition #1. */
1961                 if (*p == '\0')
1962                         break;
1963
1964                 q = strchr(p, '/');
1965                 if (q) {
1966                         memcpy(element, p, q - p);
1967                         element[q - p] = '\0';
1968                         p = q + 1;
1969                 } else {
1970                         strcpy(element, p);
1971                         p += strlen(p);
1972                 }
1973
1974                 child_dir_zapobj = dd->dd_child_dir_zapobj;
1975                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1976                         return (EIO);
1977
1978                 /* Actual loop condition #2. */
1979                 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
1980                     1, &dir_obj) != 0)
1981                         return (ENOENT);
1982         }
1983
1984         *objnum = dd->dd_head_dataset_obj;
1985         return (0);
1986 }
1987
1988 #ifndef BOOT2
1989 static int
1990 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1991 {
1992         uint64_t dir_obj, child_dir_zapobj;
1993         dnode_phys_t child_dir_zap, dir, dataset;
1994         dsl_dataset_phys_t *ds;
1995         dsl_dir_phys_t *dd;
1996
1997         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1998                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1999                 return (EIO);
2000         }
2001         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2002         dir_obj = ds->ds_dir_obj;
2003
2004         if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
2005                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2006                 return (EIO);
2007         }
2008         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2009
2010         child_dir_zapobj = dd->dd_child_dir_zapobj;
2011         if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
2012                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2013                 return (EIO);
2014         }
2015
2016         return (zap_list(spa, &child_dir_zap) != 0);
2017 }
2018
2019 int
2020 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t))
2021 {
2022         uint64_t dir_obj, child_dir_zapobj, zap_type;
2023         dnode_phys_t child_dir_zap, dir, dataset;
2024         dsl_dataset_phys_t *ds;
2025         dsl_dir_phys_t *dd;
2026         int err;
2027
2028         err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
2029         if (err != 0) {
2030                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2031                 return (err);
2032         }
2033         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2034         dir_obj = ds->ds_dir_obj;
2035
2036         err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
2037         if (err != 0) {
2038                 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2039                 return (err);
2040         }
2041         dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2042
2043         child_dir_zapobj = dd->dd_child_dir_zapobj;
2044         err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
2045         if (err != 0) {
2046                 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2047                 return (err);
2048         }
2049
2050         err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
2051         if (err != 0)
2052                 return (err);
2053
2054         zap_type = *(uint64_t *) zap_scratch;
2055         if (zap_type == ZBT_MICRO)
2056                 return mzap_list(&child_dir_zap, callback);
2057         else
2058                 return fzap_list(spa, &child_dir_zap, callback);
2059 }
2060 #endif
2061
2062 /*
2063  * Find the object set given the object number of its dataset object
2064  * and return its details in *objset
2065  */
2066 static int
2067 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2068 {
2069         dnode_phys_t dataset;
2070         dsl_dataset_phys_t *ds;
2071
2072         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2073                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2074                 return (EIO);
2075         }
2076
2077         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2078         if (zio_read(spa, &ds->ds_bp, objset)) {
2079                 printf("ZFS: can't read object set for dataset %ju\n",
2080                     (uintmax_t)objnum);
2081                 return (EIO);
2082         }
2083
2084         return (0);
2085 }
2086
2087 /*
2088  * Find the object set pointed to by the BOOTFS property or the root
2089  * dataset if there is none and return its details in *objset
2090  */
2091 static int
2092 zfs_get_root(const spa_t *spa, uint64_t *objid)
2093 {
2094         dnode_phys_t dir, propdir;
2095         uint64_t props, bootfs, root;
2096
2097         *objid = 0;
2098
2099         /*
2100          * Start with the MOS directory object.
2101          */
2102         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2103                 printf("ZFS: can't read MOS object directory\n");
2104                 return (EIO);
2105         }
2106
2107         /*
2108          * Lookup the pool_props and see if we can find a bootfs.
2109          */
2110         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0
2111              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
2112              && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0
2113              && bootfs != 0)
2114         {
2115                 *objid = bootfs;
2116                 return (0);
2117         }
2118         /*
2119          * Lookup the root dataset directory
2120          */
2121         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root)
2122             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2123                 printf("ZFS: can't find root dsl_dir\n");
2124                 return (EIO);
2125         }
2126
2127         /*
2128          * Use the information from the dataset directory's bonus buffer
2129          * to find the dataset object and from that the object set itself.
2130          */
2131         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
2132         *objid = dd->dd_head_dataset_obj;
2133         return (0);
2134 }
2135
2136 static int
2137 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
2138 {
2139
2140         mount->spa = spa;
2141
2142         /*
2143          * Find the root object set if not explicitly provided
2144          */
2145         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
2146                 printf("ZFS: can't find root filesystem\n");
2147                 return (EIO);
2148         }
2149
2150         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2151                 printf("ZFS: can't open root filesystem\n");
2152                 return (EIO);
2153         }
2154
2155         mount->rootobj = rootobj;
2156
2157         return (0);
2158 }
2159
2160 /*
2161  * callback function for feature name checks.
2162  */
2163 static int
2164 check_feature(const char *name, uint64_t value)
2165 {
2166         int i;
2167
2168         if (value == 0)
2169                 return (0);
2170         if (name[0] == '\0')
2171                 return (0);
2172
2173         for (i = 0; features_for_read[i] != NULL; i++) {
2174                 if (strcmp(name, features_for_read[i]) == 0)
2175                         return (0);
2176         }
2177         printf("ZFS: unsupported feature: %s\n", name);
2178         return (EIO);
2179 }
2180
2181 /*
2182  * Checks whether the MOS features that are active are supported.
2183  */
2184 static int
2185 check_mos_features(const spa_t *spa)
2186 {
2187         dnode_phys_t dir;
2188         uint64_t objnum, zap_type;
2189         size_t size;
2190         int rc;
2191
2192         if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
2193             &dir)) != 0)
2194                 return (rc);
2195         if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
2196             sizeof (objnum), 1, &objnum)) != 0) {
2197                 /*
2198                  * It is older pool without features. As we have already
2199                  * tested the label, just return without raising the error.
2200                  */
2201                 return (0);
2202         }
2203
2204         if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
2205                 return (rc);
2206
2207         if (dir.dn_type != DMU_OTN_ZAP_METADATA)
2208                 return (EIO);
2209
2210         size = dir.dn_datablkszsec * 512;
2211         if (dnode_read(spa, &dir, 0, zap_scratch, size))
2212                 return (EIO);
2213
2214         zap_type = *(uint64_t *) zap_scratch;
2215         if (zap_type == ZBT_MICRO)
2216                 rc = mzap_list(&dir, check_feature);
2217         else
2218                 rc = fzap_list(spa, &dir, check_feature);
2219
2220         return (rc);
2221 }
2222
2223 static int
2224 zfs_spa_init(spa_t *spa)
2225 {
2226         dnode_phys_t dir;
2227         int rc;
2228
2229         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2230                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2231                 return (EIO);
2232         }
2233         if (spa->spa_mos.os_type != DMU_OST_META) {
2234                 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2235                 return (EIO);
2236         }
2237
2238         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2239             &dir)) {
2240                 printf("ZFS: failed to read pool %s directory object\n",
2241                     spa->spa_name);
2242                 return (EIO);
2243         }
2244         /* this is allowed to fail, older pools do not have salt */
2245         rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
2246             sizeof (spa->spa_cksum_salt.zcs_bytes),
2247             spa->spa_cksum_salt.zcs_bytes);
2248
2249         rc = check_mos_features(spa);
2250         if (rc != 0) {
2251                 printf("ZFS: pool %s is not supported\n", spa->spa_name);
2252         }
2253
2254         return (rc);
2255 }
2256
2257 static int
2258 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2259 {
2260
2261         if (dn->dn_bonustype != DMU_OT_SA) {
2262                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2263
2264                 sb->st_mode = zp->zp_mode;
2265                 sb->st_uid = zp->zp_uid;
2266                 sb->st_gid = zp->zp_gid;
2267                 sb->st_size = zp->zp_size;
2268         } else {
2269                 sa_hdr_phys_t *sahdrp;
2270                 int hdrsize;
2271                 size_t size = 0;
2272                 void *buf = NULL;
2273
2274                 if (dn->dn_bonuslen != 0)
2275                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2276                 else {
2277                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2278                                 blkptr_t *bp = DN_SPILL_BLKPTR(dn);
2279                                 int error;
2280
2281                                 size = BP_GET_LSIZE(bp);
2282                                 buf = zfs_alloc(size);
2283                                 error = zio_read(spa, bp, buf);
2284                                 if (error != 0) {
2285                                         zfs_free(buf, size);
2286                                         return (error);
2287                                 }
2288                                 sahdrp = buf;
2289                         } else {
2290                                 return (EIO);
2291                         }
2292                 }
2293                 hdrsize = SA_HDR_SIZE(sahdrp);
2294                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2295                     SA_MODE_OFFSET);
2296                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2297                     SA_UID_OFFSET);
2298                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2299                     SA_GID_OFFSET);
2300                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2301                     SA_SIZE_OFFSET);
2302                 if (buf != NULL)
2303                         zfs_free(buf, size);
2304         }
2305
2306         return (0);
2307 }
2308
2309 static int
2310 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
2311 {
2312         int rc = 0;
2313
2314         if (dn->dn_bonustype == DMU_OT_SA) {
2315                 sa_hdr_phys_t *sahdrp = NULL;
2316                 size_t size = 0;
2317                 void *buf = NULL;
2318                 int hdrsize;
2319                 char *p;
2320
2321                 if (dn->dn_bonuslen != 0)
2322                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2323                 else {
2324                         blkptr_t *bp;
2325
2326                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
2327                                 return (EIO);
2328                         bp = DN_SPILL_BLKPTR(dn);
2329
2330                         size = BP_GET_LSIZE(bp);
2331                         buf = zfs_alloc(size);
2332                         rc = zio_read(spa, bp, buf);
2333                         if (rc != 0) {
2334                                 zfs_free(buf, size);
2335                                 return (rc);
2336                         }
2337                         sahdrp = buf;
2338                 }
2339                 hdrsize = SA_HDR_SIZE(sahdrp);
2340                 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
2341                 memcpy(path, p, psize);
2342                 if (buf != NULL)
2343                         zfs_free(buf, size);
2344                 return (0);
2345         }
2346         /*
2347          * Second test is purely to silence bogus compiler
2348          * warning about accessing past the end of dn_bonus.
2349          */
2350         if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
2351             sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
2352                 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
2353         } else {
2354                 rc = dnode_read(spa, dn, 0, path, psize);
2355         }
2356         return (rc);
2357 }
2358
/*
 * One element of the singly-linked tail queue of object numbers that
 * zfs_lookup() maintains while walking a path.
 */
struct obj_list {
	uint64_t		objnum;	/* object number recorded for this step */
	STAILQ_ENTRY(obj_list)	entry;	/* tail-queue linkage */
};
2363
2364 /*
2365  * Lookup a file and return its dnode.
2366  */
2367 static int
2368 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2369 {
2370         int rc;
2371         uint64_t objnum;
2372         const spa_t *spa;
2373         dnode_phys_t dn;
2374         const char *p, *q;
2375         char element[256];
2376         char path[1024];
2377         int symlinks_followed = 0;
2378         struct stat sb;
2379         struct obj_list *entry, *tentry;
2380         STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
2381
2382         spa = mount->spa;
2383         if (mount->objset.os_type != DMU_OST_ZFS) {
2384                 printf("ZFS: unexpected object set type %ju\n",
2385                     (uintmax_t)mount->objset.os_type);
2386                 return (EIO);
2387         }
2388
2389         if ((entry = malloc(sizeof(struct obj_list))) == NULL)
2390                 return (ENOMEM);
2391
2392         /*
2393          * Get the root directory dnode.
2394          */
2395         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2396         if (rc) {
2397                 free(entry);
2398                 return (rc);
2399         }
2400
2401         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
2402         if (rc) {
2403                 free(entry);
2404                 return (rc);
2405         }
2406         entry->objnum = objnum;
2407         STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2408
2409         rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2410         if (rc != 0)
2411                 goto done;
2412
2413         p = upath;
2414         while (p && *p) {
2415                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2416                 if (rc != 0)
2417                         goto done;
2418
2419                 while (*p == '/')
2420                         p++;
2421                 if (*p == '\0')
2422                         break;
2423                 q = p;
2424                 while (*q != '\0' && *q != '/')
2425                         q++;
2426
2427                 /* skip dot */
2428                 if (p + 1 == q && p[0] == '.') {
2429                         p++;
2430                         continue;
2431                 }
2432                 /* double dot */
2433                 if (p + 2 == q && p[0] == '.' && p[1] == '.') {
2434                         p += 2;
2435                         if (STAILQ_FIRST(&on_cache) ==
2436                             STAILQ_LAST(&on_cache, obj_list, entry)) {
2437                                 rc = ENOENT;
2438                                 goto done;
2439                         }
2440                         entry = STAILQ_FIRST(&on_cache);
2441                         STAILQ_REMOVE_HEAD(&on_cache, entry);
2442                         free(entry);
2443                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
2444                         continue;
2445                 }
2446                 if (q - p + 1 > sizeof(element)) {
2447                         rc = ENAMETOOLONG;
2448                         goto done;
2449                 }
2450                 memcpy(element, p, q - p);
2451                 element[q - p] = 0;
2452                 p = q;
2453
2454                 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
2455                         goto done;
2456                 if (!S_ISDIR(sb.st_mode)) {
2457                         rc = ENOTDIR;
2458                         goto done;
2459                 }
2460
2461                 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
2462                 if (rc)
2463                         goto done;
2464                 objnum = ZFS_DIRENT_OBJ(objnum);
2465
2466                 if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
2467                         rc = ENOMEM;
2468                         goto done;
2469                 }
2470                 entry->objnum = objnum;
2471                 STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2472                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2473                 if (rc)
2474                         goto done;
2475
2476                 /*
2477                  * Check for symlink.
2478                  */
2479                 rc = zfs_dnode_stat(spa, &dn, &sb);
2480                 if (rc)
2481                         goto done;
2482                 if (S_ISLNK(sb.st_mode)) {
2483                         if (symlinks_followed > 10) {
2484                                 rc = EMLINK;
2485                                 goto done;
2486                         }
2487                         symlinks_followed++;
2488
2489                         /*
2490                          * Read the link value and copy the tail of our
2491                          * current path onto the end.
2492                          */
2493                         if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
2494                                 rc = ENAMETOOLONG;
2495                                 goto done;
2496                         }
2497                         strcpy(&path[sb.st_size], p);
2498
2499                         rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
2500                         if (rc != 0)
2501                                 goto done;
2502
2503                         /*
2504                          * Restart with the new path, starting either at
2505                          * the root or at the parent depending whether or
2506                          * not the link is relative.
2507                          */
2508                         p = path;
2509                         if (*p == '/') {
2510                                 while (STAILQ_FIRST(&on_cache) !=
2511                                     STAILQ_LAST(&on_cache, obj_list, entry)) {
2512                                         entry = STAILQ_FIRST(&on_cache);
2513                                         STAILQ_REMOVE_HEAD(&on_cache, entry);
2514                                         free(entry);
2515                                 }
2516                         } else {
2517                                 entry = STAILQ_FIRST(&on_cache);
2518                                 STAILQ_REMOVE_HEAD(&on_cache, entry);
2519                                 free(entry);
2520                         }
2521                         objnum = (STAILQ_FIRST(&on_cache))->objnum;
2522                 }
2523         }
2524
2525         *dnode = dn;
2526 done:
2527         STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
2528                 free(entry);
2529         return (rc);
2530 }