OpenZFS 9235 - rename zpool_rewind_policy_t to zpool_load_policy_t
[FreeBSD/FreeBSD.git] lib/libzfs/libzfs_import.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25  * Copyright 2015 RackTop Systems.
26  * Copyright (c) 2016, Intel Corporation.
27  */
28
29 /*
30  * Pool import support functions.
31  *
32  * To import a pool, we rely on reading the configuration information from the
33  * ZFS label of each device.  If we successfully read the label, then we
34  * organize the configuration information in the following hierarchy:
35  *
36  *      pool guid -> toplevel vdev guid -> label txg
37  *
38  * Duplicate entries matching this same tuple will be discarded.  Once we have
39  * examined every device, we pick the best label txg config for each toplevel
40  * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
41  * update any paths that have changed.  Finally, we attempt to import the pool
42  * using our derived config, and record the results.
43  */
44
45 #include <ctype.h>
46 #include <devid.h>
47 #include <dirent.h>
48 #include <errno.h>
49 #include <libintl.h>
50 #include <libgen.h>
51 #ifdef HAVE_LIBUDEV
52 #include <libudev.h>
53 #include <sched.h>
54 #endif
55 #include <stddef.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <sys/stat.h>
59 #include <unistd.h>
60 #include <fcntl.h>
61 #include <sys/vtoc.h>
62 #include <sys/dktp/fdisk.h>
63 #include <sys/efi_partition.h>
64 #include <thread_pool.h>
65 #include <sys/vdev_impl.h>
66 #include <blkid/blkid.h>
67 #include "libzfs.h"
68 #include "libzfs_impl.h"
69 #include <libzfs.h>
70
71 /*
72  * Intermediate structures used to gather configuration information.
73  */
74 typedef struct config_entry {
75         uint64_t                ce_txg;
76         nvlist_t                *ce_config;
77         struct config_entry     *ce_next;
78 } config_entry_t;
79
80 typedef struct vdev_entry {
81         uint64_t                ve_guid;
82         config_entry_t          *ve_configs;
83         struct vdev_entry       *ve_next;
84 } vdev_entry_t;
85
86 typedef struct pool_entry {
87         uint64_t                pe_guid;
88         vdev_entry_t            *pe_vdevs;
89         struct pool_entry       *pe_next;
90 } pool_entry_t;
91
92 typedef struct name_entry {
93         char                    *ne_name;
94         uint64_t                ne_guid;
95         uint64_t                ne_order;
96         uint64_t                ne_num_labels;
97         struct name_entry       *ne_next;
98 } name_entry_t;
99
100 typedef struct pool_list {
101         pool_entry_t            *pools;
102         name_entry_t            *names;
103 } pool_list_t;
104
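/*
 * Illustrative sketch (the helper name count_label_configs() is
 * hypothetical, not part of the original source): the pool guid ->
 * toplevel vdev guid -> label txg hierarchy described at the top of this
 * file maps directly onto the three nested lists above.  Walking them
 * yields one config_entry_t per distinct label txg seen for a pool.
 */
static int
count_label_configs(const pool_list_t *pl, uint64_t pool_guid)
{
        const pool_entry_t *pe;
        const vdev_entry_t *ve;
        const config_entry_t *ce;
        int count = 0;

        for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
                if (pe->pe_guid != pool_guid)
                        continue;
                /* each toplevel vdev keeps one config per distinct txg */
                for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next)
                        for (ce = ve->ve_configs; ce != NULL;
                            ce = ce->ce_next)
                                count++;
        }
        return (count);
}
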
105 #define DEV_BYID_PATH   "/dev/disk/by-id/"
106
107 /*
108  * Linux persistent device strings for vdev labels
109  *
110  * based on libudev for consistency with libudev disk add/remove events
111  */
112 #ifdef HAVE_LIBUDEV
113
114 typedef struct vdev_dev_strs {
115         char    vds_devid[128];
116         char    vds_devphys[128];
117 } vdev_dev_strs_t;
118
119 /*
120  * Obtain the persistent device id string (describes what)
121  *
122  * used by ZED vdev matching for auto-{online,expand,replace}
123  */
124 int
125 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
126 {
127         struct udev_list_entry *entry;
128         const char *bus;
129         char devbyid[MAXPATHLEN];
130
131         /* The bus based by-id path is preferred */
132         bus = udev_device_get_property_value(dev, "ID_BUS");
133
134         if (bus == NULL) {
135                 const char *dm_uuid;
136
137                 /*
138                  * For multipath nodes use the persistent uuid based identifier
139                  *
140                  * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
141                  */
142                 dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
143                 if (dm_uuid != NULL) {
144                         (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
145                         return (0);
146                 }
147
148                 /*
149                  * NVME 'by-id' symlinks are similar to bus case
150                  */
151                 struct udev_device *parent;
152
153                 parent = udev_device_get_parent_with_subsystem_devtype(dev,
154                     "nvme", NULL);
155                 if (parent != NULL)
156                         bus = "nvme";   /* continue with bus symlink search */
157                 else
158                         return (ENODATA);
159         }
160
161         /*
162          * locate the bus specific by-id link
163          */
164         (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
165         entry = udev_device_get_devlinks_list_entry(dev);
166         while (entry != NULL) {
167                 const char *name;
168
169                 name = udev_list_entry_get_name(entry);
170                 if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
171                         name += strlen(DEV_BYID_PATH);
172                         (void) strlcpy(bufptr, name, buflen);
173                         return (0);
174                 }
175                 entry = udev_list_entry_get_next(entry);
176         }
177
178         return (ENODATA);
179 }
180
181 /*
182  * Obtain the persistent physical location string (describes where)
183  *
184  * used by ZED vdev matching for auto-{online,expand,replace}
185  */
186 int
187 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
188 {
189         const char *physpath = NULL;
190
191         /*
192          * Normal disks use ID_PATH for their physical path.  Device mapper
193          * devices are virtual and don't have a physical path.  For them we
194          * use ID_VDEV instead, which is set up via the /etc/vdev_id.conf file.
195          * ID_VDEV provides a persistent path to a virtual device.  If you
196          * don't have vdev_id.conf set up, you cannot use multipath autoreplace.
197          */
198         if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
199             physpath[0])) {
200                 if (!((physpath =
201                     udev_device_get_property_value(dev, "ID_VDEV")) &&
202                     physpath[0])) {
203                         return (ENODATA);
204                 }
205         }
206
207         (void) strlcpy(bufptr, physpath, buflen);
208
209         return (0);
210 }
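
/*
 * Usage sketch (get_dev_strs_example() is a hypothetical caller, not part
 * of the original source): fill a vdev_dev_strs_t for a block device such
 * as "sda" using the two helpers above.
 */
static int
get_dev_strs_example(struct udev *udev, const char *sysname,
    vdev_dev_strs_t *ds)
{
        struct udev_device *dev;
        int error;

        dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname);
        if (dev == NULL)
                return (ENODEV);

        /* e.g. "scsi-MG03SCA300_350000494a8cb3d67" */
        error = zfs_device_get_devid(dev, ds->vds_devid,
            sizeof (ds->vds_devid));

        /* e.g. "pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0" (optional) */
        if (error == 0 && zfs_device_get_physical(dev, ds->vds_devphys,
            sizeof (ds->vds_devphys)) != 0)
                ds->vds_devphys[0] = '\0';

        udev_device_unref(dev);
        return (error);
}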
211
212 boolean_t
213 udev_is_mpath(struct udev_device *dev)
214 {
215         return udev_device_get_property_value(dev, "DM_UUID") &&
216             udev_device_get_property_value(dev, "MPATH_SBIN_PATH");
217 }
218
219 /*
220  * A disk is considered a multipath whole disk when:
221  *      DEVNAME key value has "dm-"
222  *      DM_NAME key value has "mpath" prefix
223  *      DM_UUID key exists
224  *      ID_PART_TABLE_TYPE key does not exist or is not gpt
225  */
226 static boolean_t
227 udev_mpath_whole_disk(struct udev_device *dev)
228 {
229         const char *devname, *type, *uuid;
230
231         devname = udev_device_get_property_value(dev, "DEVNAME");
232         type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
233         uuid = udev_device_get_property_value(dev, "DM_UUID");
234
235         if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
236             ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
237             (uuid != NULL)) {
238                 return (B_TRUE);
239         }
240
241         return (B_FALSE);
242 }
243
244 /*
245  * Check if a disk is effectively a multipath whole disk
246  */
247 boolean_t
248 is_mpath_whole_disk(const char *path)
249 {
250         struct udev *udev;
251         struct udev_device *dev = NULL;
252         char nodepath[MAXPATHLEN];
253         char *sysname;
254         boolean_t wholedisk = B_FALSE;
255
256         if (realpath(path, nodepath) == NULL)
257                 return (B_FALSE);
258         sysname = strrchr(nodepath, '/') + 1;
259         if (strncmp(sysname, "dm-", 3) != 0)
260                 return (B_FALSE);
261         if ((udev = udev_new()) == NULL)
262                 return (B_FALSE);
263         if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
264             sysname)) == NULL) {
265                 udev_unref(udev);
266                 return (B_FALSE);
267         }
268
269         wholedisk = udev_mpath_whole_disk(dev);
270
271         udev_device_unref(dev);
272         return (wholedisk);
273 }
274
275 static int
276 udev_device_is_ready(struct udev_device *dev)
277 {
278 #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
279         return (udev_device_get_is_initialized(dev));
280 #else
281         /* wait for DEVLINKS property to be initialized */
282         return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
283 #endif
284 }
285
286 /*
287  * Wait up to timeout_ms for udev to set up the device node.  The device is
288  * considered ready when libudev determines it has been initialized, all of
289  * the device links have been verified to exist, and it has been allowed to
290  * settle.  At this point the device can be accessed reliably.
291  * Depending on the complexity of the udev rules this process could take
292  * several seconds.
293  */
294 int
295 zpool_label_disk_wait(char *path, int timeout_ms)
296 {
297         struct udev *udev;
298         struct udev_device *dev = NULL;
299         char nodepath[MAXPATHLEN];
300         char *sysname = NULL;
301         int ret = ENODEV;
302         int settle_ms = 50;
303         long sleep_ms = 10;
304         hrtime_t start, settle;
305
306         if ((udev = udev_new()) == NULL)
307                 return (ENXIO);
308
309         start = gethrtime();
310         settle = 0;
311
312         do {
313                 if (sysname == NULL) {
314                         if (realpath(path, nodepath) != NULL) {
315                                 sysname = strrchr(nodepath, '/') + 1;
316                         } else {
317                                 (void) usleep(sleep_ms * MILLISEC);
318                                 continue;
319                         }
320                 }
321
322                 dev = udev_device_new_from_subsystem_sysname(udev,
323                     "block", sysname);
324                 if ((dev != NULL) && udev_device_is_ready(dev)) {
325                         struct udev_list_entry *links, *link = NULL;
326
327                         ret = 0;
328                         links = udev_device_get_devlinks_list_entry(dev);
329
330                         udev_list_entry_foreach(link, links) {
331                                 struct stat64 statbuf;
332                                 const char *name;
333
334                                 name = udev_list_entry_get_name(link);
335                                 errno = 0;
336                                 if (stat64(name, &statbuf) == 0 && errno == 0)
337                                         continue;
338
339                                 settle = 0;
340                                 ret = ENODEV;
341                                 break;
342                         }
343
344                         if (ret == 0) {
345                                 if (settle == 0) {
346                                         settle = gethrtime();
347                                 } else if (NSEC2MSEC(gethrtime() - settle) >=
348                                     settle_ms) {
349                                         udev_device_unref(dev);
350                                         break;
351                                 }
352                         }
353                 }
354
355                 udev_device_unref(dev);
356                 (void) usleep(sleep_ms * MILLISEC);
357
358         } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
359
360         udev_unref(udev);
361
362         return (ret);
363 }
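
/*
 * Usage sketch (open_after_settle_example() is a hypothetical helper, not
 * part of the original source): after a partition node has been created,
 * e.g. by zpool_label_disk(), wait up to 30 seconds for udev to create
 * and settle the node before opening it.
 */
static int
open_after_settle_example(char *partpath, int *fdp)
{
        int error;

        error = zpool_label_disk_wait(partpath, 30 * 1000);
        if (error != 0)
                return (error);

        if ((*fdp = open(partpath, O_RDWR)) < 0)
                return (errno);

        return (0);
}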
364
365
366 /*
367  * Encode the persistent devices strings
368  * used for the vdev disk label
369  */
370 static int
371 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
372     boolean_t wholedisk)
373 {
374         struct udev *udev;
375         struct udev_device *dev = NULL;
376         char nodepath[MAXPATHLEN];
377         char *sysname;
378         int ret = ENODEV;
379         hrtime_t start;
380
381         if ((udev = udev_new()) == NULL)
382                 return (ENXIO);
383
384         /* resolve path to a runtime device node instance */
385         if (realpath(path, nodepath) == NULL)
386                 goto no_dev;
387
388         sysname = strrchr(nodepath, '/') + 1;
389
390         /*
391          * Wait up to 3 seconds for udev to set up the device node context
392          */
393         start = gethrtime();
394         do {
395                 dev = udev_device_new_from_subsystem_sysname(udev, "block",
396                     sysname);
397                 if (dev == NULL)
398                         goto no_dev;
399                 if (udev_device_is_ready(dev))
400                         break;  /* udev ready */
401
402                 udev_device_unref(dev);
403                 dev = NULL;
404
405                 if (NSEC2MSEC(gethrtime() - start) < 10)
406                         (void) sched_yield();   /* yield/busy wait up to 10ms */
407                 else
408                         (void) usleep(10 * MILLISEC);
409
410         } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
411
412         if (dev == NULL)
413                 goto no_dev;
414
415         /*
416          * Only whole disks require extra device strings
417          */
418         if (!wholedisk && !udev_mpath_whole_disk(dev))
419                 goto no_dev;
420
421         ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
422         if (ret != 0)
423                 goto no_dev_ref;
424
425         /* physical location string (optional) */
426         if (zfs_device_get_physical(dev, ds->vds_devphys,
427             sizeof (ds->vds_devphys)) != 0) {
428                 ds->vds_devphys[0] = '\0'; /* empty string --> not available */
429         }
430
431 no_dev_ref:
432         udev_device_unref(dev);
433 no_dev:
434         udev_unref(udev);
435
436         return (ret);
437 }
438
439 /*
440  * Update a leaf vdev's persistent device strings (Linux only)
441  *
442  * - only applies for a dedicated leaf vdev (aka whole disk)
443  * - updated during pool create|add|attach|import
444  * - used for device matching during auto-{online,expand,replace}
445  * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
446  * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
447  *
448  * single device node example:
449  *      devid:          'scsi-MG03SCA300_350000494a8cb3d67-part1'
450  *      phys_path:      'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
451  *
452  * multipath device node example:
453  *      devid:          'dm-uuid-mpath-35000c5006304de3f'
454  *
455  * We also store the enclosure sysfs path for turning on enclosure LEDs
456  * (if applicable):
457  *      vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
458  */
459 void
460 update_vdev_config_dev_strs(nvlist_t *nv)
461 {
462         vdev_dev_strs_t vds;
463         char *env, *type, *path;
464         uint64_t wholedisk = 0;
465         char *upath, *spath;
466
467         /*
468          * For the benefit of legacy ZFS implementations, allow
469          * for opting out of devid strings in the vdev label.
470          *
471          * example use:
472          *      env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
473          *
474          * explanation:
475          * Older ZFS on Linux implementations had issues when attempting to
476          * display pool config VDEV names if a "devid" NVP value is present
477          * in the pool's config.
478          *
479          * For example, a pool that originated on illumos platform would
480          * have a devid value in the config and "zpool status" would fail
481          * when listing the config.
482          *
483          * A pool can be stripped of any "devid" values on import or
484          * prevented from adding them on zpool create|add by setting
485          * ZFS_VDEV_DEVID_OPT_OUT.
486          */
487         env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
488         if (env && (strtoul(env, NULL, 0) > 0 ||
489             !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
490                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
491                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
492                 return;
493         }
494
495         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
496             strcmp(type, VDEV_TYPE_DISK) != 0) {
497                 return;
498         }
499         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
500                 return;
501         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
502
503         /*
504          * Update device string values in config nvlist
505          */
506         if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
507                 (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
508                 if (vds.vds_devphys[0] != '\0') {
509                         (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
510                             vds.vds_devphys);
511                 }
512
513                 /* Add enclosure sysfs path (if disk is in an enclosure) */
514                 upath = zfs_get_underlying_path(path);
515                 spath = zfs_get_enclosure_sysfs_path(upath);
516                 if (spath)
517                         nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
518                             spath);
519                 else
520                         nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
521
522                 free(upath);
523                 free(spath);
524         } else {
525                 /* clear out any stale entries */
526                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
527                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
528                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
529         }
530 }
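
/*
 * Usage sketch (make_leaf_vdev_example() is hypothetical, not part of the
 * original source): build a minimal leaf disk vdev nvlist and let
 * update_vdev_config_dev_strs() attach the devid/phys_path/enclosure
 * strings when they can be derived from the path.
 */
static nvlist_t *
make_leaf_vdev_example(const char *path)
{
        nvlist_t *nv = fnvlist_alloc();

        fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
        fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, path);
        fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1ULL);

        /* adds ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH if available */
        update_vdev_config_dev_strs(nv);

        return (nv);
}
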
531 #else
532
533 boolean_t
534 is_mpath_whole_disk(const char *path)
535 {
536         return (B_FALSE);
537 }
538
539 /*
540  * Wait up to timeout_ms for udev to set up the device node.  The device is
541  * considered ready when the provided path has been verified to exist and
542  * it has been allowed to settle.  At this point the device can be
543  * accessed reliably.  Depending on the complexity of the udev rules this
544  * process could take several seconds.
545  */
546 int
547 zpool_label_disk_wait(char *path, int timeout_ms)
548 {
549         int settle_ms = 50;
550         long sleep_ms = 10;
551         hrtime_t start, settle;
552         struct stat64 statbuf;
553
554         start = gethrtime();
555         settle = 0;
556
557         do {
558                 errno = 0;
559                 if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
560                         if (settle == 0)
561                                 settle = gethrtime();
562                         else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
563                                 return (0);
564                 } else if (errno != ENOENT) {
565                         return (errno);
566                 }
567
568                 usleep(sleep_ms * MILLISEC);
569         } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
570
571         return (ENODEV);
572 }
573
574 void
575 update_vdev_config_dev_strs(nvlist_t *nv)
576 {
577 }
578
579 #endif /* HAVE_LIBUDEV */
580
581 /*
582  * Go through and fix up any path and/or devid information for the given vdev
583  * configuration.
584  */
585 static int
586 fix_paths(nvlist_t *nv, name_entry_t *names)
587 {
588         nvlist_t **child;
589         uint_t c, children;
590         uint64_t guid;
591         name_entry_t *ne, *best;
592         char *path;
593
594         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
595             &child, &children) == 0) {
596                 for (c = 0; c < children; c++)
597                         if (fix_paths(child[c], names) != 0)
598                                 return (-1);
599                 return (0);
600         }
601
602         /*
603          * This is a leaf (file or disk) vdev.  In either case, go through
604          * the name list and see if we find a matching guid.  If so, replace
605          * the path and see if we can calculate a new devid.
606          *
607          * There may be multiple names associated with a particular guid, in
608          * which case we have overlapping partitions or multiple paths to the
609          * same disk.  In this case we prefer to use the path name which
610          * matches the ZPOOL_CONFIG_PATH.  If no matching entry is found we
611          * use the lowest order device which corresponds to the first match
612          * while traversing the ZPOOL_IMPORT_PATH search path.
613          */
614         verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
615         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
616                 path = NULL;
617
618         best = NULL;
619         for (ne = names; ne != NULL; ne = ne->ne_next) {
620                 if (ne->ne_guid == guid) {
621                         if (path == NULL) {
622                                 best = ne;
623                                 break;
624                         }
625
626                         if ((strlen(path) == strlen(ne->ne_name)) &&
627                             strncmp(path, ne->ne_name, strlen(path)) == 0) {
628                                 best = ne;
629                                 break;
630                         }
631
632                         if (best == NULL) {
633                                 best = ne;
634                                 continue;
635                         }
636
637                         /* Prefer paths with more vdev labels. */
638                         if (ne->ne_num_labels > best->ne_num_labels) {
639                                 best = ne;
640                                 continue;
641                         }
642
643                         /* Prefer paths earlier in the search order. */
644                         if (ne->ne_num_labels == best->ne_num_labels &&
645                             ne->ne_order < best->ne_order) {
646                                 best = ne;
647                                 continue;
648                         }
649                 }
650         }
651
652         if (best == NULL)
653                 return (0);
654
655         if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
656                 return (-1);
657
658         /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
659         update_vdev_config_dev_strs(nv);
660
661         return (0);
662 }
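
/*
 * Illustrative sketch of the preference rules above; the device names and
 * guid are hypothetical and the function is not part of the original
 * source.  Assuming the vdev's current ZPOOL_CONFIG_PATH matches neither
 * entry, fix_paths() keeps the name with more intact labels, so the by-id
 * name wins here despite its higher search order.
 */
static const char *
fix_paths_preference_example(void)
{
        name_entry_t by_id, by_dev;

        by_id.ne_name = "/dev/disk/by-id/scsi-35000c5006304de3f";
        by_id.ne_guid = 0x1234;
        by_id.ne_order = 1;             /* found later in the search path */
        by_id.ne_num_labels = 4;        /* all four labels intact */
        by_id.ne_next = NULL;

        by_dev.ne_name = "/dev/sdb1";
        by_dev.ne_guid = 0x1234;
        by_dev.ne_order = 0;            /* found first */
        by_dev.ne_num_labels = 2;       /* two labels overwritten */
        by_dev.ne_next = &by_id;

        return (by_dev.ne_next->ne_name);       /* the by-id entry */
}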
663
664 /*
665  * Add the given configuration to the list of known devices.
666  */
667 static int
668 add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
669     int order, int num_labels, nvlist_t *config)
670 {
671         uint64_t pool_guid, vdev_guid, top_guid, txg, state;
672         pool_entry_t *pe;
673         vdev_entry_t *ve;
674         config_entry_t *ce;
675         name_entry_t *ne;
676
677         /*
678          * If this is a hot spare not currently in use or level 2 cache
679          * device, add it to the list of names to translate, but don't do
680          * anything else.
681          */
682         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
683             &state) == 0 &&
684             (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
685             nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
686                 if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) {
687                         nvlist_free(config);
688                         return (-1);
689                 }
690
691                 if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
692                         free(ne);
693                         nvlist_free(config);
694                         return (-1);
695                 }
696                 ne->ne_guid = vdev_guid;
697                 ne->ne_order = order;
698                 ne->ne_num_labels = num_labels;
699                 ne->ne_next = pl->names;
700                 pl->names = ne;
701                 nvlist_free(config);
702                 return (0);
703         }
704
705         /*
706          * If we have a valid config but cannot read any of these fields, then
707          * it means we have a half-initialized label.  In vdev_label_init()
708          * we write a label with txg == 0 so that we can identify the device
709          * in case the user refers to the same disk later on.  If we fail to
710          * create the pool, we'll be left with a label in this state
711          * which should not be considered part of a valid pool.
712          */
713         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
714             &pool_guid) != 0 ||
715             nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
716             &vdev_guid) != 0 ||
717             nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
718             &top_guid) != 0 ||
719             nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
720             &txg) != 0 || txg == 0) {
721                 nvlist_free(config);
722                 return (0);
723         }
724
725         /*
726          * First, see if we know about this pool.  If not, then add it to the
727          * list of known pools.
728          */
729         for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
730                 if (pe->pe_guid == pool_guid)
731                         break;
732         }
733
734         if (pe == NULL) {
735                 if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
736                         nvlist_free(config);
737                         return (-1);
738                 }
739                 pe->pe_guid = pool_guid;
740                 pe->pe_next = pl->pools;
741                 pl->pools = pe;
742         }
743
744         /*
745          * Second, see if we know about this toplevel vdev.  Add it if it's
746          * missing.
747          */
748         for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
749                 if (ve->ve_guid == top_guid)
750                         break;
751         }
752
753         if (ve == NULL) {
754                 if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
755                         nvlist_free(config);
756                         return (-1);
757                 }
758                 ve->ve_guid = top_guid;
759                 ve->ve_next = pe->pe_vdevs;
760                 pe->pe_vdevs = ve;
761         }
762
763         /*
764          * Third, see if we have a config with a matching transaction group.  If
765          * so, then we do nothing.  Otherwise, add it to the list of known
766          * configs.
767          */
768         for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
769                 if (ce->ce_txg == txg)
770                         break;
771         }
772
773         if (ce == NULL) {
774                 if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
775                         nvlist_free(config);
776                         return (-1);
777                 }
778                 ce->ce_txg = txg;
779                 ce->ce_config = config;
780                 ce->ce_next = ve->ve_configs;
781                 ve->ve_configs = ce;
782         } else {
783                 nvlist_free(config);
784         }
785
786         /*
787          * At this point we've successfully added our config to the list of
788          * known configs.  The last thing to do is add the vdev guid -> path
789          * mappings so that we can fix up the configuration as necessary before
790          * doing the import.
791          */
792         if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
793                 return (-1);
794
795         if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
796                 free(ne);
797                 return (-1);
798         }
799
800         ne->ne_guid = vdev_guid;
801         ne->ne_order = order;
802         ne->ne_num_labels = num_labels;
803         ne->ne_next = pl->names;
804         pl->names = ne;
805
806         return (0);
807 }
808
809 /*
810  * Returns true if the named pool matches the given GUID.
811  */
812 static int
813 pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid,
814     boolean_t *isactive)
815 {
816         zpool_handle_t *zhp;
817         uint64_t theguid;
818
819         if (zpool_open_silent(hdl, name, &zhp) != 0)
820                 return (-1);
821
822         if (zhp == NULL) {
823                 *isactive = B_FALSE;
824                 return (0);
825         }
826
827         verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID,
828             &theguid) == 0);
829
830         zpool_close(zhp);
831
832         *isactive = (theguid == guid);
833         return (0);
834 }
835
836 static nvlist_t *
837 refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
838 {
839         nvlist_t *nvl;
840         zfs_cmd_t zc = {"\0"};
841         int err, dstbuf_size;
842
843         if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
844                 return (NULL);
845
846         dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4);
847
848         if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) {
849                 zcmd_free_nvlists(&zc);
850                 return (NULL);
851         }
852
853         while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT,
854             &zc)) != 0 && errno == ENOMEM) {
855                 if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
856                         zcmd_free_nvlists(&zc);
857                         return (NULL);
858                 }
859         }
860
861         if (err) {
862                 zcmd_free_nvlists(&zc);
863                 return (NULL);
864         }
865
866         if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) {
867                 zcmd_free_nvlists(&zc);
868                 return (NULL);
869         }
870
871         zcmd_free_nvlists(&zc);
872         return (nvl);
873 }
874
875 /*
876  * Determine if the vdev id is a hole in the namespace.
877  */
878 boolean_t
879 vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
880 {
881         int c;
882
883         for (c = 0; c < holes; c++) {
884
885                 /* Top-level is a hole */
886                 if (hole_array[c] == id)
887                         return (B_TRUE);
888         }
889         return (B_FALSE);
890 }
891
892 /*
893  * Convert our list of pools into the definitive set of configurations.  We
894  * start by picking the best config for each toplevel vdev.  Once that's done,
895  * we assemble the toplevel vdevs into a full config for the pool.  We make a
896  * pass to fix up any incorrect paths, and then add it to the main list to
897  * return to the user.
898  */
899 static nvlist_t *
900 get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
901     nvlist_t *policy)
902 {
903         pool_entry_t *pe;
904         vdev_entry_t *ve;
905         config_entry_t *ce;
906         nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
907         nvlist_t **spares, **l2cache;
908         uint_t i, nspares, nl2cache;
909         boolean_t config_seen;
910         uint64_t best_txg;
911         char *name, *hostname = NULL;
912         uint64_t guid;
913         uint_t children = 0;
914         nvlist_t **child = NULL;
915         uint_t holes;
916         uint64_t *hole_array, max_id;
917         uint_t c;
918         boolean_t isactive;
919         uint64_t hostid;
920         nvlist_t *nvl;
921         boolean_t valid_top_config = B_FALSE;
922
923         if (nvlist_alloc(&ret, 0, 0) != 0)
924                 goto nomem;
925
926         for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
927                 uint64_t id, max_txg = 0;
928
929                 if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
930                         goto nomem;
931                 config_seen = B_FALSE;
932
933                 /*
934                  * Iterate over all toplevel vdevs.  Grab the pool configuration
935                  * from the first one we find, and then go through the rest and
936                  * add them as necessary to the 'vdevs' member of the config.
937                  */
938                 for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
939
940                         /*
941                          * Determine the best configuration for this vdev by
942                          * selecting the config with the latest transaction
943                          * group.
944                          */
945                         best_txg = 0;
946                         for (ce = ve->ve_configs; ce != NULL;
947                             ce = ce->ce_next) {
948
949                                 if (ce->ce_txg > best_txg) {
950                                         tmp = ce->ce_config;
951                                         best_txg = ce->ce_txg;
952                                 }
953                         }
954
955                         /*
956                          * We rely on the fact that the max txg for the
957                          * pool will contain the most up-to-date information
958                          * about the valid top-levels in the vdev namespace.
959                          */
960                         if (best_txg > max_txg) {
961                                 (void) nvlist_remove(config,
962                                     ZPOOL_CONFIG_VDEV_CHILDREN,
963                                     DATA_TYPE_UINT64);
964                                 (void) nvlist_remove(config,
965                                     ZPOOL_CONFIG_HOLE_ARRAY,
966                                     DATA_TYPE_UINT64_ARRAY);
967
968                                 max_txg = best_txg;
969                                 hole_array = NULL;
970                                 holes = 0;
971                                 max_id = 0;
972                                 valid_top_config = B_FALSE;
973
974                                 if (nvlist_lookup_uint64(tmp,
975                                     ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
976                                         verify(nvlist_add_uint64(config,
977                                             ZPOOL_CONFIG_VDEV_CHILDREN,
978                                             max_id) == 0);
979                                         valid_top_config = B_TRUE;
980                                 }
981
982                                 if (nvlist_lookup_uint64_array(tmp,
983                                     ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
984                                     &holes) == 0) {
985                                         verify(nvlist_add_uint64_array(config,
986                                             ZPOOL_CONFIG_HOLE_ARRAY,
987                                             hole_array, holes) == 0);
988                                 }
989                         }
990
991                         if (!config_seen) {
992                                 /*
993                                  * Copy the relevant pieces of data to the pool
994                                  * configuration:
995                                  *
996                                  *      version
997                                  *      pool guid
998                                  *      name
999                                  *      comment (if available)
1000                                  *      pool state
1001                                  *      hostid (if available)
1002                                  *      hostname (if available)
1003                                  */
1004                                 uint64_t state, version;
1005                                 char *comment = NULL;
1006
1007                                 version = fnvlist_lookup_uint64(tmp,
1008                                     ZPOOL_CONFIG_VERSION);
1009                                 fnvlist_add_uint64(config,
1010                                     ZPOOL_CONFIG_VERSION, version);
1011                                 guid = fnvlist_lookup_uint64(tmp,
1012                                     ZPOOL_CONFIG_POOL_GUID);
1013                                 fnvlist_add_uint64(config,
1014                                     ZPOOL_CONFIG_POOL_GUID, guid);
1015                                 name = fnvlist_lookup_string(tmp,
1016                                     ZPOOL_CONFIG_POOL_NAME);
1017                                 fnvlist_add_string(config,
1018                                     ZPOOL_CONFIG_POOL_NAME, name);
1019
1020                                 if (nvlist_lookup_string(tmp,
1021                                     ZPOOL_CONFIG_COMMENT, &comment) == 0)
1022                                         fnvlist_add_string(config,
1023                                             ZPOOL_CONFIG_COMMENT, comment);
1024
1025                                 state = fnvlist_lookup_uint64(tmp,
1026                                     ZPOOL_CONFIG_POOL_STATE);
1027                                 fnvlist_add_uint64(config,
1028                                     ZPOOL_CONFIG_POOL_STATE, state);
1029
1030                                 hostid = 0;
1031                                 if (nvlist_lookup_uint64(tmp,
1032                                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1033                                         fnvlist_add_uint64(config,
1034                                             ZPOOL_CONFIG_HOSTID, hostid);
1035                                         hostname = fnvlist_lookup_string(tmp,
1036                                             ZPOOL_CONFIG_HOSTNAME);
1037                                         fnvlist_add_string(config,
1038                                             ZPOOL_CONFIG_HOSTNAME, hostname);
1039                                 }
1040
1041                                 config_seen = B_TRUE;
1042                         }
1043
1044                         /*
1045                          * Add this top-level vdev to the child array.
1046                          */
1047                         verify(nvlist_lookup_nvlist(tmp,
1048                             ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
1049                         verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
1050                             &id) == 0);
1051
1052                         if (id >= children) {
1053                                 nvlist_t **newchild;
1054
1055                                 newchild = zfs_alloc(hdl, (id + 1) *
1056                                     sizeof (nvlist_t *));
1057                                 if (newchild == NULL)
1058                                         goto nomem;
1059
1060                                 for (c = 0; c < children; c++)
1061                                         newchild[c] = child[c];
1062
1063                                 free(child);
1064                                 child = newchild;
1065                                 children = id + 1;
1066                         }
1067                         if (nvlist_dup(nvtop, &child[id], 0) != 0)
1068                                 goto nomem;
1069
1070                 }
1071
1072                 /*
1073                  * If we have information about all the top-levels then
1074                  * clean up the nvlist which we've constructed. This
1075                  * means removing any extraneous devices that are
1076                  * beyond the valid range or adding devices to the end
1077                  * of our array which appear to be missing.
1078                  */
1079                 if (valid_top_config) {
1080                         if (max_id < children) {
1081                                 for (c = max_id; c < children; c++)
1082                                         nvlist_free(child[c]);
1083                                 children = max_id;
1084                         } else if (max_id > children) {
1085                                 nvlist_t **newchild;
1086
1087                                 newchild = zfs_alloc(hdl, (max_id) *
1088                                     sizeof (nvlist_t *));
1089                                 if (newchild == NULL)
1090                                         goto nomem;
1091
1092                                 for (c = 0; c < children; c++)
1093                                         newchild[c] = child[c];
1094
1095                                 free(child);
1096                                 child = newchild;
1097                                 children = max_id;
1098                         }
1099                 }
1100
1101                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1102                     &guid) == 0);
1103
1104                 /*
1105                  * The vdev namespace may contain holes as a result of
1106                  * device removal. We must add them back into the vdev
1107                  * tree before we process any missing devices.
1108                  */
1109                 if (holes > 0) {
1110                         ASSERT(valid_top_config);
1111
1112                         for (c = 0; c < children; c++) {
1113                                 nvlist_t *holey;
1114
1115                                 if (child[c] != NULL ||
1116                                     !vdev_is_hole(hole_array, holes, c))
1117                                         continue;
1118
1119                                 if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
1120                                     0) != 0)
1121                                         goto nomem;
1122
1123                                 /*
1124                                  * Holes in the namespace are treated as
1125                                  * "hole" top-level vdevs and have a
1126                                  * special flag set on them.
1127                                  */
1128                                 if (nvlist_add_string(holey,
1129                                     ZPOOL_CONFIG_TYPE,
1130                                     VDEV_TYPE_HOLE) != 0 ||
1131                                     nvlist_add_uint64(holey,
1132                                     ZPOOL_CONFIG_ID, c) != 0 ||
1133                                     nvlist_add_uint64(holey,
1134                                     ZPOOL_CONFIG_GUID, 0ULL) != 0) {
1135                                         nvlist_free(holey);
1136                                         goto nomem;
1137                                 }
1138                                 child[c] = holey;
1139                         }
1140                 }
1141
1142                 /*
1143                  * Look for any missing top-level vdevs.  If this is the case,
1144                  * create a faked up 'missing' vdev as a placeholder.  We cannot
1145                  * simply compress the child array, because the kernel performs
1146                  * certain checks to make sure the vdev IDs match their location
1147                  * in the configuration.
1148                  */
1149                 for (c = 0; c < children; c++) {
1150                         if (child[c] == NULL) {
1151                                 nvlist_t *missing;
1152                                 if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
1153                                     0) != 0)
1154                                         goto nomem;
1155                                 if (nvlist_add_string(missing,
1156                                     ZPOOL_CONFIG_TYPE,
1157                                     VDEV_TYPE_MISSING) != 0 ||
1158                                     nvlist_add_uint64(missing,
1159                                     ZPOOL_CONFIG_ID, c) != 0 ||
1160                                     nvlist_add_uint64(missing,
1161                                     ZPOOL_CONFIG_GUID, 0ULL) != 0) {
1162                                         nvlist_free(missing);
1163                                         goto nomem;
1164                                 }
1165                                 child[c] = missing;
1166                         }
1167                 }
1168
1169                 /*
1170                  * Put all of this pool's top-level vdevs into a root vdev.
1171                  */
1172                 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
1173                         goto nomem;
1174                 if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1175                     VDEV_TYPE_ROOT) != 0 ||
1176                     nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
1177                     nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
1178                     nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1179                     child, children) != 0) {
1180                         nvlist_free(nvroot);
1181                         goto nomem;
1182                 }
1183
1184                 for (c = 0; c < children; c++)
1185                         nvlist_free(child[c]);
1186                 free(child);
1187                 children = 0;
1188                 child = NULL;
1189
1190                 /*
1191                  * Go through and fix up any paths and/or devids based on our
1192                  * known list of vdev GUID -> path mappings.
1193                  */
1194                 if (fix_paths(nvroot, pl->names) != 0) {
1195                         nvlist_free(nvroot);
1196                         goto nomem;
1197                 }
1198
1199                 /*
1200                  * Add the root vdev to this pool's configuration.
1201                  */
1202                 if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1203                     nvroot) != 0) {
1204                         nvlist_free(nvroot);
1205                         goto nomem;
1206                 }
1207                 nvlist_free(nvroot);
1208
1209                 /*
1210                  * zdb uses this path to report on active pools that were
1211                  * imported or created using -R.
1212                  */
1213                 if (active_ok)
1214                         goto add_pool;
1215
1216                 /*
1217                  * Determine if this pool is currently active, in which case we
1218                  * can't actually import it.
1219                  */
1220                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1221                     &name) == 0);
1222                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1223                     &guid) == 0);
1224
1225                 if (pool_active(hdl, name, guid, &isactive) != 0)
1226                         goto error;
1227
1228                 if (isactive) {
1229                         nvlist_free(config);
1230                         config = NULL;
1231                         continue;
1232                 }
1233
1234                 if (policy != NULL) {
1235                         if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
1236                             policy) != 0)
1237                                 goto nomem;
1238                 }
1239
1240                 if ((nvl = refresh_config(hdl, config)) == NULL) {
1241                         nvlist_free(config);
1242                         config = NULL;
1243                         continue;
1244                 }
1245
1246                 nvlist_free(config);
1247                 config = nvl;
1248
1249                 /*
1250                  * Go through and update the paths for spares, now that we have
1251                  * them.
1252                  */
1253                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1254                     &nvroot) == 0);
1255                 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1256                     &spares, &nspares) == 0) {
1257                         for (i = 0; i < nspares; i++) {
1258                                 if (fix_paths(spares[i], pl->names) != 0)
1259                                         goto nomem;
1260                         }
1261                 }
1262
1263                 /*
1264                  * Update the paths for l2cache devices.
1265                  */
1266                 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1267                     &l2cache, &nl2cache) == 0) {
1268                         for (i = 0; i < nl2cache; i++) {
1269                                 if (fix_paths(l2cache[i], pl->names) != 0)
1270                                         goto nomem;
1271                         }
1272                 }
1273
1274                 /*
1275                  * Restore the original information read from the actual label.
1276                  */
1277                 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
1278                     DATA_TYPE_UINT64);
1279                 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
1280                     DATA_TYPE_STRING);
1281                 if (hostid != 0) {
1282                         verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
1283                             hostid) == 0);
1284                         verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
1285                             hostname) == 0);
1286                 }
1287
1288 add_pool:
1289                 /*
1290                  * Add this pool to the list of configs.
1291                  */
1292                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1293                     &name) == 0);
1294                 if (nvlist_add_nvlist(ret, name, config) != 0)
1295                         goto nomem;
1296
1297                 nvlist_free(config);
1298                 config = NULL;
1299         }
1300
1301         return (ret);
1302
1303 nomem:
1304         (void) no_memory(hdl);
1305 error:
1306         nvlist_free(config);
1307         nvlist_free(ret);
1308         for (c = 0; c < children; c++)
1309                 nvlist_free(child[c]);
1310         free(child);
1311
1312         return (NULL);
1313 }
1314
1315 /*
1316  * Return the offset of the given label.
1317  */
1318 static uint64_t
1319 label_offset(uint64_t size, int l)
1320 {
1321         ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
1322         return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
1323             0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
1324 }
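
/*
 * Worked example (illustrative, assuming the usual 256K vdev_label_t and
 * VDEV_LABELS == 4): for a device whose aligned size is 1G, the formula
 * above places the four labels at
 *
 *      l = 0:  0
 *      l = 1:  256K
 *      l = 2:  1G - 512K
 *      l = 3:  1G - 256K
 *
 * i.e. two copies at the front of the device and two at the very end.
 */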
1325
1326 /*
1327  * Given a file descriptor, read the label information and return an nvlist
1328  * describing the configuration, if there is one.  The number of valid
1329  * labels found will be returned in num_labels when non-NULL.
1330  */
1331 int
1332 zpool_read_label(int fd, nvlist_t **config, int *num_labels)
1333 {
1334         struct stat64 statbuf;
1335         int l, count = 0;
1336         vdev_label_t *label;
1337         nvlist_t *expected_config = NULL;
1338         uint64_t expected_guid = 0, size;
1339         int error;
1340
1341         *config = NULL;
1342
1343         if (fstat64_blk(fd, &statbuf) == -1)
1344                 return (0);
1345         size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
1346
1347         error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
1348         if (error)
1349                 return (-1);
1350
1351         for (l = 0; l < VDEV_LABELS; l++) {
1352                 uint64_t state, guid, txg;
1353
1354                 if (pread64(fd, label, sizeof (vdev_label_t),
1355                     label_offset(size, l)) != sizeof (vdev_label_t))
1356                         continue;
1357
1358                 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
1359                     sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
1360                         continue;
1361
1362                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
1363                     &guid) != 0 || guid == 0) {
1364                         nvlist_free(*config);
1365                         continue;
1366                 }
1367
1368                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
1369                     &state) != 0 || state > POOL_STATE_L2CACHE) {
1370                         nvlist_free(*config);
1371                         continue;
1372                 }
1373
1374                 if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
1375                     (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
1376                     &txg) != 0 || txg == 0)) {
1377                         nvlist_free(*config);
1378                         continue;
1379                 }
1380
1381                 if (expected_guid) {
1382                         if (expected_guid == guid)
1383                                 count++;
1384
1385                         nvlist_free(*config);
1386                 } else {
1387                         expected_config = *config;
1388                         expected_guid = guid;
1389                         count++;
1390                 }
1391         }
1392
1393         if (num_labels != NULL)
1394                 *num_labels = count;
1395
1396         free(label);
1397         *config = expected_config;
1398
1399         return (0);
1400 }
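
/*
 * Usage sketch (read_one_label_example() is hypothetical, not part of the
 * original source): read the label of a single device node and report how
 * many of its four labels are still intact.
 */
static nvlist_t *
read_one_label_example(const char *devpath, int *num_labels)
{
        nvlist_t *config = NULL;
        int fd;

        if ((fd = open(devpath, O_RDONLY)) < 0)
                return (NULL);

        /* returns 0 even when no label is found; config stays NULL then */
        if (zpool_read_label(fd, &config, num_labels) != 0)
                config = NULL;

        (void) close(fd);
        return (config);
}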
1401
1402 typedef struct rdsk_node {
1403         char *rn_name;                  /* Full path to device */
1404         int rn_order;                   /* Preferred order (low to high) */
1405         int rn_num_labels;              /* Number of valid labels */
1406         uint64_t rn_vdev_guid;          /* Expected vdev guid when set */
1407         libzfs_handle_t *rn_hdl;
1408         nvlist_t *rn_config;            /* Label config */
1409         avl_tree_t *rn_avl;
1410         avl_node_t rn_node;
1411         pthread_mutex_t *rn_lock;
1412         boolean_t rn_labelpaths;
1413 } rdsk_node_t;
1414
1415 /*
1416  * Sorted by vdev guid and full path to allow for multiple entries with
1417  * the same full path name.  This is required because it's possible to
1418  * have multiple block devices with labels that refer to the same
1419  * ZPOOL_CONFIG_PATH yet have different vdev guids.  In this case both
1420  * entries need to be added to the cache.  Scenarios where this can occur
1421  * include overwritten pool labels, devices which are visible from multiple
1422  * hosts and multipath devices.
1423  */
1424 static int
1425 slice_cache_compare(const void *arg1, const void *arg2)
1426 {
1427         const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
1428         const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
1429         uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
1430         uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
1431         int rv;
1432
1433         rv = AVL_CMP(guid1, guid2);
1434         if (rv)
1435                 return (rv);
1436
1437         return (AVL_ISIGN(strcmp(nm1, nm2)));
1438 }
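
/*
 * Illustrative sketch (mirrors how the slice caches below are built): the
 * comparator above is registered when creating an AVL tree of rdsk_node_t
 * entries keyed on (rn_vdev_guid, rn_name).
 *
 *    avl_tree_t cache;
 *
 *    avl_create(&cache, slice_cache_compare, sizeof (rdsk_node_t),
 *        offsetof(rdsk_node_t, rn_node));
 */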
1439
1440 static boolean_t
1441 is_watchdog_dev(char *dev)
1442 {
1443         /* For 'watchdog' dev */
1444         if (strcmp(dev, "watchdog") == 0)
1445                 return (B_TRUE);
1446
1447                 /* For 'watchdog<digit><whatever>' devices */
1448         if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
1449                 return (B_TRUE);
1450
1451         return (B_FALSE);
1452 }
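
/*
 * Illustrative behavior of the check above:
 *
 *    is_watchdog_dev("watchdog")    returns B_TRUE
 *    is_watchdog_dev("watchdog0")   returns B_TRUE
 *    is_watchdog_dev("watchdogs")   returns B_FALSE (no digit after prefix)
 *    is_watchdog_dev("sda")         returns B_FALSE
 */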
1453
1454 static int
1455 label_paths_impl(libzfs_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
1456     uint64_t vdev_guid, char **path, char **devid)
1457 {
1458         nvlist_t **child;
1459         uint_t c, children;
1460         uint64_t guid;
1461         char *val;
1462         int error;
1463
1464         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1465             &child, &children) == 0) {
1466                 for (c = 0; c < children; c++) {
1467                         error  = label_paths_impl(hdl, child[c],
1468                             pool_guid, vdev_guid, path, devid);
1469                         if (error)
1470                                 return (error);
1471                 }
1472                 return (0);
1473         }
1474
1475         if (nvroot == NULL)
1476                 return (0);
1477
1478         error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
1479         if ((error != 0) || (guid != vdev_guid))
1480                 return (0);
1481
1482         error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
1483         if (error == 0)
1484                 *path = val;
1485
1486         error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
1487         if (error == 0)
1488                 *devid = val;
1489
1490         return (0);
1491 }
1492
1493 /*
1494  * Given a disk label, fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
1495  * strings and return them through the path and devid arguments respectively.
1496  * The returned pointers are only valid as long as label remains valid.
1497  */
1498 static int
1499 label_paths(libzfs_handle_t *hdl, nvlist_t *label, char **path, char **devid)
1500 {
1501         nvlist_t *nvroot;
1502         uint64_t pool_guid;
1503         uint64_t vdev_guid;
1504
1505         *path = NULL;
1506         *devid = NULL;
1507
1508         if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1509             nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
1510             nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid))
1511                 return (ENOENT);
1512
1513         return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
1514             devid));
1515 }
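
/*
 * Illustrative sketch of calling label_paths(); 'label' is assumed to be an
 * unpacked label nvlist such as the one returned by zpool_read_label().  The
 * returned strings point into 'label' and must not be used after it is freed.
 *
 *    char *path = NULL, *devid = NULL;
 *
 *    if (label_paths(hdl, label, &path, &devid) == 0) {
 *            ... use path and/or devid while label remains valid ...
 *    }
 */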
1516
1517 static void
1518 zpool_open_func(void *arg)
1519 {
1520         rdsk_node_t *rn = arg;
1521         libzfs_handle_t *hdl = rn->rn_hdl;
1522         struct stat64 statbuf;
1523         nvlist_t *config;
1524         char *bname, *dupname;
1525         uint64_t vdev_guid = 0;
1526         int error;
1527         int num_labels;
1528         int fd;
1529
1530         /*
1531          * Skip devices with well-known prefixes, since opening them can
1532          * have side effects which need to be avoided.
1533          *
1534          * hpet     - High Precision Event Timer
1535          * watchdog - Watchdog must be closed in a special way.
1536          */
1537         dupname = zfs_strdup(hdl, rn->rn_name);
1538         bname = basename(dupname);
1539         error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
1540         free(dupname);
1541         if (error)
1542                 return;
1543
1544         /*
1545          * Ignore failed stats.  We only want regular files and block devices.
1546          */
1547         if (stat64(rn->rn_name, &statbuf) != 0 ||
1548             (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
1549                 return;
1550
1551         /*
1552          * Preferentially open using O_DIRECT to bypass the block device
1553          * cache which may be stale for multipath devices.  An EINVAL errno
1554          * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
1555          */
1556         fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
1557         if ((fd < 0) && (errno == EINVAL))
1558                 fd = open(rn->rn_name, O_RDONLY);
1559
1560         if (fd < 0)
1561                 return;
1562
1563         /*
1564          * This file is too small to hold a zpool
1565          */
1566         if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
1567                 (void) close(fd);
1568                 return;
1569         }
1570
1571         error = zpool_read_label(fd, &config, &num_labels);
1572         if (error != 0) {
1573                 (void) close(fd);
1574                 return;
1575         }
1576
1577         if (num_labels == 0) {
1578                 (void) close(fd);
1579                 nvlist_free(config);
1580                 return;
1581         }
1582
1583         /*
1584          * Check that the vdev is for the expected guid.  Additional entries
1585          * are speculatively added based on the paths stored in the labels.
1586          * Entries with valid paths but incorrect guids must be removed.
1587          */
1588         error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
1589         if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
1590                 (void) close(fd);
1591                 nvlist_free(config);
1592                 return;
1593         }
1594
1595         (void) close(fd);
1596
1597         rn->rn_config = config;
1598         rn->rn_num_labels = num_labels;
1599
1600         /*
1601          * Add additional entries for paths described by this label.
1602          */
1603         if (rn->rn_labelpaths) {
1604                 char *path = NULL;
1605                 char *devid = NULL;
1606                 rdsk_node_t *slice;
1607                 avl_index_t where;
1608                 int error;
1609
1610                 if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
1611                         return;
1612
1613                 /*
1614                  * Allow devlinks to stabilize so all paths are available.
1615                  */
1616                 zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);
1617
1618                 if (path != NULL) {
1619                         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1620                         slice->rn_name = zfs_strdup(hdl, path);
1621                         slice->rn_vdev_guid = vdev_guid;
1622                         slice->rn_avl = rn->rn_avl;
1623                         slice->rn_hdl = hdl;
1624                         slice->rn_order = IMPORT_ORDER_PREFERRED_1;
1625                         slice->rn_labelpaths = B_FALSE;
1626                         pthread_mutex_lock(rn->rn_lock);
1627                         if (avl_find(rn->rn_avl, slice, &where)) {
1628                                 pthread_mutex_unlock(rn->rn_lock);
1629                                 free(slice->rn_name);
1630                                 free(slice);
1631                         } else {
1632                                 avl_insert(rn->rn_avl, slice, where);
1633                                 pthread_mutex_unlock(rn->rn_lock);
1634                                 zpool_open_func(slice);
1635                         }
1636                 }
1637
1638                 if (devid != NULL) {
1639                         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1640                         error = asprintf(&slice->rn_name, "%s%s",
1641                             DEV_BYID_PATH, devid);
1642                         if (error == -1) {
1643                                 free(slice);
1644                                 return;
1645                         }
1646
1647                         slice->rn_vdev_guid = vdev_guid;
1648                         slice->rn_avl = rn->rn_avl;
1649                         slice->rn_hdl = hdl;
1650                         slice->rn_order = IMPORT_ORDER_PREFERRED_2;
1651                         slice->rn_labelpaths = B_FALSE;
1652                         pthread_mutex_lock(rn->rn_lock);
1653                         if (avl_find(rn->rn_avl, slice, &where)) {
1654                                 pthread_mutex_unlock(rn->rn_lock);
1655                                 free(slice->rn_name);
1656                                 free(slice);
1657                         } else {
1658                                 avl_insert(rn->rn_avl, slice, where);
1659                                 pthread_mutex_unlock(rn->rn_lock);
1660                                 zpool_open_func(slice);
1661                         }
1662                 }
1663         }
1664 }
1665
1666 /*
1667  * Given a file descriptor, clear (zero) the label information.  This function
1668  * is used in the appliance stack as part of the ZFS sysevent module and
1669  * to implement the "zpool labelclear" command.
1670  */
1671 int
1672 zpool_clear_label(int fd)
1673 {
1674         struct stat64 statbuf;
1675         int l;
1676         vdev_label_t *label;
1677         uint64_t size;
1678
1679         if (fstat64_blk(fd, &statbuf) == -1)
1680                 return (0);
1681         size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
1682
1683         if ((label = calloc(1, sizeof (vdev_label_t))) == NULL)
1684                 return (-1);
1685
1686         for (l = 0; l < VDEV_LABELS; l++) {
1687                 if (pwrite64(fd, label, sizeof (vdev_label_t),
1688                     label_offset(size, l)) != sizeof (vdev_label_t)) {
1689                         free(label);
1690                         return (-1);
1691                 }
1692         }
1693
1694         free(label);
1695         return (0);
1696 }
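
/*
 * Illustrative sketch of clearing the labels on a device; the path shown is
 * only an example.  The descriptor must be opened for writing.
 *
 *    int fd = open("/dev/sdX", O_RDWR);
 *
 *    if (fd >= 0) {
 *            (void) zpool_clear_label(fd);
 *            (void) close(fd);
 *    }
 */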
1697
1698 static void
1699 zpool_find_import_scan_add_slice(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1700     avl_tree_t *cache, char *path, const char *name, int order)
1701 {
1702         avl_index_t where;
1703         rdsk_node_t *slice;
1704
1705         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1706         if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
1707                 free(slice);
1708                 return;
1709         }
1710         slice->rn_vdev_guid = 0;
1711         slice->rn_lock = lock;
1712         slice->rn_avl = cache;
1713         slice->rn_hdl = hdl;
1714         slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
1715         slice->rn_labelpaths = B_FALSE;
1716
1717         pthread_mutex_lock(lock);
1718         if (avl_find(cache, slice, &where)) {
1719                 free(slice->rn_name);
1720                 free(slice);
1721         } else {
1722                 avl_insert(cache, slice, where);
1723         }
1724         pthread_mutex_unlock(lock);
1725 }
1726
1727 static int
1728 zpool_find_import_scan_dir(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1729     avl_tree_t *cache, char *dir, int order)
1730 {
1731         int error;
1732         char path[MAXPATHLEN];
1733         struct dirent64 *dp;
1734         DIR *dirp;
1735
1736         if (realpath(dir, path) == NULL) {
1737                 error = errno;
1738                 if (error == ENOENT)
1739                         return (0);
1740
1741                 zfs_error_aux(hdl, strerror(error));
1742                 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1743                     TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
1744                 return (error);
1745         }
1746
1747         dirp = opendir(path);
1748         if (dirp == NULL) {
1749                 error = errno;
1750                 zfs_error_aux(hdl, strerror(error));
1751                 (void) zfs_error_fmt(hdl, EZFS_BADPATH,
1752                     dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
1753                 return (error);
1754         }
1755
1756         while ((dp = readdir64(dirp)) != NULL) {
1757                 const char *name = dp->d_name;
1758                 if (name[0] == '.' &&
1759                     (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
1760                         continue;
1761
1762                 zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
1763                     order);
1764         }
1765
1766         (void) closedir(dirp);
1767         return (0);
1768 }
1769
1770 static int
1771 zpool_find_import_scan_path(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1772     avl_tree_t *cache, char *dir, int order)
1773 {
1774         int error = 0;
1775         char path[MAXPATHLEN];
1776         char *d, *b;
1777         char *dpath, *name;
1778
1779         /*
1780          * Separate the directory part and the last component of the
1781          * path. We do this so that we can get the realpath of
1782          * the directory. We don't take the realpath of the
1783          * whole path because if it's a symlink, we want the
1784          * path of the symlink itself, not where it points to.
1785          */
1786         d = zfs_strdup(hdl, dir);
1787         b = zfs_strdup(hdl, dir);
1788         dpath = dirname(d);
1789         name = basename(b);
1790
1791         if (realpath(dpath, path) == NULL) {
1792                 error = errno;
1793                 if (error == ENOENT) {
1794                         error = 0;
1795                         goto out;
1796                 }
1797
1798                 zfs_error_aux(hdl, strerror(error));
1799                 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1800                     TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
1801                 goto out;
1802         }
1803
1804         zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
1805
1806 out:
1807         free(b);
1808         free(d);
1809         return (error);
1810 }
1811
1812 /*
1813  * Scan a list of directories for zfs devices.
1814  */
1815 static int
1816 zpool_find_import_scan(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1817     avl_tree_t **slice_cache, char **dir, int dirs)
1818 {
1819         avl_tree_t *cache;
1820         rdsk_node_t *slice;
1821         void *cookie;
1822         int i, error;
1823
1824         *slice_cache = NULL;
1825         cache = zfs_alloc(hdl, sizeof (avl_tree_t));
1826         avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
1827             offsetof(rdsk_node_t, rn_node));
1828
1829         for (i = 0; i < dirs; i++) {
1830                 struct stat sbuf;
1831
1832                 if (stat(dir[i], &sbuf) != 0) {
1833                         error = errno;
1834                         if (error == ENOENT)
1835                                 continue;
1836
1837                         zfs_error_aux(hdl, strerror(error));
1838                         (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1839                             TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
1840                         goto error;
1841                 }
1842
1843                 /*
1844                  * If dir[i] is a directory, we walk through it and add all
1845                  * the entries to the cache. If it's not a directory, we just
1846                  * add it to the cache.
1847                  */
1848                 if (S_ISDIR(sbuf.st_mode)) {
1849                         if ((error = zpool_find_import_scan_dir(hdl, lock,
1850                             cache, dir[i], i)) != 0)
1851                                 goto error;
1852                 } else {
1853                         if ((error = zpool_find_import_scan_path(hdl, lock,
1854                             cache, dir[i], i)) != 0)
1855                                 goto error;
1856                 }
1857         }
1858
1859         *slice_cache = cache;
1860         return (0);
1861
1862 error:
1863         cookie = NULL;
1864         while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
1865                 free(slice->rn_name);
1866                 free(slice);
1867         }
1868         free(cache);
1869
1870         return (error);
1871 }
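
/*
 * Illustrative sketch of the caller's contract for the returned cache; this
 * mirrors the error-path cleanup above and the consumption done in
 * zpool_find_import_impl() below:
 *
 *    rdsk_node_t *slice;
 *    void *cookie = NULL;
 *
 *    while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
 *            free(slice->rn_name);
 *            free(slice);
 *    }
 *    avl_destroy(cache);
 *    free(cache);
 */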
1872
1873 /*
1874  * Use libblkid to quickly enumerate all known zfs devices.
1875  */
1876 static int
1877 zpool_find_import_blkid(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1878     avl_tree_t **slice_cache)
1879 {
1880         rdsk_node_t *slice;
1881         blkid_cache cache;
1882         blkid_dev_iterate iter;
1883         blkid_dev dev;
1884         avl_index_t where;
1885         int error;
1886
1887         *slice_cache = NULL;
1888
1889         error = blkid_get_cache(&cache, NULL);
1890         if (error != 0)
1891                 return (error);
1892
1893         error = blkid_probe_all_new(cache);
1894         if (error != 0) {
1895                 blkid_put_cache(cache);
1896                 return (error);
1897         }
1898
1899         iter = blkid_dev_iterate_begin(cache);
1900         if (iter == NULL) {
1901                 blkid_put_cache(cache);
1902                 return (EINVAL);
1903         }
1904
1905         error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
1906         if (error != 0) {
1907                 blkid_dev_iterate_end(iter);
1908                 blkid_put_cache(cache);
1909                 return (error);
1910         }
1911
1912         *slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
1913         avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
1914             offsetof(rdsk_node_t, rn_node));
1915
1916         while (blkid_dev_next(iter, &dev) == 0) {
1917                 slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1918                 slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
1919                 slice->rn_vdev_guid = 0;
1920                 slice->rn_lock = lock;
1921                 slice->rn_avl = *slice_cache;
1922                 slice->rn_hdl = hdl;
1923                 slice->rn_labelpaths = B_TRUE;
1924
1925                 error = zfs_path_order(slice->rn_name, &slice->rn_order);
1926                 if (error == 0)
1927                         slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
1928                 else
1929                         slice->rn_order = IMPORT_ORDER_DEFAULT;
1930
1931                 pthread_mutex_lock(lock);
1932                 if (avl_find(*slice_cache, slice, &where)) {
1933                         free(slice->rn_name);
1934                         free(slice);
1935                 } else {
1936                         avl_insert(*slice_cache, slice, where);
1937                 }
1938                 pthread_mutex_unlock(lock);
1939         }
1940
1941         blkid_dev_iterate_end(iter);
1942         blkid_put_cache(cache);
1943
1944         return (0);
1945 }
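
/*
 * The search above is roughly equivalent to what the blkid(8) utility
 * reports for ZFS member devices, for example (illustrative only):
 *
 *    # blkid -t TYPE="zfs_member"
 */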
1946
1947 char *
1948 zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
1949         "/dev/disk/by-vdev",    /* Custom rules, use first if they exist */
1950         "/dev/mapper",          /* Use multipath devices before components */
1951         "/dev/disk/by-partlabel", /* Single unique entry set by user */
1952         "/dev/disk/by-partuuid", /* Generated partition uuid */
1953         "/dev/disk/by-label",   /* Custom persistent labels */
1954         "/dev/disk/by-uuid",    /* Single unique entry and persistent */
1955         "/dev/disk/by-id",      /* May be multiple entries and persistent */
1956         "/dev/disk/by-path",    /* Encodes physical location and persistent */
1957         "/dev"                  /* UNSAFE device names will change */
1958 };
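
/*
 * When a directory scan is requested without explicit search paths, the list
 * above is searched in order.  A caller of "zpool import" can instead
 * restrict the search to a single directory, for example (illustrative only):
 *
 *    # zpool import -d /dev/disk/by-id <pool>
 */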
1959
1960 /*
1961  * Given a list of directories to search, find all pools stored on disk.  This
1962  * includes partial pools which are not available to import.  If no search
1963  * paths are given, libblkid or the default import paths are used instead.
1964  * poolname or guid (but not both) are provided by the caller when trying
1965  * to import a specific pool.
1966  */
1967 static nvlist_t *
1968 zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
1969 {
1970         nvlist_t *ret = NULL;
1971         pool_list_t pools = { 0 };
1972         pool_entry_t *pe, *penext;
1973         vdev_entry_t *ve, *venext;
1974         config_entry_t *ce, *cenext;
1975         name_entry_t *ne, *nenext;
1976         pthread_mutex_t lock;
1977         avl_tree_t *cache;
1978         rdsk_node_t *slice;
1979         void *cookie;
1980         tpool_t *t;
1981
1982         verify(iarg->poolname == NULL || iarg->guid == 0);
1983         pthread_mutex_init(&lock, NULL);
1984
1985         /*
1986          * Locate pool member vdevs using libblkid or by directory scanning.
1987          * On success a newly allocated AVL tree which is populated with an
1988          * entry for each discovered vdev will be returned as the cache.
1989          * It's the caller's responsibility to consume and destroy this tree.
1990          */
1991         if (iarg->scan || iarg->paths != 0) {
1992                 int dirs = iarg->paths;
1993                 char **dir = iarg->path;
1994
1995                 if (dirs == 0) {
1996                         dir = zpool_default_import_path;
1997                         dirs = DEFAULT_IMPORT_PATH_SIZE;
1998                 }
1999
2000                 if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0)
2001                         return (NULL);
2002         } else {
2003                 if (zpool_find_import_blkid(hdl, &lock, &cache) != 0)
2004                         return (NULL);
2005         }
2006
2007         /*
2008          * Create a thread pool to parallelize the process of reading and
2009          * validating labels; a large number of threads can be used due to
2010          * minimal contention.
2011          */
2012         t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
2013         for (slice = avl_first(cache); slice;
2014             (slice = avl_walk(cache, slice, AVL_AFTER)))
2015                 (void) tpool_dispatch(t, zpool_open_func, slice);
2016
2017         tpool_wait(t);
2018         tpool_destroy(t);
2019
2020         /*
2021          * Process the cache, filtering out any entries which are not
2022          * for the specified pool, then adding matching label configs.
2023          */
2024         cookie = NULL;
2025         while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
2026                 if (slice->rn_config != NULL) {
2027                         nvlist_t *config = slice->rn_config;
2028                         boolean_t matched = B_TRUE;
2029                         boolean_t aux = B_FALSE;
2030                         int fd;
2031
2032                         /*
2033                          * Check if it's a spare or l2cache device. If it is,
2034                          * we need to skip the name and guid check since they
2035                          * don't exist on an aux device label.
2036                          */
2037                         if (iarg->poolname != NULL || iarg->guid != 0) {
2038                                 uint64_t state;
2039                                 aux = nvlist_lookup_uint64(config,
2040                                     ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
2041                                     (state == POOL_STATE_SPARE ||
2042                                     state == POOL_STATE_L2CACHE);
2043                         }
2044
2045                         if (iarg->poolname != NULL && !aux) {
2046                                 char *pname;
2047
2048                                 matched = nvlist_lookup_string(config,
2049                                     ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
2050                                     strcmp(iarg->poolname, pname) == 0;
2051                         } else if (iarg->guid != 0 && !aux) {
2052                                 uint64_t this_guid;
2053
2054                                 matched = nvlist_lookup_uint64(config,
2055                                     ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
2056                                     iarg->guid == this_guid;
2057                         }
2058                         if (!matched) {
2059                                 nvlist_free(config);
2060                         } else {
2061                                 /*
2062                                  * Verify all remaining entries can be opened
2063                                  * exclusively. This will prune all underlying
2064                                  * multipath devices which otherwise could
2065                                  * result in the vdev appearing as UNAVAIL.
2066                                  *
2067                                  * Under zdb, this step isn't required and
2068                                  * would prevent a zdb -e of active pools with
2069                                  * no cachefile.
2070                                  */
2071                                 fd = open(slice->rn_name, O_RDONLY | O_EXCL);
2072                                 if (fd >= 0 || iarg->can_be_active) {
2073                                         if (fd >= 0)
2074                                                 close(fd);
2075                                         add_config(hdl, &pools,
2076                                             slice->rn_name, slice->rn_order,
2077                                             slice->rn_num_labels, config);
2078                                 } else {
2079                                         nvlist_free(config);
2080                                 }
2081                         }
2082                 }
2083                 free(slice->rn_name);
2084                 free(slice);
2085         }
2086         avl_destroy(cache);
2087         free(cache);
2088         pthread_mutex_destroy(&lock);
2089
2090         ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
2091
2092         for (pe = pools.pools; pe != NULL; pe = penext) {
2093                 penext = pe->pe_next;
2094                 for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
2095                         venext = ve->ve_next;
2096                         for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
2097                                 cenext = ce->ce_next;
2098                                 nvlist_free(ce->ce_config);
2099                                 free(ce);
2100                         }
2101                         free(ve);
2102                 }
2103                 free(pe);
2104         }
2105
2106         for (ne = pools.names; ne != NULL; ne = nenext) {
2107                 nenext = ne->ne_next;
2108                 free(ne->ne_name);
2109                 free(ne);
2110         }
2111
2112         return (ret);
2113 }
2114
2115 nvlist_t *
2116 zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
2117 {
2118         importargs_t iarg = { 0 };
2119
2120         iarg.paths = argc;
2121         iarg.path = argv;
2122
2123         return (zpool_find_import_impl(hdl, &iarg));
2124 }
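
/*
 * Illustrative sketch; the directory shown is only an example:
 *
 *    char *dirs[] = { "/dev/disk/by-id" };
 *    nvlist_t *pools = zpool_find_import(hdl, 1, dirs);
 *
 *    if (pools != NULL) {
 *            ... iterate the pool configs with nvlist_next_nvpair() ...
 *            nvlist_free(pools);
 *    }
 */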
2125
2126 /*
2127  * Given a cache file, return the contents as a list of importable pools.
2128  * poolname or guid (but not both) are provided by the caller when trying
2129  * to import a specific pool.
2130  */
2131 nvlist_t *
2132 zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
2133     char *poolname, uint64_t guid)
2134 {
2135         char *buf;
2136         int fd;
2137         struct stat64 statbuf;
2138         nvlist_t *raw, *src, *dst;
2139         nvlist_t *pools;
2140         nvpair_t *elem;
2141         char *name;
2142         uint64_t this_guid;
2143         boolean_t active;
2144
2145         verify(poolname == NULL || guid == 0);
2146
2147         if ((fd = open(cachefile, O_RDONLY)) < 0) {
2148                 zfs_error_aux(hdl, "%s", strerror(errno));
2149                 (void) zfs_error(hdl, EZFS_BADCACHE,
2150                     dgettext(TEXT_DOMAIN, "failed to open cache file"));
2151                 return (NULL);
2152         }
2153
2154         if (fstat64(fd, &statbuf) != 0) {
2155                 zfs_error_aux(hdl, "%s", strerror(errno));
2156                 (void) close(fd);
2157                 (void) zfs_error(hdl, EZFS_BADCACHE,
2158                     dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
2159                 return (NULL);
2160         }
2161
2162         if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
2163                 (void) close(fd);
2164                 return (NULL);
2165         }
2166
2167         if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2168                 (void) close(fd);
2169                 free(buf);
2170                 (void) zfs_error(hdl, EZFS_BADCACHE,
2171                     dgettext(TEXT_DOMAIN,
2172                     "failed to read cache file contents"));
2173                 return (NULL);
2174         }
2175
2176         (void) close(fd);
2177
2178         if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
2179                 free(buf);
2180                 (void) zfs_error(hdl, EZFS_BADCACHE,
2181                     dgettext(TEXT_DOMAIN,
2182                     "invalid or corrupt cache file contents"));
2183                 return (NULL);
2184         }
2185
2186         free(buf);
2187
2188         /*
2189          * Go through and get the current state of the pools and refresh their
2190          * state.
2191          */
2192         if (nvlist_alloc(&pools, 0, 0) != 0) {
2193                 (void) no_memory(hdl);
2194                 nvlist_free(raw);
2195                 return (NULL);
2196         }
2197
2198         elem = NULL;
2199         while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
2200                 src = fnvpair_value_nvlist(elem);
2201
2202                 name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
2203                 if (poolname != NULL && strcmp(poolname, name) != 0)
2204                         continue;
2205
2206                 this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
2207                 if (guid != 0 && guid != this_guid)
2208                         continue;
2209
2210                 if (pool_active(hdl, name, this_guid, &active) != 0) {
2211                         nvlist_free(raw);
2212                         nvlist_free(pools);
2213                         return (NULL);
2214                 }
2215
2216                 if (active)
2217                         continue;
2218
2219                 if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
2220                     cachefile) != 0) {
2221                         (void) no_memory(hdl);
2222                         nvlist_free(raw);
2223                         nvlist_free(pools);
2224                         return (NULL);
2225                 }
2226
2227                 if ((dst = refresh_config(hdl, src)) == NULL) {
2228                         nvlist_free(raw);
2229                         nvlist_free(pools);
2230                         return (NULL);
2231                 }
2232
2233                 if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
2234                         (void) no_memory(hdl);
2235                         nvlist_free(dst);
2236                         nvlist_free(raw);
2237                         nvlist_free(pools);
2238                         return (NULL);
2239                 }
2240                 nvlist_free(dst);
2241         }
2242
2243         nvlist_free(raw);
2244         return (pools);
2245 }
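
/*
 * Illustrative sketch; the cache file location shown is the conventional
 * default and is only an example here:
 *
 *    nvlist_t *pools;
 *
 *    pools = zpool_find_import_cached(hdl, "/etc/zfs/zpool.cache", NULL, 0);
 *    if (pools != NULL) {
 *            ... each nvpair is an importable pool config ...
 *            nvlist_free(pools);
 *    }
 */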
2246
2247 static int
2248 name_or_guid_exists(zpool_handle_t *zhp, void *data)
2249 {
2250         importargs_t *import = data;
2251         int found = 0;
2252
2253         if (import->poolname != NULL) {
2254                 char *pool_name;
2255
2256                 verify(nvlist_lookup_string(zhp->zpool_config,
2257                     ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
2258                 if (strcmp(pool_name, import->poolname) == 0)
2259                         found = 1;
2260         } else {
2261                 uint64_t pool_guid;
2262
2263                 verify(nvlist_lookup_uint64(zhp->zpool_config,
2264                     ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
2265                 if (pool_guid == import->guid)
2266                         found = 1;
2267         }
2268
2269         zpool_close(zhp);
2270         return (found);
2271 }
2272
2273 nvlist_t *
2274 zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
2275 {
2276         verify(import->poolname == NULL || import->guid == 0);
2277
2278         if (import->unique)
2279                 import->exists = zpool_iter(hdl, name_or_guid_exists, import);
2280
2281         if (import->cachefile != NULL)
2282                 return (zpool_find_import_cached(hdl, import->cachefile,
2283                     import->poolname, import->guid));
2284
2285         return (zpool_find_import_impl(hdl, import));
2286 }
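
/*
 * Illustrative sketch of searching for a single pool by name; "tank" is an
 * example pool name and only fields used in this file are shown:
 *
 *    importargs_t args = { 0 };
 *    nvlist_t *pools;
 *
 *    args.poolname = "tank";
 *    pools = zpool_search_import(hdl, &args);
 *    if (pools != NULL)
 *            nvlist_free(pools);
 */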
2287
2288 static boolean_t
2289 pool_match(nvlist_t *cfg, char *tgt)
2290 {
2291         uint64_t v, guid = strtoull(tgt, NULL, 0);
2292         char *s;
2293
2294         if (guid != 0) {
2295                 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
2296                         return (v == guid);
2297         } else {
2298                 if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
2299                         return (strcmp(s, tgt) == 0);
2300         }
2301         return (B_FALSE);
2302 }
2303
2304 int
2305 zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp,
2306     importargs_t *args)
2307 {
2308         nvlist_t *pools;
2309         nvlist_t *match = NULL;
2310         nvlist_t *config = NULL;
2311         char *name = NULL, *sepp = NULL;
2312         char sep = '\0';
2313         int count = 0;
2314         char *targetdup = strdup(target);
2315
2316         *configp = NULL;
2317
2318         if ((sepp = strpbrk(targetdup, "/@")) != NULL) {
2319                 sep = *sepp;
2320                 *sepp = '\0';
2321         }
2322
2323         pools = zpool_search_import(hdl, args);
2324
2325         if (pools != NULL) {
2326                 nvpair_t *elem = NULL;
2327                 while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
2328                         VERIFY0(nvpair_value_nvlist(elem, &config));
2329                         if (pool_match(config, targetdup)) {
2330                                 count++;
2331                                 if (match != NULL) {
2332                                         /* multiple matches found */
2333                                         continue;
2334                                 } else {
2335                                         match = config;
2336                                         name = nvpair_name(elem);
2337                                 }
2338                         }
2339                 }
2340         }
2341
2342         if (count == 0) {
2343                 (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2344                     "no pools found"));
2345                 free(targetdup);
2346                 return (ENOENT);
2347         }
2348
2349         if (count > 1) {
2350                 (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2351                     "%d pools found, use pool GUID\n"), count);
2352                 free(targetdup);
2353                 return (EINVAL);
2354         }
2355
2356         *configp = match;
2357         free(targetdup);
2358
2359         return (0);
2360 }
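
/*
 * Illustrative sketch; "tank" is an example target and may also be given as
 * a pool GUID or a dataset path (the portion after '/' or '@' is ignored):
 *
 *    importargs_t args = { 0 };
 *    nvlist_t *config = NULL;
 *
 *    if (zpool_tryimport(hdl, "tank", &config, &args) == 0) {
 *            ... config describes the single matching pool ...
 *    }
 */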
2361
2362 boolean_t
2363 find_guid(nvlist_t *nv, uint64_t guid)
2364 {
2365         uint64_t tmp;
2366         nvlist_t **child;
2367         uint_t c, children;
2368
2369         verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
2370         if (tmp == guid)
2371                 return (B_TRUE);
2372
2373         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2374             &child, &children) == 0) {
2375                 for (c = 0; c < children; c++)
2376                         if (find_guid(child[c], guid))
2377                                 return (B_TRUE);
2378         }
2379
2380         return (B_FALSE);
2381 }
2382
2383 typedef struct aux_cbdata {
2384         const char      *cb_type;
2385         uint64_t        cb_guid;
2386         zpool_handle_t  *cb_zhp;
2387 } aux_cbdata_t;
2388
2389 static int
2390 find_aux(zpool_handle_t *zhp, void *data)
2391 {
2392         aux_cbdata_t *cbp = data;
2393         nvlist_t **list;
2394         uint_t i, count;
2395         uint64_t guid;
2396         nvlist_t *nvroot;
2397
2398         verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
2399             &nvroot) == 0);
2400
2401         if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
2402             &list, &count) == 0) {
2403                 for (i = 0; i < count; i++) {
2404                         verify(nvlist_lookup_uint64(list[i],
2405                             ZPOOL_CONFIG_GUID, &guid) == 0);
2406                         if (guid == cbp->cb_guid) {
2407                                 cbp->cb_zhp = zhp;
2408                                 return (1);
2409                         }
2410                 }
2411         }
2412
2413         zpool_close(zhp);
2414         return (0);
2415 }
2416
2417 /*
2418  * Determines if the pool is in use.  If so, *inuse is set to B_TRUE and the
2419  * state and name of the pool are returned.  The name string is allocated and
2420  * must be freed by the caller.
2421  */
2422 int
2423 zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
2424     boolean_t *inuse)
2425 {
2426         nvlist_t *config;
2427         char *name;
2428         boolean_t ret;
2429         uint64_t guid, vdev_guid;
2430         zpool_handle_t *zhp;
2431         nvlist_t *pool_config;
2432         uint64_t stateval, isspare;
2433         aux_cbdata_t cb = { 0 };
2434         boolean_t isactive;
2435
2436         *inuse = B_FALSE;
2437
2438         if (zpool_read_label(fd, &config, NULL) != 0) {
2439                 (void) no_memory(hdl);
2440                 return (-1);
2441         }
2442
2443         if (config == NULL)
2444                 return (0);
2445
2446         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2447             &stateval) == 0);
2448         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
2449             &vdev_guid) == 0);
2450
2451         if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
2452                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
2453                     &name) == 0);
2454                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
2455                     &guid) == 0);
2456         }
2457
2458         switch (stateval) {
2459         case POOL_STATE_EXPORTED:
2460                 /*
2461                  * A pool with an exported state may in fact be imported
2462                  * read-only, so check the in-core state to see if it's
2463                  * active and imported read-only.  If it is, set
2464                  * its state to active.
2465                  */
2466                 if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
2467                     (zhp = zpool_open_canfail(hdl, name)) != NULL) {
2468                         if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
2469                                 stateval = POOL_STATE_ACTIVE;
2470
2471                         /*
2472                          * All we needed the zpool handle for is the
2473                          * readonly prop check.
2474                          */
2475                         zpool_close(zhp);
2476                 }
2477
2478                 ret = B_TRUE;
2479                 break;
2480
2481         case POOL_STATE_ACTIVE:
2482                 /*
2483                  * For an active pool, we have to determine if it's really part
2484                  * of a currently active pool (in which case the pool will exist
2485                  * and the guid will be the same), or whether it's part of an
2486                  * active pool that was disconnected without being explicitly
2487                  * exported.
2488                  */
2489                 if (pool_active(hdl, name, guid, &isactive) != 0) {
2490                         nvlist_free(config);
2491                         return (-1);
2492                 }
2493
2494                 if (isactive) {
2495                         /*
2496                          * Because the device may have been removed while
2497                          * offlined, we only report it as active if the vdev is
2498                          * still present in the config.  Otherwise, pretend like
2499                          * it's not in use.
2500                          */
2501                         if ((zhp = zpool_open_canfail(hdl, name)) != NULL &&
2502                             (pool_config = zpool_get_config(zhp, NULL))
2503                             != NULL) {
2504                                 nvlist_t *nvroot;
2505
2506                                 verify(nvlist_lookup_nvlist(pool_config,
2507                                     ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2508                                 ret = find_guid(nvroot, vdev_guid);
2509                         } else {
2510                                 ret = B_FALSE;
2511                         }
2512
2513                         /*
2514                          * If this is an active spare within another pool, we
2515                          * treat it like an unused hot spare.  This allows the
2516                          * user to create a pool with a hot spare that is currently
2517                          * in use within another pool.  Since we return B_TRUE,
2518                          * libdiskmgt will continue to prevent generic consumers
2519                          * from using the device.
2520                          */
2521                         if (ret && nvlist_lookup_uint64(config,
2522                             ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
2523                                 stateval = POOL_STATE_SPARE;
2524
2525                         if (zhp != NULL)
2526                                 zpool_close(zhp);
2527                 } else {
2528                         stateval = POOL_STATE_POTENTIALLY_ACTIVE;
2529                         ret = B_TRUE;
2530                 }
2531                 break;
2532
2533         case POOL_STATE_SPARE:
2534                 /*
2535                  * For a hot spare, it can be either definitively in use, or
2536                  * potentially active.  To determine if it's in use, we iterate
2537                  * over all pools in the system and search for one with a spare
2538                  * with a matching guid.
2539                  *
2540                  * Due to the shared nature of spares, we don't actually report
2541                  * the potentially active case as in use.  This means the user
2542                  * can freely create pools on the hot spares of exported pools,
2543                  * but to do otherwise makes the resulting code complicated, and
2544                  * we end up having to deal with this case anyway.
2545                  */
2546                 cb.cb_zhp = NULL;
2547                 cb.cb_guid = vdev_guid;
2548                 cb.cb_type = ZPOOL_CONFIG_SPARES;
2549                 if (zpool_iter(hdl, find_aux, &cb) == 1) {
2550                         name = (char *)zpool_get_name(cb.cb_zhp);
2551                         ret = B_TRUE;
2552                 } else {
2553                         ret = B_FALSE;
2554                 }
2555                 break;
2556
2557         case POOL_STATE_L2CACHE:
2558
2559                 /*
2560                  * Check if any pool is currently using this l2cache device.
2561                  */
2562                 cb.cb_zhp = NULL;
2563                 cb.cb_guid = vdev_guid;
2564                 cb.cb_type = ZPOOL_CONFIG_L2CACHE;
2565                 if (zpool_iter(hdl, find_aux, &cb) == 1) {
2566                         name = (char *)zpool_get_name(cb.cb_zhp);
2567                         ret = B_TRUE;
2568                 } else {
2569                         ret = B_FALSE;
2570                 }
2571                 break;
2572
2573         default:
2574                 ret = B_FALSE;
2575         }
2576
2577
2578         if (ret) {
2579                 if ((*namestr = zfs_strdup(hdl, name)) == NULL) {
2580                         if (cb.cb_zhp)
2581                                 zpool_close(cb.cb_zhp);
2582                         nvlist_free(config);
2583                         return (-1);
2584                 }
2585                 *state = (pool_state_t)stateval;
2586         }
2587
2588         if (cb.cb_zhp)
2589                 zpool_close(cb.cb_zhp);
2590
2591         nvlist_free(config);
2592         *inuse = ret;
2593         return (0);
2594 }
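
/*
 * Illustrative sketch of checking a device before reuse; 'fd' is assumed to
 * be an open descriptor for the device being examined:
 *
 *    pool_state_t state;
 *    boolean_t inuse;
 *    char *name;
 *
 *    if (zpool_in_use(hdl, fd, &state, &name, &inuse) == 0 && inuse) {
 *            ... report 'name' and 'state', then free(name) ...
 *    }
 */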