1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25  * Copyright 2015 RackTop Systems.
26  * Copyright (c) 2016, Intel Corporation.
27  */
28
29 /*
30  * Pool import support functions.
31  *
32  * To import a pool, we rely on reading the configuration information from the
33  * ZFS label of each device.  If we successfully read the label, then we
34  * organize the configuration information in the following hierarchy:
35  *
36  *      pool guid -> toplevel vdev guid -> label txg
37  *
38  * Duplicate entries matching this same tuple will be discarded.  Once we have
39  * examined every device, we pick the best label txg config for each toplevel
40  * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
41  * update any paths that have changed.  Finally, we attempt to import the pool
42  * using our derived config, and record the results.
43  */
44
45 #include <ctype.h>
46 #include <devid.h>
47 #include <dirent.h>
48 #include <errno.h>
49 #include <libintl.h>
50 #include <libgen.h>
51 #ifdef HAVE_LIBUDEV
52 #include <libudev.h>
53 #include <sched.h>
54 #endif
55 #include <stddef.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <sys/stat.h>
59 #include <unistd.h>
60 #include <fcntl.h>
61 #include <sys/vtoc.h>
62 #include <sys/dktp/fdisk.h>
63 #include <sys/efi_partition.h>
64 #include <thread_pool.h>
65 #include <sys/vdev_impl.h>
66 #include <blkid/blkid.h>
67 #include "libzfs.h"
68 #include "libzfs_impl.h"
69 #include <libzfs.h>
70
71 /*
72  * Intermediate structures used to gather configuration information.
73  */
74 typedef struct config_entry {
75         uint64_t                ce_txg;
76         nvlist_t                *ce_config;
77         struct config_entry     *ce_next;
78 } config_entry_t;
79
80 typedef struct vdev_entry {
81         uint64_t                ve_guid;
82         config_entry_t          *ve_configs;
83         struct vdev_entry       *ve_next;
84 } vdev_entry_t;
85
86 typedef struct pool_entry {
87         uint64_t                pe_guid;
88         vdev_entry_t            *pe_vdevs;
89         struct pool_entry       *pe_next;
90 } pool_entry_t;
91
92 typedef struct name_entry {
93         char                    *ne_name;
94         uint64_t                ne_guid;
95         uint64_t                ne_order;
96         uint64_t                ne_num_labels;
97         struct name_entry       *ne_next;
98 } name_entry_t;
99
100 typedef struct pool_list {
101         pool_entry_t            *pools;
102         name_entry_t            *names;
103 } pool_list_t;
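
/*
 * Illustrative sketch (not part of the library): a populated pool_list_t
 * mirrors the "pool guid -> toplevel vdev guid -> label txg" hierarchy
 * described at the top of this file and can be walked as nested lists
 * ('pl' is an assumed pool_list_t pointer):
 *
 *	pool_entry_t *pe;
 *	vdev_entry_t *ve;
 *	config_entry_t *ce;
 *
 *	for (pe = pl->pools; pe != NULL; pe = pe->pe_next)
 *		for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next)
 *			for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next)
 *				(void) printf("pool %llu vdev %llu txg %llu\n",
 *				    (u_longlong_t)pe->pe_guid,
 *				    (u_longlong_t)ve->ve_guid,
 *				    (u_longlong_t)ce->ce_txg);
 */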
104
105 #define DEV_BYID_PATH   "/dev/disk/by-id/"
106
107 /*
108  * Linux persistent device strings for vdev labels
109  *
110  * based on libudev for consistency with libudev disk add/remove events
111  */
112 #ifdef HAVE_LIBUDEV
113
114 typedef struct vdev_dev_strs {
115         char    vds_devid[128];
116         char    vds_devphys[128];
117 } vdev_dev_strs_t;
118
119 /*
120  * Obtain the persistent device id string (describes what)
121  *
122  * used by ZED vdev matching for auto-{online,expand,replace}
123  */
124 int
125 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
126 {
127         struct udev_list_entry *entry;
128         const char *bus;
129         char devbyid[MAXPATHLEN];
130
131         /* The bus based by-id path is preferred */
132         bus = udev_device_get_property_value(dev, "ID_BUS");
133
134         if (bus == NULL) {
135                 const char *dm_uuid;
136
137                 /*
138                  * For multipath nodes use the persistent uuid based identifier
139                  *
140                  * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
141                  */
142                 dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
143                 if (dm_uuid != NULL) {
144                         (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
145                         return (0);
146                 }
147
148                 /*
149                  * NVME 'by-id' symlinks are similar to bus case
150                  */
151                 struct udev_device *parent;
152
153                 parent = udev_device_get_parent_with_subsystem_devtype(dev,
154                     "nvme", NULL);
155                 if (parent != NULL)
156                         bus = "nvme";   /* continue with bus symlink search */
157                 else
158                         return (ENODATA);
159         }
160
161         /*
162          * locate the bus specific by-id link
163          */
164         (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
165         entry = udev_device_get_devlinks_list_entry(dev);
166         while (entry != NULL) {
167                 const char *name;
168
169                 name = udev_list_entry_get_name(entry);
170                 if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
171                         name += strlen(DEV_BYID_PATH);
172                         (void) strlcpy(bufptr, name, buflen);
173                         return (0);
174                 }
175                 entry = udev_list_entry_get_next(entry);
176         }
177
178         return (ENODATA);
179 }
180
181 /*
182  * Obtain the persistent physical location string (describes where)
183  *
184  * used by ZED vdev matching for auto-{online,expand,replace}
185  */
186 int
187 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
188 {
189         const char *physpath = NULL;
190
191         /*
192          * Normal disks use ID_PATH for their physical path.  Device mapper
193          * devices are virtual and don't have a physical path.  For them we
194  * use ID_VDEV instead, which is set up via the /etc/vdev_id.conf file.
195  * ID_VDEV provides a persistent path to a virtual device.  If you
196  * don't have vdev_id.conf set up, you cannot use multipath autoreplace.
197          */
198         if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
199             physpath[0])) {
200                 if (!((physpath =
201                     udev_device_get_property_value(dev, "ID_VDEV")) &&
202                     physpath[0])) {
203                         return (ENODATA);
204                 }
205         }
206
207         (void) strlcpy(bufptr, physpath, buflen);
208
209         return (0);
210 }
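
/*
 * Illustrative usage sketch (not part of the library): both helpers above
 * fill caller-provided buffers from an open udev device.  The sysname "sda"
 * is a hypothetical example.
 *
 *	struct udev *udev = udev_new();
 *	struct udev_device *dev;
 *	char devid[128], physpath[128];
 *
 *	dev = udev_device_new_from_subsystem_sysname(udev, "block", "sda");
 *	if (dev != NULL) {
 *		if (zfs_device_get_devid(dev, devid, sizeof (devid)) == 0)
 *			(void) printf("devid: %s\n", devid);
 *		if (zfs_device_get_physical(dev, physpath,
 *		    sizeof (physpath)) == 0)
 *			(void) printf("phys_path: %s\n", physpath);
 *		udev_device_unref(dev);
 *	}
 *	udev_unref(udev);
 */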
211
212 boolean_t
213 udev_is_mpath(struct udev_device *dev)
214 {
215         return (udev_device_get_property_value(dev, "DM_UUID") &&
216             udev_device_get_property_value(dev, "MPATH_SBIN_PATH"));
217 }
218
219 /*
220  * A disk is considered a multipath whole disk when:
221  *      DEVNAME key value has "dm-"
222  *      DM_NAME key value has "mpath" prefix
223  *      DM_UUID key exists
224  *      ID_PART_TABLE_TYPE key does not exist or is not gpt
225  */
226 static boolean_t
227 udev_mpath_whole_disk(struct udev_device *dev)
228 {
229         const char *devname, *type, *uuid;
230
231         devname = udev_device_get_property_value(dev, "DEVNAME");
232         type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
233         uuid = udev_device_get_property_value(dev, "DM_UUID");
234
235         if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
236             ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
237             (uuid != NULL)) {
238                 return (B_TRUE);
239         }
240
241         return (B_FALSE);
242 }
243
244 /*
245  * Check if a disk is effectively a multipath whole disk
246  */
247 boolean_t
248 is_mpath_whole_disk(const char *path)
249 {
250         struct udev *udev;
251         struct udev_device *dev = NULL;
252         char nodepath[MAXPATHLEN];
253         char *sysname;
254         boolean_t wholedisk = B_FALSE;
255
256         if (realpath(path, nodepath) == NULL)
257                 return (B_FALSE);
258         sysname = strrchr(nodepath, '/') + 1;
259         if (strncmp(sysname, "dm-", 3) != 0)
260                 return (B_FALSE);
261         if ((udev = udev_new()) == NULL)
262                 return (B_FALSE);
263         if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
264             sysname)) == NULL) {
265                 udev_unref(udev);
266                 return (B_FALSE);
267         }
268
269         wholedisk = udev_mpath_whole_disk(dev);
270
271         udev_device_unref(dev);
272         return (wholedisk);
273 }
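
/*
 * Illustrative usage sketch (the device path below is a hypothetical
 * example):
 *
 *	if (is_mpath_whole_disk("/dev/mapper/mpatha"))
 *		(void) printf("treating device as a multipath whole disk\n");
 */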
274
275 static int
276 udev_device_is_ready(struct udev_device *dev)
277 {
278 #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
279         return (udev_device_get_is_initialized(dev));
280 #else
281         /* wait for DEVLINKS property to be initialized */
282         return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
283 #endif
284 }
285
286 /*
287  * Wait up to timeout_ms for udev to set up the device node.  The device is
288  * considered ready when libudev determines it has been initialized, all of
289  * the device links have been verified to exist, and it has been allowed to
290  * settle.  At this point the device can be accessed reliably.
291  * Depending on the complexity of the udev rules this process could take
292  * several seconds.
293  */
294 int
295 zpool_label_disk_wait(char *path, int timeout_ms)
296 {
297         struct udev *udev;
298         struct udev_device *dev = NULL;
299         char nodepath[MAXPATHLEN];
300         char *sysname = NULL;
301         int ret = ENODEV;
302         int settle_ms = 50;
303         long sleep_ms = 10;
304         hrtime_t start, settle;
305
306         if ((udev = udev_new()) == NULL)
307                 return (ENXIO);
308
309         start = gethrtime();
310         settle = 0;
311
312         do {
313                 if (sysname == NULL) {
314                         if (realpath(path, nodepath) != NULL) {
315                                 sysname = strrchr(nodepath, '/') + 1;
316                         } else {
317                                 (void) usleep(sleep_ms * MILLISEC);
318                                 continue;
319                         }
320                 }
321
322                 dev = udev_device_new_from_subsystem_sysname(udev,
323                     "block", sysname);
324                 if ((dev != NULL) && udev_device_is_ready(dev)) {
325                         struct udev_list_entry *links, *link = NULL;
326
327                         ret = 0;
328                         links = udev_device_get_devlinks_list_entry(dev);
329
330                         udev_list_entry_foreach(link, links) {
331                                 struct stat64 statbuf;
332                                 const char *name;
333
334                                 name = udev_list_entry_get_name(link);
335                                 errno = 0;
336                                 if (stat64(name, &statbuf) == 0 && errno == 0)
337                                         continue;
338
339                                 settle = 0;
340                                 ret = ENODEV;
341                                 break;
342                         }
343
344                         if (ret == 0) {
345                                 if (settle == 0) {
346                                         settle = gethrtime();
347                                 } else if (NSEC2MSEC(gethrtime() - settle) >=
348                                     settle_ms) {
349                                         udev_device_unref(dev);
350                                         break;
351                                 }
352                         }
353                 }
354
355                 udev_device_unref(dev);
356                 (void) usleep(sleep_ms * MILLISEC);
357
358         } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
359
360         udev_unref(udev);
361
362         return (ret);
363 }
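
/*
 * Illustrative usage sketch (not part of the library); the partition path is
 * a hypothetical example.  After labeling a disk, wait up to five seconds
 * for udev to create and settle the expected device node before opening it:
 *
 *	char path[] = "/dev/disk/by-id/scsi-SATA_disk1-part1";
 *
 *	if (zpool_label_disk_wait(path, 5000) == 0) {
 *		int fd = open(path, O_RDONLY);
 *		if (fd >= 0)
 *			(void) close(fd);
 *	}
 */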
364
365
366 /*
367  * Encode the persistent device strings
368  * used for the vdev disk label
369  */
370 static int
371 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
372     boolean_t wholedisk)
373 {
374         struct udev *udev;
375         struct udev_device *dev = NULL;
376         char nodepath[MAXPATHLEN];
377         char *sysname;
378         int ret = ENODEV;
379         hrtime_t start;
380
381         if ((udev = udev_new()) == NULL)
382                 return (ENXIO);
383
384         /* resolve path to a runtime device node instance */
385         if (realpath(path, nodepath) == NULL)
386                 goto no_dev;
387
388         sysname = strrchr(nodepath, '/') + 1;
389
390         /*
391          * Wait up to 3 seconds for udev to set up the device node context
392          */
393         start = gethrtime();
394         do {
395                 dev = udev_device_new_from_subsystem_sysname(udev, "block",
396                     sysname);
397                 if (dev == NULL)
398                         goto no_dev;
399                 if (udev_device_is_ready(dev))
400                         break;  /* udev ready */
401
402                 udev_device_unref(dev);
403                 dev = NULL;
404
405                 if (NSEC2MSEC(gethrtime() - start) < 10)
406                         (void) sched_yield();   /* yield/busy wait up to 10ms */
407                 else
408                         (void) usleep(10 * MILLISEC);
409
410         } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
411
412         if (dev == NULL)
413                 goto no_dev;
414
415         /*
416          * Only whole disks require extra device strings
417          */
418         if (!wholedisk && !udev_mpath_whole_disk(dev))
419                 goto no_dev;
420
421         ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
422         if (ret != 0)
423                 goto no_dev_ref;
424
425         /* physical location string (optional) */
426         if (zfs_device_get_physical(dev, ds->vds_devphys,
427             sizeof (ds->vds_devphys)) != 0) {
428                 ds->vds_devphys[0] = '\0'; /* empty string --> not available */
429         }
430
431 no_dev_ref:
432         udev_device_unref(dev);
433 no_dev:
434         udev_unref(udev);
435
436         return (ret);
437 }
438
439 /*
440  * Update a leaf vdev's persistent device strings (Linux only)
441  *
442  * - only applies for a dedicated leaf vdev (aka whole disk)
443  * - updated during pool create|add|attach|import
444  * - used for device matching during auto-{online,expand,replace}
445  * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
446  * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
447  *
448  * single device node example:
449  *      devid:          'scsi-MG03SCA300_350000494a8cb3d67-part1'
450  *      phys_path:      'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
451  *
452  * multipath device node example:
453  *      devid:          'dm-uuid-mpath-35000c5006304de3f'
454  *
455  * We also store the enclosure sysfs path for turning on enclosure LEDs
456  * (if applicable):
457  *      vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
458  */
459 void
460 update_vdev_config_dev_strs(nvlist_t *nv)
461 {
462         vdev_dev_strs_t vds;
463         char *env, *type, *path;
464         uint64_t wholedisk = 0;
465         char *upath, *spath;
466
467         /*
468          * For the benefit of legacy ZFS implementations, allow
469          * for opting out of devid strings in the vdev label.
470          *
471          * example use:
472          *      env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
473          *
474          * explanation:
475          * Older ZFS on Linux implementations had issues when attempting to
476          * display pool config VDEV names if a "devid" NVP value is present
477          * in the pool's config.
478          *
479          * For example, a pool that originated on illumos platform would
480          * have a devid value in the config and "zpool status" would fail
481          * when listing the config.
482          *
483          * A pool can be stripped of any "devid" values on import or
484          * prevented from adding them on zpool create|add by setting
485          * ZFS_VDEV_DEVID_OPT_OUT.
486          */
487         env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
488         if (env && (strtoul(env, NULL, 0) > 0 ||
489             !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
490                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
491                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
492                 return;
493         }
494
495         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
496             strcmp(type, VDEV_TYPE_DISK) != 0) {
497                 return;
498         }
499         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
500                 return;
501         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
502
503         /*
504          * Update device string values in config nvlist
505          */
506         if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
507                 (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
508                 if (vds.vds_devphys[0] != '\0') {
509                         (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
510                             vds.vds_devphys);
511                 }
512
513                 /* Add enclosure sysfs path (if disk is in an enclosure) */
514                 upath = zfs_get_underlying_path(path);
515                 spath = zfs_get_enclosure_sysfs_path(upath);
516                 if (spath)
517                         nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
518                             spath);
519                 else
520                         nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
521
522                 free(upath);
523                 free(spath);
524         } else {
525                 /* clear out any stale entries */
526                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
527                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
528                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
529         }
530 }
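
/*
 * Illustrative sketch (not part of the library): a minimal leaf disk nvlist
 * run through update_vdev_config_dev_strs() to populate the devid, phys_path
 * and enclosure entries.  The device path is a hypothetical example.
 *
 *	nvlist_t *nv = fnvlist_alloc();
 *
 *	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
 *	fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, "/dev/sda1");
 *	fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1ULL);
 *
 *	update_vdev_config_dev_strs(nv);
 *	fnvlist_free(nv);
 */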
531 #else
532
533 boolean_t
534 is_mpath_whole_disk(const char *path)
535 {
536         return (B_FALSE);
537 }
538
539 /*
540  * Wait up to timeout_ms for udev to set up the device node.  The device is
541  * considered ready when the provided path has been verified to exist and
542  * it has been allowed to settle.  At this point the device can be
543  * accessed reliably.  Depending on the complexity of the udev rules this
544  * process could take several seconds.
545  */
546 int
547 zpool_label_disk_wait(char *path, int timeout_ms)
548 {
549         int settle_ms = 50;
550         long sleep_ms = 10;
551         hrtime_t start, settle;
552         struct stat64 statbuf;
553
554         start = gethrtime();
555         settle = 0;
556
557         do {
558                 errno = 0;
559                 if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
560                         if (settle == 0)
561                                 settle = gethrtime();
562                         else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
563                                 return (0);
564                 } else if (errno != ENOENT) {
565                         return (errno);
566                 }
567
568                 usleep(sleep_ms * MILLISEC);
569         } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
570
571         return (ENODEV);
572 }
573
574 void
575 update_vdev_config_dev_strs(nvlist_t *nv)
576 {
577 }
578
579 #endif /* HAVE_LIBUDEV */
580
581 /*
582  * Go through and fix up any path and/or devid information for the given vdev
583  * configuration.
584  */
585 static int
586 fix_paths(nvlist_t *nv, name_entry_t *names)
587 {
588         nvlist_t **child;
589         uint_t c, children;
590         uint64_t guid;
591         name_entry_t *ne, *best;
592         char *path;
593
594         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
595             &child, &children) == 0) {
596                 for (c = 0; c < children; c++)
597                         if (fix_paths(child[c], names) != 0)
598                                 return (-1);
599                 return (0);
600         }
601
602         /*
603          * This is a leaf (file or disk) vdev.  In either case, go through
604          * the name list and see if we find a matching guid.  If so, replace
605          * the path and see if we can calculate a new devid.
606          *
607          * There may be multiple names associated with a particular guid, in
608          * which case we have overlapping partitions or multiple paths to the
609          * same disk.  In this case we prefer to use the path name which
610          * matches the ZPOOL_CONFIG_PATH.  If no matching entry is found we
611          * use the lowest order device which corresponds to the first match
612          * while traversing the ZPOOL_IMPORT_PATH search path.
613          */
614         verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
615         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
616                 path = NULL;
617
618         best = NULL;
619         for (ne = names; ne != NULL; ne = ne->ne_next) {
620                 if (ne->ne_guid == guid) {
621                         if (path == NULL) {
622                                 best = ne;
623                                 break;
624                         }
625
626                         if ((strlen(path) == strlen(ne->ne_name)) &&
627                             strncmp(path, ne->ne_name, strlen(path)) == 0) {
628                                 best = ne;
629                                 break;
630                         }
631
632                         if (best == NULL) {
633                                 best = ne;
634                                 continue;
635                         }
636
637                         /* Prefer paths with more vdev labels. */
638                         if (ne->ne_num_labels > best->ne_num_labels) {
639                                 best = ne;
640                                 continue;
641                         }
642
643                         /* Prefer paths earlier in the search order. */
644                         if (ne->ne_num_labels == best->ne_num_labels &&
645                             ne->ne_order < best->ne_order) {
646                                 best = ne;
647                                 continue;
648                         }
649                 }
650         }
651
652         if (best == NULL)
653                 return (0);
654
655         if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
656                 return (-1);
657
658         /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
659         update_vdev_config_dev_strs(nv);
660
661         return (0);
662 }
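
/*
 * Worked example (hypothetical values): suppose the label's existing
 * ZPOOL_CONFIG_PATH is "/dev/sdb1" and the name list holds two entries for
 * the leaf's guid:
 *
 *	ne_name				ne_num_labels	ne_order
 *	/dev/disk/by-id/ata-disk1-part1	4		0
 *	/dev/sdb1			4		2
 *
 * The exact path match "/dev/sdb1" wins immediately.  If neither entry
 * matched the existing path, the entry with more labels would be preferred,
 * and on a tie the lower ne_order (found earlier in the ZPOOL_IMPORT_PATH
 * search) would win.
 */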
663
664 /*
665  * Add the given configuration to the list of known devices.
666  */
667 static int
668 add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
669     int order, int num_labels, nvlist_t *config)
670 {
671         uint64_t pool_guid, vdev_guid, top_guid, txg, state;
672         pool_entry_t *pe;
673         vdev_entry_t *ve;
674         config_entry_t *ce;
675         name_entry_t *ne;
676
677         /*
678          * If this is a hot spare not currently in use or level 2 cache
679          * device, add it to the list of names to translate, but don't do
680          * anything else.
681          */
682         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
683             &state) == 0 &&
684             (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
685             nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
686                 if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) {
687                         nvlist_free(config);
688                         return (-1);
689                 }
690
691                 if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
692                         free(ne);
693                         nvlist_free(config);
694                         return (-1);
695                 }
696                 ne->ne_guid = vdev_guid;
697                 ne->ne_order = order;
698                 ne->ne_num_labels = num_labels;
699                 ne->ne_next = pl->names;
700                 pl->names = ne;
701                 nvlist_free(config);
702                 return (0);
703         }
704
705         /*
706          * If we have a valid config but cannot read any of these fields, then
707          * it means we have a half-initialized label.  In vdev_label_init()
708          * we write a label with txg == 0 so that we can identify the device
709          * in case the user refers to the same disk later on.  If we fail to
710          * create the pool, we'll be left with a label in this state
711          * which should not be considered part of a valid pool.
712          */
713         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
714             &pool_guid) != 0 ||
715             nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
716             &vdev_guid) != 0 ||
717             nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
718             &top_guid) != 0 ||
719             nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
720             &txg) != 0 || txg == 0) {
721                 nvlist_free(config);
722                 return (0);
723         }
724
725         /*
726          * First, see if we know about this pool.  If not, then add it to the
727          * list of known pools.
728          */
729         for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
730                 if (pe->pe_guid == pool_guid)
731                         break;
732         }
733
734         if (pe == NULL) {
735                 if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
736                         nvlist_free(config);
737                         return (-1);
738                 }
739                 pe->pe_guid = pool_guid;
740                 pe->pe_next = pl->pools;
741                 pl->pools = pe;
742         }
743
744         /*
745  * Second, see if we know about this toplevel vdev.  Add it if it's
746          * missing.
747          */
748         for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
749                 if (ve->ve_guid == top_guid)
750                         break;
751         }
752
753         if (ve == NULL) {
754                 if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
755                         nvlist_free(config);
756                         return (-1);
757                 }
758                 ve->ve_guid = top_guid;
759                 ve->ve_next = pe->pe_vdevs;
760                 pe->pe_vdevs = ve;
761         }
762
763         /*
764          * Third, see if we have a config with a matching transaction group.  If
765          * so, then we do nothing.  Otherwise, add it to the list of known
766          * configs.
767          */
768         for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
769                 if (ce->ce_txg == txg)
770                         break;
771         }
772
773         if (ce == NULL) {
774                 if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
775                         nvlist_free(config);
776                         return (-1);
777                 }
778                 ce->ce_txg = txg;
779                 ce->ce_config = config;
780                 ce->ce_next = ve->ve_configs;
781                 ve->ve_configs = ce;
782         } else {
783                 nvlist_free(config);
784         }
785
786         /*
787          * At this point we've successfully added our config to the list of
788          * known configs.  The last thing to do is add the vdev guid -> path
789          * mappings so that we can fix up the configuration as necessary before
790          * doing the import.
791          */
792         if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
793                 return (-1);
794
795         if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
796                 free(ne);
797                 return (-1);
798         }
799
800         ne->ne_guid = vdev_guid;
801         ne->ne_order = order;
802         ne->ne_num_labels = num_labels;
803         ne->ne_next = pl->names;
804         pl->names = ne;
805
806         return (0);
807 }
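
/*
 * Illustrative sketch (not part of the library): feeding one device's label
 * into the pool list using zpool_read_label() (defined later in this file)
 * and add_config().  The 'hdl' handle and device path are hypothetical, and
 * add_config() takes ownership of the config nvlist.
 *
 *	pool_list_t pools = { 0 };
 *	nvlist_t *config;
 *	int num_labels, fd;
 *
 *	if ((fd = open("/dev/sda1", O_RDONLY)) >= 0) {
 *		if (zpool_read_label(fd, &config, &num_labels) == 0 &&
 *		    config != NULL)
 *			(void) add_config(hdl, &pools, "/dev/sda1", 0,
 *			    num_labels, config);
 *		(void) close(fd);
 *	}
 */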
808
809 /*
810  * Returns true if the named pool matches the given GUID.
811  */
812 static int
813 pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid,
814     boolean_t *isactive)
815 {
816         zpool_handle_t *zhp;
817         uint64_t theguid;
818
819         if (zpool_open_silent(hdl, name, &zhp) != 0)
820                 return (-1);
821
822         if (zhp == NULL) {
823                 *isactive = B_FALSE;
824                 return (0);
825         }
826
827         verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID,
828             &theguid) == 0);
829
830         zpool_close(zhp);
831
832         *isactive = (theguid == guid);
833         return (0);
834 }
835
836 static nvlist_t *
837 refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
838 {
839         nvlist_t *nvl;
840         zfs_cmd_t zc = {"\0"};
841         int err, dstbuf_size;
842
843         if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
844                 return (NULL);
845
846         dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4);
847
848         if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) {
849                 zcmd_free_nvlists(&zc);
850                 return (NULL);
851         }
852
853         while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT,
854             &zc)) != 0 && errno == ENOMEM) {
855                 if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
856                         zcmd_free_nvlists(&zc);
857                         return (NULL);
858                 }
859         }
860
861         if (err) {
862                 zcmd_free_nvlists(&zc);
863                 return (NULL);
864         }
865
866         if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) {
867                 zcmd_free_nvlists(&zc);
868                 return (NULL);
869         }
870
871         zcmd_free_nvlists(&zc);
872         return (nvl);
873 }
874
875 /*
876  * Determine if the vdev id is a hole in the namespace.
877  */
878 boolean_t
879 vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
880 {
881         int c;
882
883         for (c = 0; c < holes; c++) {
884
885                 /* Top-level is a hole */
886                 if (hole_array[c] == id)
887                         return (B_TRUE);
888         }
889         return (B_FALSE);
890 }
891
892 /*
893  * Convert our list of pools into the definitive set of configurations.  We
894  * start by picking the best config for each toplevel vdev.  Once that's done,
895  * we assemble the toplevel vdevs into a full config for the pool.  We make a
896  * pass to fix up any incorrect paths, and then add it to the main list to
897  * return to the user.
898  */
899 static nvlist_t *
900 get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
901 {
902         pool_entry_t *pe;
903         vdev_entry_t *ve;
904         config_entry_t *ce;
905         nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
906         nvlist_t **spares, **l2cache;
907         uint_t i, nspares, nl2cache;
908         boolean_t config_seen;
909         uint64_t best_txg;
910         char *name, *hostname = NULL;
911         uint64_t guid;
912         uint_t children = 0;
913         nvlist_t **child = NULL;
914         uint_t holes;
915         uint64_t *hole_array, max_id;
916         uint_t c;
917         boolean_t isactive;
918         uint64_t hostid;
919         nvlist_t *nvl;
920         boolean_t valid_top_config = B_FALSE;
921
922         if (nvlist_alloc(&ret, 0, 0) != 0)
923                 goto nomem;
924
925         for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
926                 uint64_t id, max_txg = 0;
927
928                 if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
929                         goto nomem;
930                 config_seen = B_FALSE;
931
932                 /*
933                  * Iterate over all toplevel vdevs.  Grab the pool configuration
934                  * from the first one we find, and then go through the rest and
935                  * add them as necessary to the 'vdevs' member of the config.
936                  */
937                 for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
938
939                         /*
940                          * Determine the best configuration for this vdev by
941                          * selecting the config with the latest transaction
942                          * group.
943                          */
944                         best_txg = 0;
945                         for (ce = ve->ve_configs; ce != NULL;
946                             ce = ce->ce_next) {
947
948                                 if (ce->ce_txg > best_txg) {
949                                         tmp = ce->ce_config;
950                                         best_txg = ce->ce_txg;
951                                 }
952                         }
953
954                         /*
955                          * We rely on the fact that the max txg for the
956                          * pool will contain the most up-to-date information
957                          * about the valid top-levels in the vdev namespace.
958                          */
959                         if (best_txg > max_txg) {
960                                 (void) nvlist_remove(config,
961                                     ZPOOL_CONFIG_VDEV_CHILDREN,
962                                     DATA_TYPE_UINT64);
963                                 (void) nvlist_remove(config,
964                                     ZPOOL_CONFIG_HOLE_ARRAY,
965                                     DATA_TYPE_UINT64_ARRAY);
966
967                                 max_txg = best_txg;
968                                 hole_array = NULL;
969                                 holes = 0;
970                                 max_id = 0;
971                                 valid_top_config = B_FALSE;
972
973                                 if (nvlist_lookup_uint64(tmp,
974                                     ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
975                                         verify(nvlist_add_uint64(config,
976                                             ZPOOL_CONFIG_VDEV_CHILDREN,
977                                             max_id) == 0);
978                                         valid_top_config = B_TRUE;
979                                 }
980
981                                 if (nvlist_lookup_uint64_array(tmp,
982                                     ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
983                                     &holes) == 0) {
984                                         verify(nvlist_add_uint64_array(config,
985                                             ZPOOL_CONFIG_HOLE_ARRAY,
986                                             hole_array, holes) == 0);
987                                 }
988                         }
989
990                         if (!config_seen) {
991                                 /*
992                                  * Copy the relevant pieces of data to the pool
993                                  * configuration:
994                                  *
995                                  *      version
996                                  *      pool guid
997                                  *      name
998                                  *      comment (if available)
999                                  *      pool state
1000                                  *      hostid (if available)
1001                                  *      hostname (if available)
1002                                  */
1003                                 uint64_t state, version;
1004                                 char *comment = NULL;
1005
1006                                 version = fnvlist_lookup_uint64(tmp,
1007                                     ZPOOL_CONFIG_VERSION);
1008                                 fnvlist_add_uint64(config,
1009                                     ZPOOL_CONFIG_VERSION, version);
1010                                 guid = fnvlist_lookup_uint64(tmp,
1011                                     ZPOOL_CONFIG_POOL_GUID);
1012                                 fnvlist_add_uint64(config,
1013                                     ZPOOL_CONFIG_POOL_GUID, guid);
1014                                 name = fnvlist_lookup_string(tmp,
1015                                     ZPOOL_CONFIG_POOL_NAME);
1016                                 fnvlist_add_string(config,
1017                                     ZPOOL_CONFIG_POOL_NAME, name);
1018
1019                                 if (nvlist_lookup_string(tmp,
1020                                     ZPOOL_CONFIG_COMMENT, &comment) == 0)
1021                                         fnvlist_add_string(config,
1022                                             ZPOOL_CONFIG_COMMENT, comment);
1023
1024                                 state = fnvlist_lookup_uint64(tmp,
1025                                     ZPOOL_CONFIG_POOL_STATE);
1026                                 fnvlist_add_uint64(config,
1027                                     ZPOOL_CONFIG_POOL_STATE, state);
1028
1029                                 hostid = 0;
1030                                 if (nvlist_lookup_uint64(tmp,
1031                                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1032                                         fnvlist_add_uint64(config,
1033                                             ZPOOL_CONFIG_HOSTID, hostid);
1034                                         hostname = fnvlist_lookup_string(tmp,
1035                                             ZPOOL_CONFIG_HOSTNAME);
1036                                         fnvlist_add_string(config,
1037                                             ZPOOL_CONFIG_HOSTNAME, hostname);
1038                                 }
1039
1040                                 config_seen = B_TRUE;
1041                         }
1042
1043                         /*
1044                          * Add this top-level vdev to the child array.
1045                          */
1046                         verify(nvlist_lookup_nvlist(tmp,
1047                             ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
1048                         verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
1049                             &id) == 0);
1050
1051                         if (id >= children) {
1052                                 nvlist_t **newchild;
1053
1054                                 newchild = zfs_alloc(hdl, (id + 1) *
1055                                     sizeof (nvlist_t *));
1056                                 if (newchild == NULL)
1057                                         goto nomem;
1058
1059                                 for (c = 0; c < children; c++)
1060                                         newchild[c] = child[c];
1061
1062                                 free(child);
1063                                 child = newchild;
1064                                 children = id + 1;
1065                         }
1066                         if (nvlist_dup(nvtop, &child[id], 0) != 0)
1067                                 goto nomem;
1068
1069                 }
1070
1071                 /*
1072                  * If we have information about all the top-levels then
1073                  * clean up the nvlist which we've constructed. This
1074                  * means removing any extraneous devices that are
1075                  * beyond the valid range or adding devices to the end
1076                  * of our array which appear to be missing.
1077                  */
1078                 if (valid_top_config) {
1079                         if (max_id < children) {
1080                                 for (c = max_id; c < children; c++)
1081                                         nvlist_free(child[c]);
1082                                 children = max_id;
1083                         } else if (max_id > children) {
1084                                 nvlist_t **newchild;
1085
1086                                 newchild = zfs_alloc(hdl, (max_id) *
1087                                     sizeof (nvlist_t *));
1088                                 if (newchild == NULL)
1089                                         goto nomem;
1090
1091                                 for (c = 0; c < children; c++)
1092                                         newchild[c] = child[c];
1093
1094                                 free(child);
1095                                 child = newchild;
1096                                 children = max_id;
1097                         }
1098                 }
1099
1100                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1101                     &guid) == 0);
1102
1103                 /*
1104                  * The vdev namespace may contain holes as a result of
1105                  * device removal. We must add them back into the vdev
1106                  * tree before we process any missing devices.
1107                  */
1108                 if (holes > 0) {
1109                         ASSERT(valid_top_config);
1110
1111                         for (c = 0; c < children; c++) {
1112                                 nvlist_t *holey;
1113
1114                                 if (child[c] != NULL ||
1115                                     !vdev_is_hole(hole_array, holes, c))
1116                                         continue;
1117
1118                                 if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
1119                                     0) != 0)
1120                                         goto nomem;
1121
1122                                 /*
1123                                  * Holes in the namespace are treated as
1124                                  * "hole" top-level vdevs and have a
1125                                  * special flag set on them.
1126                                  */
1127                                 if (nvlist_add_string(holey,
1128                                     ZPOOL_CONFIG_TYPE,
1129                                     VDEV_TYPE_HOLE) != 0 ||
1130                                     nvlist_add_uint64(holey,
1131                                     ZPOOL_CONFIG_ID, c) != 0 ||
1132                                     nvlist_add_uint64(holey,
1133                                     ZPOOL_CONFIG_GUID, 0ULL) != 0) {
1134                                         nvlist_free(holey);
1135                                         goto nomem;
1136                                 }
1137                                 child[c] = holey;
1138                         }
1139                 }
1140
1141                 /*
1142                  * Look for any missing top-level vdevs.  If this is the case,
1143                  * create a faked up 'missing' vdev as a placeholder.  We cannot
1144                  * simply compress the child array, because the kernel performs
1145                  * certain checks to make sure the vdev IDs match their location
1146                  * in the configuration.
1147                  */
1148                 for (c = 0; c < children; c++) {
1149                         if (child[c] == NULL) {
1150                                 nvlist_t *missing;
1151                                 if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
1152                                     0) != 0)
1153                                         goto nomem;
1154                                 if (nvlist_add_string(missing,
1155                                     ZPOOL_CONFIG_TYPE,
1156                                     VDEV_TYPE_MISSING) != 0 ||
1157                                     nvlist_add_uint64(missing,
1158                                     ZPOOL_CONFIG_ID, c) != 0 ||
1159                                     nvlist_add_uint64(missing,
1160                                     ZPOOL_CONFIG_GUID, 0ULL) != 0) {
1161                                         nvlist_free(missing);
1162                                         goto nomem;
1163                                 }
1164                                 child[c] = missing;
1165                         }
1166                 }
1167
1168                 /*
1169                  * Put all of this pool's top-level vdevs into a root vdev.
1170                  */
1171                 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
1172                         goto nomem;
1173                 if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1174                     VDEV_TYPE_ROOT) != 0 ||
1175                     nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
1176                     nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
1177                     nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1178                     child, children) != 0) {
1179                         nvlist_free(nvroot);
1180                         goto nomem;
1181                 }
1182
1183                 for (c = 0; c < children; c++)
1184                         nvlist_free(child[c]);
1185                 free(child);
1186                 children = 0;
1187                 child = NULL;
1188
1189                 /*
1190                  * Go through and fix up any paths and/or devids based on our
1191                  * known list of vdev GUID -> path mappings.
1192                  */
1193                 if (fix_paths(nvroot, pl->names) != 0) {
1194                         nvlist_free(nvroot);
1195                         goto nomem;
1196                 }
1197
1198                 /*
1199                  * Add the root vdev to this pool's configuration.
1200                  */
1201                 if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1202                     nvroot) != 0) {
1203                         nvlist_free(nvroot);
1204                         goto nomem;
1205                 }
1206                 nvlist_free(nvroot);
1207
1208                 /*
1209                  * zdb uses this path to report on active pools that were
1210                  * imported or created using -R.
1211                  */
1212                 if (active_ok)
1213                         goto add_pool;
1214
1215                 /*
1216                  * Determine if this pool is currently active, in which case we
1217                  * can't actually import it.
1218                  */
1219                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1220                     &name) == 0);
1221                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1222                     &guid) == 0);
1223
1224                 if (pool_active(hdl, name, guid, &isactive) != 0)
1225                         goto error;
1226
1227                 if (isactive) {
1228                         nvlist_free(config);
1229                         config = NULL;
1230                         continue;
1231                 }
1232
1233                 if ((nvl = refresh_config(hdl, config)) == NULL) {
1234                         nvlist_free(config);
1235                         config = NULL;
1236                         continue;
1237                 }
1238
1239                 nvlist_free(config);
1240                 config = nvl;
1241
1242                 /*
1243                  * Go through and update the paths for spares, now that we have
1244                  * them.
1245                  */
1246                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1247                     &nvroot) == 0);
1248                 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1249                     &spares, &nspares) == 0) {
1250                         for (i = 0; i < nspares; i++) {
1251                                 if (fix_paths(spares[i], pl->names) != 0)
1252                                         goto nomem;
1253                         }
1254                 }
1255
1256                 /*
1257                  * Update the paths for l2cache devices.
1258                  */
1259                 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1260                     &l2cache, &nl2cache) == 0) {
1261                         for (i = 0; i < nl2cache; i++) {
1262                                 if (fix_paths(l2cache[i], pl->names) != 0)
1263                                         goto nomem;
1264                         }
1265                 }
1266
1267                 /*
1268                  * Restore the original information read from the actual label.
1269                  */
1270                 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
1271                     DATA_TYPE_UINT64);
1272                 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
1273                     DATA_TYPE_STRING);
1274                 if (hostid != 0) {
1275                         verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
1276                             hostid) == 0);
1277                         verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
1278                             hostname) == 0);
1279                 }
1280
1281 add_pool:
1282                 /*
1283                  * Add this pool to the list of configs.
1284                  */
1285                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1286                     &name) == 0);
1287                 if (nvlist_add_nvlist(ret, name, config) != 0)
1288                         goto nomem;
1289
1290                 nvlist_free(config);
1291                 config = NULL;
1292         }
1293
1294         return (ret);
1295
1296 nomem:
1297         (void) no_memory(hdl);
1298 error:
1299         nvlist_free(config);
1300         nvlist_free(ret);
1301         for (c = 0; c < children; c++)
1302                 nvlist_free(child[c]);
1303         free(child);
1304
1305         return (NULL);
1306 }
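
/*
 * Illustrative sketch (not part of the library): once a pool_list_t has been
 * populated via add_config(), the importable configurations are assembled
 * with get_configs(); the returned nvlist maps each pool name to its derived
 * config.  The 'hdl' handle and 'pools' list are assumed to exist.
 *
 *	nvlist_t *configs = get_configs(hdl, &pools, B_FALSE);
 */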
1307
1308 /*
1309  * Return the offset of the given label.
1310  */
1311 static uint64_t
1312 label_offset(uint64_t size, int l)
1313 {
1314         ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
1315         return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
1316             0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
1317 }
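
/*
 * Worked example: with sizeof (vdev_label_t) being 256 KiB and
 * VDEV_LABELS == 4, a device whose aligned size is 1 GiB keeps labels 0 and
 * 1 at offsets 0 and 256 KiB, and labels 2 and 3 at (1 GiB - 512 KiB) and
 * (1 GiB - 256 KiB).
 */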
1318
1319 /*
1320  * Given a file descriptor, read the label information and return an nvlist
1321  * describing the configuration, if there is one.  The number of valid
1322  * labels found will be returned in num_labels when non-NULL.
1323  */
1324 int
1325 zpool_read_label(int fd, nvlist_t **config, int *num_labels)
1326 {
1327         struct stat64 statbuf;
1328         int l, count = 0;
1329         vdev_label_t *label;
1330         nvlist_t *expected_config = NULL;
1331         uint64_t expected_guid = 0, size;
1332         int error;
1333
1334         *config = NULL;
1335
1336         if (fstat64_blk(fd, &statbuf) == -1)
1337                 return (0);
1338         size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
1339
1340         error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
1341         if (error)
1342                 return (-1);
1343
1344         for (l = 0; l < VDEV_LABELS; l++) {
1345                 uint64_t state, guid, txg;
1346
1347                 if (pread64(fd, label, sizeof (vdev_label_t),
1348                     label_offset(size, l)) != sizeof (vdev_label_t))
1349                         continue;
1350
1351                 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
1352                     sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
1353                         continue;
1354
1355                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
1356                     &guid) != 0 || guid == 0) {
1357                         nvlist_free(*config);
1358                         continue;
1359                 }
1360
1361                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
1362                     &state) != 0 || state > POOL_STATE_L2CACHE) {
1363                         nvlist_free(*config);
1364                         continue;
1365                 }
1366
1367                 if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
1368                     (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
1369                     &txg) != 0 || txg == 0)) {
1370                         nvlist_free(*config);
1371                         continue;
1372                 }
1373
1374                 if (expected_guid) {
1375                         if (expected_guid == guid)
1376                                 count++;
1377
1378                         nvlist_free(*config);
1379                 } else {
1380                         expected_config = *config;
1381                         expected_guid = guid;
1382                         count++;
1383                 }
1384         }
1385
1386         if (num_labels != NULL)
1387                 *num_labels = count;
1388
1389         free(label);
1390         *config = expected_config;
1391
1392         return (0);
1393 }
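
/*
 * Illustrative usage sketch (the device path is a hypothetical example):
 *
 *	nvlist_t *config;
 *	char *name = NULL;
 *	int num_labels;
 *	int fd = open("/dev/sda1", O_RDONLY);
 *
 *	if (fd >= 0) {
 *		if (zpool_read_label(fd, &config, &num_labels) == 0 &&
 *		    config != NULL) {
 *			(void) nvlist_lookup_string(config,
 *			    ZPOOL_CONFIG_POOL_NAME, &name);
 *			(void) printf("%s: %d valid label(s)\n",
 *			    name ? name : "unknown", num_labels);
 *			nvlist_free(config);
 *		}
 *		(void) close(fd);
 *	}
 */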
1394
1395 typedef struct rdsk_node {
1396         char *rn_name;                  /* Full path to device */
1397         int rn_order;                   /* Preferred order (low to high) */
1398         int rn_num_labels;              /* Number of valid labels */
1399         uint64_t rn_vdev_guid;          /* Expected vdev guid when set */
1400         libzfs_handle_t *rn_hdl;
1401         nvlist_t *rn_config;            /* Label config */
1402         avl_tree_t *rn_avl;
1403         avl_node_t rn_node;
1404         pthread_mutex_t *rn_lock;
1405         boolean_t rn_labelpaths;
1406 } rdsk_node_t;
1407
1408 /*
1409  * Sorted by vdev guid and full path to allow for multiple entries with
1410  * the same full path name.  This is required because it's possible to
1411  * have multiple block devices with labels that refer to the same
1412  * ZPOOL_CONFIG_PATH yet have different vdev guids.  In this case both
1413  * entries need to be added to the cache.  Scenarios where this can occur
1414  * include overwritten pool labels, devices which are visible from multiple
1415  * hosts and multipath devices.
1416  */
1417 static int
1418 slice_cache_compare(const void *arg1, const void *arg2)
1419 {
1420         const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
1421         const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
1422         uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
1423         uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
1424         int rv;
1425
1426         rv = AVL_CMP(guid1, guid2);
1427         if (rv)
1428                 return (rv);
1429
1430         return (AVL_ISIGN(strcmp(nm1, nm2)));
1431 }
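/*
 * Illustrative example of the resulting ordering (hypothetical values): two
 * entries sharing the same rn_vdev_guid fall back to strcmp() on rn_name, so
 *
 *      { guid 0x1234, "/dev/sda" }  sorts before  { guid 0x1234, "/dev/sdb" }
 *
 * while any entry with a numerically smaller guid sorts before both.
 */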
1432
1433 static boolean_t
1434 is_watchdog_dev(char *dev)
1435 {
1436         /* For 'watchdog' dev */
1437         if (strcmp(dev, "watchdog") == 0)
1438                 return (B_TRUE);
1439
1440         /* For 'watchdog<digit><whatever>' */
1441         if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
1442                 return (B_TRUE);
1443
1444         return (B_FALSE);
1445 }
1446
1447 static int
1448 label_paths_impl(libzfs_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
1449     uint64_t vdev_guid, char **path, char **devid)
1450 {
1451         nvlist_t **child;
1452         uint_t c, children;
1453         uint64_t guid;
1454         char *val;
1455         int error;
1456
1457         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1458             &child, &children) == 0) {
1459                 for (c = 0; c < children; c++) {
1460                         error  = label_paths_impl(hdl, child[c],
1461                             pool_guid, vdev_guid, path, devid);
1462                         if (error)
1463                                 return (error);
1464                 }
1465                 return (0);
1466         }
1467
1468         if (nvroot == NULL)
1469                 return (0);
1470
1471         error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
1472         if ((error != 0) || (guid != vdev_guid))
1473                 return (0);
1474
1475         error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
1476         if (error == 0)
1477                 *path = val;
1478
1479         error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
1480         if (error == 0)
1481                 *devid = val;
1482
1483         return (0);
1484 }
1485
1486 /*
1487  * Given a disk label, fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
1488  * strings and return them via *path and *devid respectively.
1489  * The returned pointers are only valid as long as label remains valid.
1490  */
1491 static int
1492 label_paths(libzfs_handle_t *hdl, nvlist_t *label, char **path, char **devid)
1493 {
1494         nvlist_t *nvroot;
1495         uint64_t pool_guid;
1496         uint64_t vdev_guid;
1497
1498         *path = NULL;
1499         *devid = NULL;
1500
1501         if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1502             nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
1503             nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid))
1504                 return (ENOENT);
1505
1506         return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
1507             devid));
1508 }
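/*
 * Illustrative sketch of how a caller uses label_paths(): the returned
 * strings point into the label nvlist, so nothing extra needs to be freed.
 *
 *      char *path, *devid;
 *
 *      if (label_paths(hdl, label, &path, &devid) == 0) {
 *              if (path != NULL)
 *                      ... ZPOOL_CONFIG_PATH recorded in the label ...
 *              if (devid != NULL)
 *                      ... ZPOOL_CONFIG_DEVID recorded in the label ...
 *      }
 */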
1509
1510 static void
1511 zpool_open_func(void *arg)
1512 {
1513         rdsk_node_t *rn = arg;
1514         libzfs_handle_t *hdl = rn->rn_hdl;
1515         struct stat64 statbuf;
1516         nvlist_t *config;
1517         char *bname, *dupname;
1518         uint64_t vdev_guid = 0;
1519         int error;
1520         int num_labels;
1521         int fd;
1522
1523         /*
1524          * Skip devices with well known prefixes; there can be side effects
1525          * when opening these devices which need to be avoided.
1526          *
1527          * hpet     - High Precision Event Timer
1528          * watchdog - Watchdog must be closed in a special way.
1529          */
1530         dupname = zfs_strdup(hdl, rn->rn_name);
1531         bname = basename(dupname);
1532         error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
1533         free(dupname);
1534         if (error)
1535                 return;
1536
1537         /*
1538          * Ignore failed stats.  We only want regular files and block devices.
1539          */
1540         if (stat64(rn->rn_name, &statbuf) != 0 ||
1541             (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
1542                 return;
1543
1544         /*
1545          * Preferentially open using O_DIRECT to bypass the block device
1546          * cache which may be stale for multipath devices.  An EINVAL errno
1547          * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
1548          * indicates O_DIRECT is unsupported, so fall back to just O_RDONLY.
1549         fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
1550         if ((fd < 0) && (errno == EINVAL))
1551                 fd = open(rn->rn_name, O_RDONLY);
1552
1553         if (fd < 0)
1554                 return;
1555
1556         /*
1557          * This file is too small to hold a zpool
1558          */
1559         if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
1560                 (void) close(fd);
1561                 return;
1562         }
1563
1564         error = zpool_read_label(fd, &config, &num_labels);
1565         if (error != 0) {
1566                 (void) close(fd);
1567                 return;
1568         }
1569
1570         if (num_labels == 0) {
1571                 (void) close(fd);
1572                 nvlist_free(config);
1573                 return;
1574         }
1575
1576         /*
1577          * Check that the vdev is for the expected guid.  Additional entries
1578          * are speculatively added based on the paths stored in the labels.
1579          * Entries with valid paths but incorrect guids must be removed.
1580          */
1581         error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
1582         if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
1583                 (void) close(fd);
1584                 nvlist_free(config);
1585                 return;
1586         }
1587
1588         (void) close(fd);
1589
1590         rn->rn_config = config;
1591         rn->rn_num_labels = num_labels;
1592
1593         /*
1594          * Add additional entries for paths described by this label.
1595          */
1596         if (rn->rn_labelpaths) {
1597                 char *path = NULL;
1598                 char *devid = NULL;
1599                 rdsk_node_t *slice;
1600                 avl_index_t where;
1601                 int error;
1602
1603                 if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
1604                         return;
1605
1606                 /*
1607                  * Allow devlinks to stabilize so all paths are available.
1608                  */
1609                 zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);
1610
1611                 if (path != NULL) {
1612                         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1613                         slice->rn_name = zfs_strdup(hdl, path);
1614                         slice->rn_vdev_guid = vdev_guid;
1615                         slice->rn_avl = rn->rn_avl;
1616                         slice->rn_hdl = hdl;
1617                         slice->rn_order = IMPORT_ORDER_PREFERRED_1;
1618                         slice->rn_labelpaths = B_FALSE;
1619                         pthread_mutex_lock(rn->rn_lock);
1620                         if (avl_find(rn->rn_avl, slice, &where)) {
1621                                 pthread_mutex_unlock(rn->rn_lock);
1622                                 free(slice->rn_name);
1623                                 free(slice);
1624                         } else {
1625                                 avl_insert(rn->rn_avl, slice, where);
1626                                 pthread_mutex_unlock(rn->rn_lock);
1627                                 zpool_open_func(slice);
1628                         }
1629                 }
1630
1631                 if (devid != NULL) {
1632                         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1633                         error = asprintf(&slice->rn_name, "%s%s",
1634                             DEV_BYID_PATH, devid);
1635                         if (error == -1) {
1636                                 free(slice);
1637                                 return;
1638                         }
1639
1640                         slice->rn_vdev_guid = vdev_guid;
1641                         slice->rn_avl = rn->rn_avl;
1642                         slice->rn_hdl = hdl;
1643                         slice->rn_order = IMPORT_ORDER_PREFERRED_2;
1644                         slice->rn_labelpaths = B_FALSE;
1645                         pthread_mutex_lock(rn->rn_lock);
1646                         if (avl_find(rn->rn_avl, slice, &where)) {
1647                                 pthread_mutex_unlock(rn->rn_lock);
1648                                 free(slice->rn_name);
1649                                 free(slice);
1650                         } else {
1651                                 avl_insert(rn->rn_avl, slice, where);
1652                                 pthread_mutex_unlock(rn->rn_lock);
1653                                 zpool_open_func(slice);
1654                         }
1655                 }
1656         }
1657 }
1658
1659 /*
1660  * Given a file descriptor, clear (zero) the label information.  This function
1661  * is used in the appliance stack as part of the ZFS sysevent module and
1662  * to implement the "zpool labelclear" command.
1663  */
1664 int
1665 zpool_clear_label(int fd)
1666 {
1667         struct stat64 statbuf;
1668         int l;
1669         vdev_label_t *label;
1670         uint64_t size;
1671
1672         if (fstat64_blk(fd, &statbuf) == -1)
1673                 return (0);
1674         size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
1675
1676         if ((label = calloc(1, sizeof (vdev_label_t))) == NULL)
1677                 return (-1);
1678
1679         for (l = 0; l < VDEV_LABELS; l++) {
1680                 if (pwrite64(fd, label, sizeof (vdev_label_t),
1681                     label_offset(size, l)) != sizeof (vdev_label_t)) {
1682                         free(label);
1683                         return (-1);
1684                 }
1685         }
1686
1687         free(label);
1688         return (0);
1689 }
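/*
 * Illustrative usage sketch: clear the labels on a device before it is
 * repurposed.  The device path is a hypothetical example and the descriptor
 * must be writable.
 *
 *      int fd = open("/dev/sdb1", O_RDWR);
 *
 *      if (fd >= 0) {
 *              if (zpool_clear_label(fd) != 0)
 *                      ... handle error, labels may be partially zeroed ...
 *              (void) close(fd);
 *      }
 */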
1690
1691 static void
1692 zpool_find_import_scan_add_slice(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1693     avl_tree_t *cache, char *path, const char *name, int order)
1694 {
1695         avl_index_t where;
1696         rdsk_node_t *slice;
1697
1698         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1699         if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
1700                 free(slice);
1701                 return;
1702         }
1703         slice->rn_vdev_guid = 0;
1704         slice->rn_lock = lock;
1705         slice->rn_avl = cache;
1706         slice->rn_hdl = hdl;
1707         slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
1708         slice->rn_labelpaths = B_FALSE;
1709
1710         pthread_mutex_lock(lock);
1711         if (avl_find(cache, slice, &where)) {
1712                 free(slice->rn_name);
1713                 free(slice);
1714         } else {
1715                 avl_insert(cache, slice, where);
1716         }
1717         pthread_mutex_unlock(lock);
1718 }
1719
1720 static int
1721 zpool_find_import_scan_dir(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1722     avl_tree_t *cache, char *dir, int order)
1723 {
1724         int error;
1725         char path[MAXPATHLEN];
1726         struct dirent64 *dp;
1727         DIR *dirp;
1728
1729         if (realpath(dir, path) == NULL) {
1730                 error = errno;
1731                 if (error == ENOENT)
1732                         return (0);
1733
1734                 zfs_error_aux(hdl, strerror(error));
1735                 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1736                     TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
1737                 return (error);
1738         }
1739
1740         dirp = opendir(path);
1741         if (dirp == NULL) {
1742                 error = errno;
1743                 zfs_error_aux(hdl, strerror(error));
1744                 (void) zfs_error_fmt(hdl, EZFS_BADPATH,
1745                     dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
1746                 return (error);
1747         }
1748
1749         while ((dp = readdir64(dirp)) != NULL) {
1750                 const char *name = dp->d_name;
1751                 if (name[0] == '.' &&
1752                     (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
1753                         continue;
1754
1755                 zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
1756                     order);
1757         }
1758
1759         (void) closedir(dirp);
1760         return (0);
1761 }
1762
1763 static int
1764 zpool_find_import_scan_path(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1765     avl_tree_t *cache, char *dir, int order)
1766 {
1767         int error = 0;
1768         char path[MAXPATHLEN];
1769         char *d, *b;
1770         char *dpath, *name;
1771
1772         /*
1773          * Separate the directory part and last part of the
1774          * path.  We do this so that we can get the realpath of
1775          * the directory.  We don't get the realpath of the
1776          * whole path because if it's a symlink, we want the
1777          * path of the symlink, not where it points to.
1778          */
1779         d = zfs_strdup(hdl, dir);
1780         b = zfs_strdup(hdl, dir);
1781         dpath = dirname(d);
1782         name = basename(b);
1783
1784         if (realpath(dpath, path) == NULL) {
1785                 error = errno;
1786                 if (error == ENOENT) {
1787                         error = 0;
1788                         goto out;
1789                 }
1790
1791                 zfs_error_aux(hdl, strerror(error));
1792                 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1793                     TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
1794                 goto out;
1795         }
1796
1797         zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
1798
1799 out:
1800         free(b);
1801         free(d);
1802         return (error);
1803 }
1804
1805 /*
1806  * Scan a list of directories for zfs devices.
1807  */
1808 static int
1809 zpool_find_import_scan(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1810     avl_tree_t **slice_cache, char **dir, int dirs)
1811 {
1812         avl_tree_t *cache;
1813         rdsk_node_t *slice;
1814         void *cookie;
1815         int i, error;
1816
1817         *slice_cache = NULL;
1818         cache = zfs_alloc(hdl, sizeof (avl_tree_t));
1819         avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
1820             offsetof(rdsk_node_t, rn_node));
1821
1822         for (i = 0; i < dirs; i++) {
1823                 struct stat sbuf;
1824
1825                 if (stat(dir[i], &sbuf) != 0) {
1826                         error = errno;
1827                         if (error == ENOENT)
1828                                 continue;
1829
1830                         zfs_error_aux(hdl, strerror(error));
1831                         (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1832                             TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
1833                         goto error;
1834                 }
1835
1836                 /*
1837                  * If dir[i] is a directory, we walk through it and add all
1838                  * of its entries to the cache.  If it's not a directory, we
1839                  * just add it to the cache.
1840                  */
1841                 if (S_ISDIR(sbuf.st_mode)) {
1842                         if ((error = zpool_find_import_scan_dir(hdl, lock,
1843                             cache, dir[i], i)) != 0)
1844                                 goto error;
1845                 } else {
1846                         if ((error = zpool_find_import_scan_path(hdl, lock,
1847                             cache, dir[i], i)) != 0)
1848                                 goto error;
1849                 }
1850         }
1851
1852         *slice_cache = cache;
1853         return (0);
1854
1855 error:
1856         cookie = NULL;
1857         while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
1858                 free(slice->rn_name);
1859                 free(slice);
1860         }
1861         free(cache);
1862
1863         return (error);
1864 }
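/*
 * Illustrative sketch: a caller consumes and tears down the returned cache
 * with avl_destroy_nodes(), mirroring what zpool_find_import_impl() below
 * does once the labels have been read.
 *
 *      void *cookie = NULL;
 *      rdsk_node_t *slice;
 *
 *      while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
 *              ... inspect slice->rn_name and slice->rn_config ...
 *              free(slice->rn_name);
 *              free(slice);
 *      }
 *      avl_destroy(cache);
 *      free(cache);
 */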
1865
1866 /*
1867  * Use libblkid to quickly enumerate all known zfs devices.
1868  */
1869 static int
1870 zpool_find_import_blkid(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1871     avl_tree_t **slice_cache)
1872 {
1873         rdsk_node_t *slice;
1874         blkid_cache cache;
1875         blkid_dev_iterate iter;
1876         blkid_dev dev;
1877         avl_index_t where;
1878         int error;
1879
1880         *slice_cache = NULL;
1881
1882         error = blkid_get_cache(&cache, NULL);
1883         if (error != 0)
1884                 return (error);
1885
1886         error = blkid_probe_all_new(cache);
1887         if (error != 0) {
1888                 blkid_put_cache(cache);
1889                 return (error);
1890         }
1891
1892         iter = blkid_dev_iterate_begin(cache);
1893         if (iter == NULL) {
1894                 blkid_put_cache(cache);
1895                 return (EINVAL);
1896         }
1897
1898         error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
1899         if (error != 0) {
1900                 blkid_dev_iterate_end(iter);
1901                 blkid_put_cache(cache);
1902                 return (error);
1903         }
1904
1905         *slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
1906         avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
1907             offsetof(rdsk_node_t, rn_node));
1908
1909         while (blkid_dev_next(iter, &dev) == 0) {
1910                 slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1911                 slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
1912                 slice->rn_vdev_guid = 0;
1913                 slice->rn_lock = lock;
1914                 slice->rn_avl = *slice_cache;
1915                 slice->rn_hdl = hdl;
1916                 slice->rn_labelpaths = B_TRUE;
1917
1918                 error = zfs_path_order(slice->rn_name, &slice->rn_order);
1919                 if (error == 0)
1920                         slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
1921                 else
1922                         slice->rn_order = IMPORT_ORDER_DEFAULT;
1923
1924                 pthread_mutex_lock(lock);
1925                 if (avl_find(*slice_cache, slice, &where)) {
1926                         free(slice->rn_name);
1927                         free(slice);
1928                 } else {
1929                         avl_insert(*slice_cache, slice, where);
1930                 }
1931                 pthread_mutex_unlock(lock);
1932         }
1933
1934         blkid_dev_iterate_end(iter);
1935         blkid_put_cache(cache);
1936
1937         return (0);
1938 }
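/*
 * For reference, the device set discovered above is roughly what blkid(8)
 * reports for a TYPE filter (illustrative command, output elided):
 *
 *      # blkid -t TYPE="zfs_member"
 *
 * Devices found this way are added with rn_labelpaths set, so any additional
 * paths recorded in their labels are probed as well.
 */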
1939
1940 char *
1941 zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
1942         "/dev/disk/by-vdev",    /* Custom rules, use first if they exist */
1943         "/dev/mapper",          /* Use multipath devices before components */
1944         "/dev/disk/by-partlabel", /* Single unique entry set by user */
1945         "/dev/disk/by-partuuid", /* Generated partition uuid */
1946         "/dev/disk/by-label",   /* Custom persistent labels */
1947         "/dev/disk/by-uuid",    /* Single unique entry and persistent */
1948         "/dev/disk/by-id",      /* May be multiple entries and persistent */
1949         "/dev/disk/by-path",    /* Encodes physical location and persistent */
1950         "/dev"                  /* UNSAFE device names will change */
1951 };
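/*
 * Note: these defaults are only consulted when a directory scan is requested
 * without explicit search paths (see zpool_find_import_impl() below); for
 * example, "zpool import -d /dev/disk/by-id" restricts the search to the
 * supplied directory instead.
 */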
1952
1953 /*
1954  * Given a list of directories to search, find all pools stored on disk.  This
1955  * includes partial pools which are not available to import.  If no directories
1956  * are given, the default import paths (or libblkid discovery) are used instead.
1957  * poolname or guid (but not both) are provided by the caller when trying
1958  * to import a specific pool.
1959  */
1960 static nvlist_t *
1961 zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
1962 {
1963         nvlist_t *ret = NULL;
1964         pool_list_t pools = { 0 };
1965         pool_entry_t *pe, *penext;
1966         vdev_entry_t *ve, *venext;
1967         config_entry_t *ce, *cenext;
1968         name_entry_t *ne, *nenext;
1969         pthread_mutex_t lock;
1970         avl_tree_t *cache;
1971         rdsk_node_t *slice;
1972         void *cookie;
1973         tpool_t *t;
1974
1975         verify(iarg->poolname == NULL || iarg->guid == 0);
1976         pthread_mutex_init(&lock, NULL);
1977
1978         /*
1979          * Locate pool member vdevs using libblkid or by directory scanning.
1980          * On success a newly allocated AVL tree, populated with an entry for
1981          * each discovered vdev, is returned as the cache.  It is the caller's
1982          * responsibility to consume and destroy this tree.
1983          */
1984         if (iarg->scan || iarg->paths != 0) {
1985                 int dirs = iarg->paths;
1986                 char **dir = iarg->path;
1987
1988                 if (dirs == 0) {
1989                         dir = zpool_default_import_path;
1990                         dirs = DEFAULT_IMPORT_PATH_SIZE;
1991                 }
1992
1993                 if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0)
1994                         return (NULL);
1995         } else {
1996                 if (zpool_find_import_blkid(hdl, &lock, &cache) != 0)
1997                         return (NULL);
1998         }
1999
2000         /*
2001          * Create a thread pool to parallelize the process of reading and
2002          * validating labels; a large number of threads can be used because
2003          * contention is minimal.
2004          */
2005         t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
2006         for (slice = avl_first(cache); slice;
2007             (slice = avl_walk(cache, slice, AVL_AFTER)))
2008                 (void) tpool_dispatch(t, zpool_open_func, slice);
2009
2010         tpool_wait(t);
2011         tpool_destroy(t);
2012
2013         /*
2014          * Process the cache, filtering out any entries which are not for
2015          * the specified pool and then adding the matching label configs.
2016          */
2017         cookie = NULL;
2018         while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
2019                 if (slice->rn_config != NULL) {
2020                         nvlist_t *config = slice->rn_config;
2021                         boolean_t matched = B_TRUE;
2022                         boolean_t aux = B_FALSE;
2023                         int fd;
2024
2025                         /*
2026                          * Check if it's a spare or l2cache device. If it is,
2027                          * we need to skip the name and guid check since they
2028                          * don't exist on aux device labels.
2029                          */
2030                         if (iarg->poolname != NULL || iarg->guid != 0) {
2031                                 uint64_t state;
2032                                 aux = nvlist_lookup_uint64(config,
2033                                     ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
2034                                     (state == POOL_STATE_SPARE ||
2035                                     state == POOL_STATE_L2CACHE);
2036                         }
2037
2038                         if (iarg->poolname != NULL && !aux) {
2039                                 char *pname;
2040
2041                                 matched = nvlist_lookup_string(config,
2042                                     ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
2043                                     strcmp(iarg->poolname, pname) == 0;
2044                         } else if (iarg->guid != 0 && !aux) {
2045                                 uint64_t this_guid;
2046
2047                                 matched = nvlist_lookup_uint64(config,
2048                                     ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
2049                                     iarg->guid == this_guid;
2050                         }
2051                         if (!matched) {
2052                                 nvlist_free(config);
2053                         } else {
2054                                 /*
2055                                  * Verify all remaining entries can be opened
2056                                  * exclusively. This will prune all underlying
2057                                  * multipath devices which otherwise could
2058                                  * result in the vdev appearing as UNAVAIL.
2059                                  *
2060                                  * Under zdb, this step isn't required and
2061                                  * would prevent a zdb -e of active pools with
2062                                  * no cachefile.
2063                                  */
2064                                 fd = open(slice->rn_name, O_RDONLY | O_EXCL);
2065                                 if (fd >= 0 || iarg->can_be_active) {
2066                                         if (fd >= 0)
2067                                                 close(fd);
2068                                         add_config(hdl, &pools,
2069                                             slice->rn_name, slice->rn_order,
2070                                             slice->rn_num_labels, config);
2071                                 } else {
2072                                         nvlist_free(config);
2073                                 }
2074                         }
2075                 }
2076                 free(slice->rn_name);
2077                 free(slice);
2078         }
2079         avl_destroy(cache);
2080         free(cache);
2081         pthread_mutex_destroy(&lock);
2082
2083         ret = get_configs(hdl, &pools, iarg->can_be_active);
2084
2085         for (pe = pools.pools; pe != NULL; pe = penext) {
2086                 penext = pe->pe_next;
2087                 for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
2088                         venext = ve->ve_next;
2089                         for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
2090                                 cenext = ce->ce_next;
2091                                 nvlist_free(ce->ce_config);
2092                                 free(ce);
2093                         }
2094                         free(ve);
2095                 }
2096                 free(pe);
2097         }
2098
2099         for (ne = pools.names; ne != NULL; ne = nenext) {
2100                 nenext = ne->ne_next;
2101                 free(ne->ne_name);
2102                 free(ne);
2103         }
2104
2105         return (ret);
2106 }
2107
2108 nvlist_t *
2109 zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
2110 {
2111         importargs_t iarg = { 0 };
2112
2113         iarg.paths = argc;
2114         iarg.path = argv;
2115
2116         return (zpool_find_import_impl(hdl, &iarg));
2117 }
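/*
 * Illustrative usage sketch: search a caller-provided directory for
 * importable pools.  The directory shown is a hypothetical example.
 *
 *      char *dirs[] = { "/dev/disk/by-id" };
 *      nvlist_t *pools = zpool_find_import(hdl, 1, dirs);
 *
 *      if (pools != NULL) {
 *              ... iterate with nvlist_next_nvpair() ...
 *              nvlist_free(pools);
 *      }
 */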
2118
2119 /*
2120  * Given a cache file, return the contents as a list of importable pools.
2121  * poolname or guid (but not both) are provided by the caller when trying
2122  * to import a specific pool.
2123  */
2124 nvlist_t *
2125 zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
2126     char *poolname, uint64_t guid)
2127 {
2128         char *buf;
2129         int fd;
2130         struct stat64 statbuf;
2131         nvlist_t *raw, *src, *dst;
2132         nvlist_t *pools;
2133         nvpair_t *elem;
2134         char *name;
2135         uint64_t this_guid;
2136         boolean_t active;
2137
2138         verify(poolname == NULL || guid == 0);
2139
2140         if ((fd = open(cachefile, O_RDONLY)) < 0) {
2141                 zfs_error_aux(hdl, "%s", strerror(errno));
2142                 (void) zfs_error(hdl, EZFS_BADCACHE,
2143                     dgettext(TEXT_DOMAIN, "failed to open cache file"));
2144                 return (NULL);
2145         }
2146
2147         if (fstat64(fd, &statbuf) != 0) {
2148                 zfs_error_aux(hdl, "%s", strerror(errno));
2149                 (void) close(fd);
2150                 (void) zfs_error(hdl, EZFS_BADCACHE,
2151                     dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
2152                 return (NULL);
2153         }
2154
2155         if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
2156                 (void) close(fd);
2157                 return (NULL);
2158         }
2159
2160         if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2161                 (void) close(fd);
2162                 free(buf);
2163                 (void) zfs_error(hdl, EZFS_BADCACHE,
2164                     dgettext(TEXT_DOMAIN,
2165                     "failed to read cache file contents"));
2166                 return (NULL);
2167         }
2168
2169         (void) close(fd);
2170
2171         if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
2172                 free(buf);
2173                 (void) zfs_error(hdl, EZFS_BADCACHE,
2174                     dgettext(TEXT_DOMAIN,
2175                     "invalid or corrupt cache file contents"));
2176                 return (NULL);
2177         }
2178
2179         free(buf);
2180
2181         /*
2182          * Go through and get the current state of the pools and refresh their
2183          * state.
2184          */
2185         if (nvlist_alloc(&pools, 0, 0) != 0) {
2186                 (void) no_memory(hdl);
2187                 nvlist_free(raw);
2188                 return (NULL);
2189         }
2190
2191         elem = NULL;
2192         while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
2193                 src = fnvpair_value_nvlist(elem);
2194
2195                 name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
2196                 if (poolname != NULL && strcmp(poolname, name) != 0)
2197                         continue;
2198
2199                 this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
2200                 if (guid != 0 && guid != this_guid)
2201                         continue;
2202
2203                 if (pool_active(hdl, name, this_guid, &active) != 0) {
2204                         nvlist_free(raw);
2205                         nvlist_free(pools);
2206                         return (NULL);
2207                 }
2208
2209                 if (active)
2210                         continue;
2211
2212                 if ((dst = refresh_config(hdl, src)) == NULL) {
2213                         nvlist_free(raw);
2214                         nvlist_free(pools);
2215                         return (NULL);
2216                 }
2217
2218                 if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
2219                         (void) no_memory(hdl);
2220                         nvlist_free(dst);
2221                         nvlist_free(raw);
2222                         nvlist_free(pools);
2223                         return (NULL);
2224                 }
2225                 nvlist_free(dst);
2226         }
2227
2228         nvlist_free(raw);
2229         return (pools);
2230 }
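/*
 * Illustrative usage sketch: list the pools recorded in a cache file.  The
 * path shown is the conventional default and is only an example.
 *
 *      nvlist_t *pools;
 *      nvpair_t *elem = NULL;
 *
 *      pools = zpool_find_import_cached(hdl, "/etc/zfs/zpool.cache", NULL, 0);
 *      if (pools != NULL) {
 *              while ((elem = nvlist_next_nvpair(pools, elem)) != NULL)
 *                      (void) printf("%s\n", nvpair_name(elem));
 *              nvlist_free(pools);
 *      }
 */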
2231
2232 static int
2233 name_or_guid_exists(zpool_handle_t *zhp, void *data)
2234 {
2235         importargs_t *import = data;
2236         int found = 0;
2237
2238         if (import->poolname != NULL) {
2239                 char *pool_name;
2240
2241                 verify(nvlist_lookup_string(zhp->zpool_config,
2242                     ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
2243                 if (strcmp(pool_name, import->poolname) == 0)
2244                         found = 1;
2245         } else {
2246                 uint64_t pool_guid;
2247
2248                 verify(nvlist_lookup_uint64(zhp->zpool_config,
2249                     ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
2250                 if (pool_guid == import->guid)
2251                         found = 1;
2252         }
2253
2254         zpool_close(zhp);
2255         return (found);
2256 }
2257
2258 nvlist_t *
2259 zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
2260 {
2261         verify(import->poolname == NULL || import->guid == 0);
2262
2263         if (import->unique)
2264                 import->exists = zpool_iter(hdl, name_or_guid_exists, import);
2265
2266         if (import->cachefile != NULL)
2267                 return (zpool_find_import_cached(hdl, import->cachefile,
2268                     import->poolname, import->guid));
2269
2270         return (zpool_find_import_impl(hdl, import));
2271 }
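/*
 * Illustrative usage sketch: search for one specific pool by name using the
 * default device discovery (no cachefile, no explicit paths).  "tank" is a
 * hypothetical pool name.
 *
 *      importargs_t args = { 0 };
 *      nvlist_t *pools;
 *
 *      args.poolname = "tank";
 *      pools = zpool_search_import(hdl, &args);
 *      ...
 *      nvlist_free(pools);
 */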
2272
2273 static boolean_t
2274 pool_match(nvlist_t *cfg, char *tgt)
2275 {
2276         uint64_t v, guid = strtoull(tgt, NULL, 0);
2277         char *s;
2278
2279         if (guid != 0) {
2280                 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
2281                         return (v == guid);
2282         } else {
2283                 if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
2284                         return (strcmp(s, tgt) == 0);
2285         }
2286         return (B_FALSE);
2287 }
2288
2289 int
2290 zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp,
2291     importargs_t *args)
2292 {
2293         nvlist_t *pools;
2294         nvlist_t *match = NULL;
2295         nvlist_t *config = NULL;
2296         char *name = NULL, *sepp = NULL;
2297         char sep = '\0';
2298         int count = 0;
2299         char *targetdup = strdup(target);
2300
2301         *configp = NULL;
2302
2303         if ((sepp = strpbrk(targetdup, "/@")) != NULL) {
2304                 sep = *sepp;
2305                 *sepp = '\0';
2306         }
2307
2308         pools = zpool_search_import(hdl, args);
2309
2310         if (pools != NULL) {
2311                 nvpair_t *elem = NULL;
2312                 while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
2313                         VERIFY0(nvpair_value_nvlist(elem, &config));
2314                         if (pool_match(config, targetdup)) {
2315                                 count++;
2316                                 if (match != NULL) {
2317                                         /* multiple matches found */
2318                                         continue;
2319                                 } else {
2320                                         match = config;
2321                                         name = nvpair_name(elem);
2322                                 }
2323                         }
2324                 }
2325         }
2326
2327         if (count == 0) {
2328                 (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2329                     "no pools found"));
2330                 free(targetdup);
2331                 return (ENOENT);
2332         }
2333
2334         if (count > 1) {
2335                 (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2336                     "%d pools found, use pool GUID\n"), count);
2337                 free(targetdup);
2338                 return (EINVAL);
2339         }
2340
2341         *configp = match;
2342         free(targetdup);
2343
2344         return (0);
2345 }
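/*
 * Illustrative usage sketch: resolve a pool name (or GUID string) to the
 * single matching config.  "tank" is a hypothetical target.
 *
 *      importargs_t args = { 0 };
 *      nvlist_t *config = NULL;
 *      char target[] = "tank";
 *
 *      if (zpool_tryimport(hdl, target, &config, &args) == 0)
 *              ... config describes the discovered pool ...
 */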
2346
2347 boolean_t
2348 find_guid(nvlist_t *nv, uint64_t guid)
2349 {
2350         uint64_t tmp;
2351         nvlist_t **child;
2352         uint_t c, children;
2353
2354         verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
2355         if (tmp == guid)
2356                 return (B_TRUE);
2357
2358         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2359             &child, &children) == 0) {
2360                 for (c = 0; c < children; c++)
2361                         if (find_guid(child[c], guid))
2362                                 return (B_TRUE);
2363         }
2364
2365         return (B_FALSE);
2366 }
2367
2368 typedef struct aux_cbdata {
2369         const char      *cb_type;
2370         uint64_t        cb_guid;
2371         zpool_handle_t  *cb_zhp;
2372 } aux_cbdata_t;
2373
2374 static int
2375 find_aux(zpool_handle_t *zhp, void *data)
2376 {
2377         aux_cbdata_t *cbp = data;
2378         nvlist_t **list;
2379         uint_t i, count;
2380         uint64_t guid;
2381         nvlist_t *nvroot;
2382
2383         verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
2384             &nvroot) == 0);
2385
2386         if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
2387             &list, &count) == 0) {
2388                 for (i = 0; i < count; i++) {
2389                         verify(nvlist_lookup_uint64(list[i],
2390                             ZPOOL_CONFIG_GUID, &guid) == 0);
2391                         if (guid == cbp->cb_guid) {
2392                                 cbp->cb_zhp = zhp;
2393                                 return (1);
2394                         }
2395                 }
2396         }
2397
2398         zpool_close(zhp);
2399         return (0);
2400 }
2401
2402 /*
2403  * Determines if the pool is in use.  If so, it sets *inuse and returns the
2404  * state and name of the pool.  The name string is allocated and must be
2405  * freed by the caller.
2406  */
2407 int
2408 zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
2409     boolean_t *inuse)
2410 {
2411         nvlist_t *config;
2412         char *name;
2413         boolean_t ret;
2414         uint64_t guid, vdev_guid;
2415         zpool_handle_t *zhp;
2416         nvlist_t *pool_config;
2417         uint64_t stateval, isspare;
2418         aux_cbdata_t cb = { 0 };
2419         boolean_t isactive;
2420
2421         *inuse = B_FALSE;
2422
2423         if (zpool_read_label(fd, &config, NULL) != 0) {
2424                 (void) no_memory(hdl);
2425                 return (-1);
2426         }
2427
2428         if (config == NULL)
2429                 return (0);
2430
2431         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2432             &stateval) == 0);
2433         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
2434             &vdev_guid) == 0);
2435
2436         if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
2437                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
2438                     &name) == 0);
2439                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
2440                     &guid) == 0);
2441         }
2442
2443         switch (stateval) {
2444         case POOL_STATE_EXPORTED:
2445                 /*
2446                  * A pool with an exported state may in fact be imported
2447                  * read-only, so check the in-core state to see if it's
2448                  * active and imported read-only.  If it is, set
2449                  * its state to active.
2450                  */
2451                 if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
2452                     (zhp = zpool_open_canfail(hdl, name)) != NULL) {
2453                         if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
2454                                 stateval = POOL_STATE_ACTIVE;
2455
2456                         /*
2457                          * All we needed the zpool handle for is the
2458                          * readonly prop check.
2459                          */
2460                         zpool_close(zhp);
2461                 }
2462
2463                 ret = B_TRUE;
2464                 break;
2465
2466         case POOL_STATE_ACTIVE:
2467                 /*
2468                  * For an active pool, we have to determine if it's really part
2469                  * of a currently active pool (in which case the pool will exist
2470                  * and the guid will be the same), or whether it's part of an
2471                  * active pool that was disconnected without being explicitly
2472                  * exported.
2473                  */
2474                 if (pool_active(hdl, name, guid, &isactive) != 0) {
2475                         nvlist_free(config);
2476                         return (-1);
2477                 }
2478
2479                 if (isactive) {
2480                         /*
2481                          * Because the device may have been removed while
2482                          * offlined, we only report it as active if the vdev is
2483                          * still present in the config.  Otherwise, pretend like
2484                          * it's not in use.
2485                          */
2486                         if ((zhp = zpool_open_canfail(hdl, name)) != NULL &&
2487                             (pool_config = zpool_get_config(zhp, NULL))
2488                             != NULL) {
2489                                 nvlist_t *nvroot;
2490
2491                                 verify(nvlist_lookup_nvlist(pool_config,
2492                                     ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2493                                 ret = find_guid(nvroot, vdev_guid);
2494                         } else {
2495                                 ret = B_FALSE;
2496                         }
2497
2498                         /*
2499                          * If this is an active spare within another pool, we
2500                          * treat it like an unused hot spare.  This allows the
2501                          * user to create a pool with a hot spare that is
2502                          * currently in use within another pool.  Since we return
2503                          * B_TRUE, libdiskmgt will continue to prevent generic
2504                          * consumers from using the device.
2505                          */
2506                         if (ret && nvlist_lookup_uint64(config,
2507                             ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
2508                                 stateval = POOL_STATE_SPARE;
2509
2510                         if (zhp != NULL)
2511                                 zpool_close(zhp);
2512                 } else {
2513                         stateval = POOL_STATE_POTENTIALLY_ACTIVE;
2514                         ret = B_TRUE;
2515                 }
2516                 break;
2517
2518         case POOL_STATE_SPARE:
2519                 /*
2520                  * For a hot spare, it can be either definitively in use, or
2521                  * potentially active.  To determine if it's in use, we iterate
2522                  * over all pools in the system and search for one with a spare
2523                  * with a matching guid.
2524                  *
2525                  * Due to the shared nature of spares, we don't actually report
2526                  * the potentially active case as in use.  This means the user
2527                  * can freely create pools on the hot spares of exported pools,
2528                  * but to do otherwise makes the resulting code complicated, and
2529                  * we end up having to deal with this case anyway.
2530                  */
2531                 cb.cb_zhp = NULL;
2532                 cb.cb_guid = vdev_guid;
2533                 cb.cb_type = ZPOOL_CONFIG_SPARES;
2534                 if (zpool_iter(hdl, find_aux, &cb) == 1) {
2535                         name = (char *)zpool_get_name(cb.cb_zhp);
2536                         ret = B_TRUE;
2537                 } else {
2538                         ret = B_FALSE;
2539                 }
2540                 break;
2541
2542         case POOL_STATE_L2CACHE:
2543
2544                 /*
2545                  * Check if any pool is currently using this l2cache device.
2546                  */
2547                 cb.cb_zhp = NULL;
2548                 cb.cb_guid = vdev_guid;
2549                 cb.cb_type = ZPOOL_CONFIG_L2CACHE;
2550                 if (zpool_iter(hdl, find_aux, &cb) == 1) {
2551                         name = (char *)zpool_get_name(cb.cb_zhp);
2552                         ret = B_TRUE;
2553                 } else {
2554                         ret = B_FALSE;
2555                 }
2556                 break;
2557
2558         default:
2559                 ret = B_FALSE;
2560         }
2561
2562
2563         if (ret) {
2564                 if ((*namestr = zfs_strdup(hdl, name)) == NULL) {
2565                         if (cb.cb_zhp)
2566                                 zpool_close(cb.cb_zhp);
2567                         nvlist_free(config);
2568                         return (-1);
2569                 }
2570                 *state = (pool_state_t)stateval;
2571         }
2572
2573         if (cb.cb_zhp)
2574                 zpool_close(cb.cb_zhp);
2575
2576         nvlist_free(config);
2577         *inuse = ret;
2578         return (0);
2579 }
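
/*
 * Illustrative usage sketch: check whether a device is already part of a
 * pool before repurposing it.  The device path is a hypothetical example.
 *
 *      pool_state_t state;
 *      char *name = NULL;
 *      boolean_t inuse = B_FALSE;
 *      int fd = open("/dev/sdb1", O_RDONLY);
 *
 *      if (fd >= 0 && zpool_in_use(hdl, fd, &state, &name, &inuse) == 0 &&
 *          inuse) {
 *              (void) printf("in use by pool '%s' (state %d)\n", name, state);
 *              free(name);
 *      }
 *      if (fd >= 0)
 *              (void) close(fd);
 */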