1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012 by Delphix. All rights reserved.
25  * Copyright 2015 RackTop Systems.
26  */
27
28 /*
29  * Pool import support functions.
30  *
31  * To import a pool, we rely on reading the configuration information from the
32  * ZFS label of each device.  If we successfully read the label, then we
33  * organize the configuration information in the following hierarchy:
34  *
35  *      pool guid -> toplevel vdev guid -> label txg
36  *
37  * Duplicate entries matching this same tuple will be discarded.  Once we have
38  * examined every device, we pick the best label txg config for each toplevel
39  * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
40  * update any paths that have changed.  Finally, we attempt to import the pool
41  * using our derived config, and record the results.
42  */
43
44 #include <ctype.h>
45 #include <devid.h>
46 #include <dirent.h>
47 #include <errno.h>
48 #include <libintl.h>
49 #include <stddef.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <sys/stat.h>
53 #include <unistd.h>
54 #include <fcntl.h>
55 #include <sys/vtoc.h>
56 #include <sys/dktp/fdisk.h>
57 #include <sys/efi_partition.h>
58
59 #include <sys/vdev_impl.h>
60 #ifdef HAVE_LIBBLKID
61 #include <blkid/blkid.h>
62 #endif
63
64 #include "libzfs.h"
65 #include "libzfs_impl.h"
66
67 /*
68  * Intermediate structures used to gather configuration information.
69  */
70 typedef struct config_entry {
71         uint64_t                ce_txg;
72         nvlist_t                *ce_config;
73         struct config_entry     *ce_next;
74 } config_entry_t;
75
76 typedef struct vdev_entry {
77         uint64_t                ve_guid;
78         config_entry_t          *ve_configs;
79         struct vdev_entry       *ve_next;
80 } vdev_entry_t;
81
82 typedef struct pool_entry {
83         uint64_t                pe_guid;
84         vdev_entry_t            *pe_vdevs;
85         struct pool_entry       *pe_next;
86 } pool_entry_t;
87
88 typedef struct name_entry {
89         char                    *ne_name;
90         uint64_t                ne_guid;
91         uint64_t                ne_order;
92         uint64_t                ne_num_labels;
93         struct name_entry       *ne_next;
94 } name_entry_t;
95
96 typedef struct pool_list {
97         pool_entry_t            *pools;
98         name_entry_t            *names;
99 } pool_list_t;
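
/*
 * Editor's note: the sketch below is illustrative and not part of the
 * original source.  It shows how the lists above nest, following the
 * pool guid -> toplevel vdev guid -> label txg hierarchy described at the
 * top of this file; the helper name count_configs() is hypothetical.
 */
#if 0
static int
count_configs(pool_list_t *pl)
{
        pool_entry_t *pe;
        vdev_entry_t *ve;
        config_entry_t *ce;
        int n = 0;

        for (pe = pl->pools; pe != NULL; pe = pe->pe_next)
                for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next)
                        for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next)
                                n++;
        return (n);
}
#endif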
100
101 static char *
102 get_devid(const char *path)
103 {
104         int fd;
105         ddi_devid_t devid;
106         char *minor, *ret;
107
108         if ((fd = open(path, O_RDONLY)) < 0)
109                 return (NULL);
110
111         minor = NULL;
112         ret = NULL;
113         if (devid_get(fd, &devid) == 0) {
114                 if (devid_get_minor_name(fd, &minor) == 0)
115                         ret = devid_str_encode(devid, minor);
116                 if (minor != NULL)
117                         devid_str_free(minor);
118                 devid_free(devid);
119         }
120         (void) close(fd);
121
122         return (ret);
123 }
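
/*
 * Editor's note: illustrative use of get_devid(), not part of the original
 * source.  The device path and function name below are hypothetical; the
 * returned string comes from devid_str_encode() and must be released with
 * devid_str_free().
 */
#if 0
static void
get_devid_example(void)
{
        char *devid = get_devid("/dev/dsk/c0t0d0s0");   /* hypothetical path */

        if (devid != NULL) {
                /* ... use the encoded devid string ... */
                devid_str_free(devid);
        }
}
#endif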
124
125
126 /*
127  * Go through and fix up any path and/or devid information for the given vdev
128  * configuration.
129  */
130 static int
131 fix_paths(nvlist_t *nv, name_entry_t *names)
132 {
133         nvlist_t **child;
134         uint_t c, children;
135         uint64_t guid;
136         name_entry_t *ne, *best;
137         char *path, *devid;
138
139         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
140             &child, &children) == 0) {
141                 for (c = 0; c < children; c++)
142                         if (fix_paths(child[c], names) != 0)
143                                 return (-1);
144                 return (0);
145         }
146
147         /*
148          * This is a leaf (file or disk) vdev.  In either case, go through
149          * the name list and see if we find a matching guid.  If so, replace
150          * the path and see if we can calculate a new devid.
151          *
152          * There may be multiple names associated with a particular guid, in
153          * which case we have overlapping partitions or multiple paths to the
154          * same disk.  In this case we prefer to use the path name which
155          * matches the ZPOOL_CONFIG_PATH.  If no matching entry is found we
156          * use the lowest order device which corresponds to the first match
157          * while traversing the ZPOOL_IMPORT_PATH search path.
158          */
159         verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
160         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
161                 path = NULL;
162
163         best = NULL;
164         for (ne = names; ne != NULL; ne = ne->ne_next) {
165                 if (ne->ne_guid == guid) {
166
167                         if (path == NULL) {
168                                 best = ne;
169                                 break;
170                         }
171
172                         if ((strlen(path) == strlen(ne->ne_name)) &&
173                             strncmp(path, ne->ne_name, strlen(path)) == 0) {
174                                 best = ne;
175                                 break;
176                         }
177
178                         if (best == NULL) {
179                                 best = ne;
180                                 continue;
181                         }
182
183                         /* Prefer paths with more vdev labels. */
184                         if (ne->ne_num_labels > best->ne_num_labels) {
185                                 best = ne;
186                                 continue;
187                         }
188
189                         /* Prefer paths earlier in the search order. */
190                             if (ne->ne_num_labels == best->ne_num_labels &&
191                             ne->ne_order < best->ne_order) {
192                                 best = ne;
193                                 continue;
194                         }
195                 }
196         }
197
198         if (best == NULL)
199                 return (0);
200
201         if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
202                 return (-1);
203
204         if ((devid = get_devid(best->ne_name)) == NULL) {
205                 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
206         } else {
207                 if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) {
208                         devid_str_free(devid);
209                         return (-1);
210                 }
211                 devid_str_free(devid);
212         }
213
214         return (0);
215 }
216
217 /*
218  * Add the given configuration to the list of known devices.
219  */
220 static int
221 add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
222     int order, int num_labels, nvlist_t *config)
223 {
224         uint64_t pool_guid, vdev_guid, top_guid, txg, state;
225         pool_entry_t *pe;
226         vdev_entry_t *ve;
227         config_entry_t *ce;
228         name_entry_t *ne;
229
230         /*
231          * If this is a hot spare or level 2 cache device that is not
232          * currently in use, add it to the list of names to translate, but
233          * don't do anything else.
234          */
235         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
236             &state) == 0 &&
237             (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
238             nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
239                 if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
240                         return (-1);
241
242                 if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
243                         free(ne);
244                         return (-1);
245                 }
246                 ne->ne_guid = vdev_guid;
247                 ne->ne_order = order;
248                 ne->ne_num_labels = num_labels;
249                 ne->ne_next = pl->names;
250                 pl->names = ne;
251                 return (0);
252         }
253
254         /*
255          * If we have a valid config but cannot read any of these fields, then
256          * it means we have a half-initialized label.  In vdev_label_init()
257          * we write a label with txg == 0 so that we can identify the device
258          * in case the user refers to the same disk later on.  If we fail to
259          * create the pool, we'll be left with a label in this state
260          * which should not be considered part of a valid pool.
261          */
262         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
263             &pool_guid) != 0 ||
264             nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
265             &vdev_guid) != 0 ||
266             nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
267             &top_guid) != 0 ||
268             nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
269             &txg) != 0 || txg == 0) {
270                 nvlist_free(config);
271                 return (0);
272         }
273
274         /*
275          * First, see if we know about this pool.  If not, then add it to the
276          * list of known pools.
277          */
278         for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
279                 if (pe->pe_guid == pool_guid)
280                         break;
281         }
282
283         if (pe == NULL) {
284                 if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
285                         nvlist_free(config);
286                         return (-1);
287                 }
288                 pe->pe_guid = pool_guid;
289                 pe->pe_next = pl->pools;
290                 pl->pools = pe;
291         }
292
293         /*
294          * Second, see if we know about this toplevel vdev.  Add it if it's
295          * missing.
296          */
297         for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
298                 if (ve->ve_guid == top_guid)
299                         break;
300         }
301
302         if (ve == NULL) {
303                 if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
304                         nvlist_free(config);
305                         return (-1);
306                 }
307                 ve->ve_guid = top_guid;
308                 ve->ve_next = pe->pe_vdevs;
309                 pe->pe_vdevs = ve;
310         }
311
312         /*
313          * Third, see if we have a config with a matching transaction group.  If
314          * so, then we do nothing.  Otherwise, add it to the list of known
315          * configs.
316          */
317         for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
318                 if (ce->ce_txg == txg)
319                         break;
320         }
321
322         if (ce == NULL) {
323                 if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
324                         nvlist_free(config);
325                         return (-1);
326                 }
327                 ce->ce_txg = txg;
328                 ce->ce_config = config;
329                 ce->ce_next = ve->ve_configs;
330                 ve->ve_configs = ce;
331         } else {
332                 nvlist_free(config);
333         }
334
335         /*
336          * At this point we've successfully added our config to the list of
337          * known configs.  The last thing to do is add the vdev guid -> path
338          * mappings so that we can fix up the configuration as necessary before
339          * doing the import.
340          */
341         if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
342                 return (-1);
343
344         if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
345                 free(ne);
346                 return (-1);
347         }
348
349         ne->ne_guid = vdev_guid;
350         ne->ne_order = order;
351         ne->ne_num_labels = num_labels;
352         ne->ne_next = pl->names;
353         pl->names = ne;
354
355         return (0);
356 }
357
358 /*
359  * Set *isactive if the named pool is active and matches the given GUID.
360  */
361 static int
362 pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid,
363     boolean_t *isactive)
364 {
365         zpool_handle_t *zhp;
366         uint64_t theguid;
367
368         if (zpool_open_silent(hdl, name, &zhp) != 0)
369                 return (-1);
370
371         if (zhp == NULL) {
372                 *isactive = B_FALSE;
373                 return (0);
374         }
375
376         verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID,
377             &theguid) == 0);
378
379         zpool_close(zhp);
380
381         *isactive = (theguid == guid);
382         return (0);
383 }
384
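/*
 * Ask the kernel to attempt a non-destructive "tryimport"
 * (ZFS_IOC_POOL_TRYIMPORT) of the given label-derived configuration and
 * return the refreshed config nvlist it reports, or NULL on failure.
 */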
385 static nvlist_t *
386 refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
387 {
388         nvlist_t *nvl;
389         zfs_cmd_t zc = {"\0"};
390         int err;
391
392         if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
393                 return (NULL);
394
395         if (zcmd_alloc_dst_nvlist(hdl, &zc,
396             zc.zc_nvlist_conf_size * 2) != 0) {
397                 zcmd_free_nvlists(&zc);
398                 return (NULL);
399         }
400
401         while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT,
402             &zc)) != 0 && errno == ENOMEM) {
403                 if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
404                         zcmd_free_nvlists(&zc);
405                         return (NULL);
406                 }
407         }
408
409         if (err) {
410                 zcmd_free_nvlists(&zc);
411                 return (NULL);
412         }
413
414         if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) {
415                 zcmd_free_nvlists(&zc);
416                 return (NULL);
417         }
418
419         zcmd_free_nvlists(&zc);
420         return (nvl);
421 }
422
423 /*
424  * Determine if the vdev id is a hole in the namespace.
425  */
426 boolean_t
427 vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
428 {
429         int c;
430
431         for (c = 0; c < holes; c++) {
432
433                 /* Top-level is a hole */
434                 if (hole_array[c] == id)
435                         return (B_TRUE);
436         }
437         return (B_FALSE);
438 }
439
440 /*
441  * Convert our list of pools into the definitive set of configurations.  We
442  * start by picking the best config for each toplevel vdev.  Once that's done,
443  * we assemble the toplevel vdevs into a full config for the pool.  We make a
444  * pass to fix up any incorrect paths, and then add it to the main list to
445  * return to the user.
446  */
447 static nvlist_t *
448 get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
449 {
450         pool_entry_t *pe;
451         vdev_entry_t *ve;
452         config_entry_t *ce;
453         nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
454         nvlist_t **spares, **l2cache;
455         uint_t i, nspares, nl2cache;
456         boolean_t config_seen;
457         uint64_t best_txg;
458         char *name, *hostname = NULL;
459         uint64_t guid;
460         uint_t children = 0;
461         nvlist_t **child = NULL;
462         uint_t holes;
463         uint64_t *hole_array, max_id;
464         uint_t c;
465         boolean_t isactive;
466         uint64_t hostid;
467         nvlist_t *nvl;
468         boolean_t valid_top_config = B_FALSE;
469
470         if (nvlist_alloc(&ret, 0, 0) != 0)
471                 goto nomem;
472
473         for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
474                 uint64_t id, max_txg = 0;
475
476                 if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
477                         goto nomem;
478                 config_seen = B_FALSE;
479
480                 /*
481                  * Iterate over all toplevel vdevs.  Grab the pool configuration
482                  * from the first one we find, and then go through the rest and
483                  * add them as necessary to the 'vdevs' member of the config.
484                  */
485                 for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
486
487                         /*
488                          * Determine the best configuration for this vdev by
489                          * selecting the config with the latest transaction
490                          * group.
491                          */
492                         best_txg = 0;
493                         for (ce = ve->ve_configs; ce != NULL;
494                             ce = ce->ce_next) {
495
496                                 if (ce->ce_txg > best_txg) {
497                                         tmp = ce->ce_config;
498                                         best_txg = ce->ce_txg;
499                                 }
500                         }
501
502                         /*
503                          * We rely on the fact that the max txg for the
504                          * pool will contain the most up-to-date information
505                          * about the valid top-levels in the vdev namespace.
506                          */
507                         if (best_txg > max_txg) {
508                                 (void) nvlist_remove(config,
509                                     ZPOOL_CONFIG_VDEV_CHILDREN,
510                                     DATA_TYPE_UINT64);
511                                 (void) nvlist_remove(config,
512                                     ZPOOL_CONFIG_HOLE_ARRAY,
513                                     DATA_TYPE_UINT64_ARRAY);
514
515                                 max_txg = best_txg;
516                                 hole_array = NULL;
517                                 holes = 0;
518                                 max_id = 0;
519                                 valid_top_config = B_FALSE;
520
521                                 if (nvlist_lookup_uint64(tmp,
522                                     ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
523                                         verify(nvlist_add_uint64(config,
524                                             ZPOOL_CONFIG_VDEV_CHILDREN,
525                                             max_id) == 0);
526                                         valid_top_config = B_TRUE;
527                                 }
528
529                                 if (nvlist_lookup_uint64_array(tmp,
530                                     ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
531                                     &holes) == 0) {
532                                         verify(nvlist_add_uint64_array(config,
533                                             ZPOOL_CONFIG_HOLE_ARRAY,
534                                             hole_array, holes) == 0);
535                                 }
536                         }
537
538                         if (!config_seen) {
539                                 /*
540                                  * Copy the relevant pieces of data to the pool
541                                  * configuration:
542                                  *
543                                  *      version
544                                  *      pool guid
545                                  *      name
546                                  *      comment (if available)
547                                  *      pool state
548                                  *      hostid (if available)
549                                  *      hostname (if available)
550                                  */
551                                 uint64_t state, version;
552                                 char *comment = NULL;
553
554                                 version = fnvlist_lookup_uint64(tmp,
555                                     ZPOOL_CONFIG_VERSION);
556                                 fnvlist_add_uint64(config,
557                                     ZPOOL_CONFIG_VERSION, version);
558                                 guid = fnvlist_lookup_uint64(tmp,
559                                     ZPOOL_CONFIG_POOL_GUID);
560                                 fnvlist_add_uint64(config,
561                                     ZPOOL_CONFIG_POOL_GUID, guid);
562                                 name = fnvlist_lookup_string(tmp,
563                                     ZPOOL_CONFIG_POOL_NAME);
564                                 fnvlist_add_string(config,
565                                     ZPOOL_CONFIG_POOL_NAME, name);
566
567                                 if (nvlist_lookup_string(tmp,
568                                     ZPOOL_CONFIG_COMMENT, &comment) == 0)
569                                         fnvlist_add_string(config,
570                                             ZPOOL_CONFIG_COMMENT, comment);
571
572                                 state = fnvlist_lookup_uint64(tmp,
573                                     ZPOOL_CONFIG_POOL_STATE);
574                                 fnvlist_add_uint64(config,
575                                     ZPOOL_CONFIG_POOL_STATE, state);
576
577                                 hostid = 0;
578                                 if (nvlist_lookup_uint64(tmp,
579                                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
580                                         fnvlist_add_uint64(config,
581                                             ZPOOL_CONFIG_HOSTID, hostid);
582                                         hostname = fnvlist_lookup_string(tmp,
583                                             ZPOOL_CONFIG_HOSTNAME);
584                                         fnvlist_add_string(config,
585                                             ZPOOL_CONFIG_HOSTNAME, hostname);
586                                 }
587
588                                 config_seen = B_TRUE;
589                         }
590
591                         /*
592                          * Add this top-level vdev to the child array.
593                          */
594                         verify(nvlist_lookup_nvlist(tmp,
595                             ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
596                         verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
597                             &id) == 0);
598
599                         if (id >= children) {
600                                 nvlist_t **newchild;
601
602                                 newchild = zfs_alloc(hdl, (id + 1) *
603                                     sizeof (nvlist_t *));
604                                 if (newchild == NULL)
605                                         goto nomem;
606
607                                 for (c = 0; c < children; c++)
608                                         newchild[c] = child[c];
609
610                                 free(child);
611                                 child = newchild;
612                                 children = id + 1;
613                         }
614                         if (nvlist_dup(nvtop, &child[id], 0) != 0)
615                                 goto nomem;
616
617                 }
618
619                 /*
620                  * If we have information about all the top-levels then
621                  * clean up the nvlist which we've constructed. This
622                  * means removing any extraneous devices that are
623                  * beyond the valid range or adding devices to the end
624                  * of our array which appear to be missing.
625                  */
626                 if (valid_top_config) {
627                         if (max_id < children) {
628                                 for (c = max_id; c < children; c++)
629                                         nvlist_free(child[c]);
630                                 children = max_id;
631                         } else if (max_id > children) {
632                                 nvlist_t **newchild;
633
634                                 newchild = zfs_alloc(hdl, (max_id) *
635                                     sizeof (nvlist_t *));
636                                 if (newchild == NULL)
637                                         goto nomem;
638
639                                 for (c = 0; c < children; c++)
640                                         newchild[c] = child[c];
641
642                                 free(child);
643                                 child = newchild;
644                                 children = max_id;
645                         }
646                 }
647
648                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
649                     &guid) == 0);
650
651                 /*
652                  * The vdev namespace may contain holes as a result of
653                  * device removal. We must add them back into the vdev
654                  * tree before we process any missing devices.
655                  */
656                 if (holes > 0) {
657                         ASSERT(valid_top_config);
658
659                         for (c = 0; c < children; c++) {
660                                 nvlist_t *holey;
661
662                                 if (child[c] != NULL ||
663                                     !vdev_is_hole(hole_array, holes, c))
664                                         continue;
665
666                                 if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
667                                     0) != 0)
668                                         goto nomem;
669
670                                 /*
671                                  * Holes in the namespace are treated as
672                                  * "hole" top-level vdevs and have a
673                                  * special flag set on them.
674                                  */
675                                 if (nvlist_add_string(holey,
676                                     ZPOOL_CONFIG_TYPE,
677                                     VDEV_TYPE_HOLE) != 0 ||
678                                     nvlist_add_uint64(holey,
679                                     ZPOOL_CONFIG_ID, c) != 0 ||
680                                     nvlist_add_uint64(holey,
681                                     ZPOOL_CONFIG_GUID, 0ULL) != 0) {
682                                         nvlist_free(holey);
683                                         goto nomem;
684                                 }
685                                 child[c] = holey;
686                         }
687                 }
688
689                 /*
690                  * Look for any missing top-level vdevs.  If this is the case,
691                  * create a faked up 'missing' vdev as a placeholder.  We cannot
692                  * simply compress the child array, because the kernel performs
693                  * certain checks to make sure the vdev IDs match their location
694                  * in the configuration.
695                  */
696                 for (c = 0; c < children; c++) {
697                         if (child[c] == NULL) {
698                                 nvlist_t *missing;
699                                 if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
700                                     0) != 0)
701                                         goto nomem;
702                                 if (nvlist_add_string(missing,
703                                     ZPOOL_CONFIG_TYPE,
704                                     VDEV_TYPE_MISSING) != 0 ||
705                                     nvlist_add_uint64(missing,
706                                     ZPOOL_CONFIG_ID, c) != 0 ||
707                                     nvlist_add_uint64(missing,
708                                     ZPOOL_CONFIG_GUID, 0ULL) != 0) {
709                                         nvlist_free(missing);
710                                         goto nomem;
711                                 }
712                                 child[c] = missing;
713                         }
714                 }
715
716                 /*
717                  * Put all of this pool's top-level vdevs into a root vdev.
718                  */
719                 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
720                         goto nomem;
721                 if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
722                     VDEV_TYPE_ROOT) != 0 ||
723                     nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
724                     nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
725                     nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
726                     child, children) != 0) {
727                         nvlist_free(nvroot);
728                         goto nomem;
729                 }
730
731                 for (c = 0; c < children; c++)
732                         nvlist_free(child[c]);
733                 free(child);
734                 children = 0;
735                 child = NULL;
736
737                 /*
738                  * Go through and fix up any paths and/or devids based on our
739                  * known list of vdev GUID -> path mappings.
740                  */
741                 if (fix_paths(nvroot, pl->names) != 0) {
742                         nvlist_free(nvroot);
743                         goto nomem;
744                 }
745
746                 /*
747                  * Add the root vdev to this pool's configuration.
748                  */
749                 if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
750                     nvroot) != 0) {
751                         nvlist_free(nvroot);
752                         goto nomem;
753                 }
754                 nvlist_free(nvroot);
755
756                 /*
757                  * zdb uses this path to report on active pools that were
758                  * imported or created using -R.
759                  */
760                 if (active_ok)
761                         goto add_pool;
762
763                 /*
764                  * Determine if this pool is currently active, in which case we
765                  * can't actually import it.
766                  */
767                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
768                     &name) == 0);
769                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
770                     &guid) == 0);
771
772                 if (pool_active(hdl, name, guid, &isactive) != 0)
773                         goto error;
774
775                 if (isactive) {
776                         nvlist_free(config);
777                         config = NULL;
778                         continue;
779                 }
780
781                 if ((nvl = refresh_config(hdl, config)) == NULL) {
782                         nvlist_free(config);
783                         config = NULL;
784                         continue;
785                 }
786
787                 nvlist_free(config);
788                 config = nvl;
789
790                 /*
791                  * Go through and update the paths for spares, now that we have
792                  * them.
793                  */
794                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
795                     &nvroot) == 0);
796                 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
797                     &spares, &nspares) == 0) {
798                         for (i = 0; i < nspares; i++) {
799                                 if (fix_paths(spares[i], pl->names) != 0)
800                                         goto nomem;
801                         }
802                 }
803
804                 /*
805                  * Update the paths for l2cache devices.
806                  */
807                 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
808                     &l2cache, &nl2cache) == 0) {
809                         for (i = 0; i < nl2cache; i++) {
810                                 if (fix_paths(l2cache[i], pl->names) != 0)
811                                         goto nomem;
812                         }
813                 }
814
815                 /*
816                  * Restore the original information read from the actual label.
817                  */
818                 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
819                     DATA_TYPE_UINT64);
820                 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
821                     DATA_TYPE_STRING);
822                 if (hostid != 0) {
823                         verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
824                             hostid) == 0);
825                         verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
826                             hostname) == 0);
827                 }
828
829 add_pool:
830                 /*
831                  * Add this pool to the list of configs.
832                  */
833                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
834                     &name) == 0);
835                 if (nvlist_add_nvlist(ret, name, config) != 0)
836                         goto nomem;
837
838                 nvlist_free(config);
839                 config = NULL;
840         }
841
842         return (ret);
843
844 nomem:
845         (void) no_memory(hdl);
846 error:
847         nvlist_free(config);
848         nvlist_free(ret);
849         for (c = 0; c < children; c++)
850                 nvlist_free(child[c]);
851         free(child);
852
853         return (NULL);
854 }
855
856 /*
857  * Return the offset of the given label.
858  */
859 static uint64_t
860 label_offset(uint64_t size, int l)
861 {
862         ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
863         return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
864             0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
865 }
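
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * Labels 0 and 1 sit at the front of the device and labels 2 and 3 at the
 * end, so for an aligned size the offsets work out as asserted below
 * (assumes the standard VDEV_LABELS == 4 layout).
 */
#if 0
static void
label_offset_example(void)
{
        uint64_t size = 8 * sizeof (vdev_label_t);      /* hypothetical size */

        ASSERT(label_offset(size, 0) == 0);
        ASSERT(label_offset(size, 1) == sizeof (vdev_label_t));
        ASSERT(label_offset(size, 2) == size - 2 * sizeof (vdev_label_t));
        ASSERT(label_offset(size, 3) == size - sizeof (vdev_label_t));
}
#endif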
866
867 /*
868  * Given a file descriptor, read the label information and return an nvlist
869  * describing the configuration, if there is one.  The number of valid
870  * labels found will be returned in num_labels when non-NULL.
871  */
872 int
873 zpool_read_label(int fd, nvlist_t **config, int *num_labels)
874 {
875         struct stat64 statbuf;
876         int l, count = 0;
877         vdev_label_t *label;
878         nvlist_t *expected_config = NULL;
879         uint64_t expected_guid = 0, size;
880
881         *config = NULL;
882
883         if (fstat64_blk(fd, &statbuf) == -1)
884                 return (0);
885         size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
886
887         if ((label = malloc(sizeof (vdev_label_t))) == NULL)
888                 return (-1);
889
890         for (l = 0; l < VDEV_LABELS; l++) {
891                 uint64_t state, guid, txg;
892
893                 if (pread64(fd, label, sizeof (vdev_label_t),
894                     label_offset(size, l)) != sizeof (vdev_label_t))
895                         continue;
896
897                 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
898                     sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
899                         continue;
900
901                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
902                     &guid) != 0 || guid == 0) {
903                         nvlist_free(*config);
904                         continue;
905                 }
906
907                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
908                     &state) != 0 || state > POOL_STATE_L2CACHE) {
909                         nvlist_free(*config);
910                         continue;
911                 }
912
913                 if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
914                     (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
915                     &txg) != 0 || txg == 0)) {
916                         nvlist_free(*config);
917                         continue;
918                 }
919
920                 if (expected_guid) {
921                         if (expected_guid == guid)
922                                 count++;
923
924                         nvlist_free(*config);
925                 } else {
926                         expected_config = *config;
927                         expected_guid = guid;
928                         count++;
929                 }
930         }
931
932         if (num_labels != NULL)
933                 *num_labels = count;
934
935         free(label);
936         *config = expected_config;
937
938         return (0);
939 }
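
/*
 * Editor's note: illustrative use of zpool_read_label(), not part of the
 * original source.  The device path and function name are hypothetical; on
 * success the caller owns the returned config nvlist and must free it with
 * nvlist_free().
 */
#if 0
static void
zpool_read_label_example(void)
{
        nvlist_t *config = NULL;
        int fd, num_labels = 0;

        if ((fd = open64("/dev/sda1", O_RDONLY)) < 0)
                return;

        if (zpool_read_label(fd, &config, &num_labels) == 0 && config != NULL) {
                /* num_labels of the four labels were readable and valid */
                nvlist_free(config);
        }
        (void) close(fd);
}
#endif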
940
941 typedef struct rdsk_node {
942         char *rn_name;
943         int rn_num_labels;
944         int rn_dfd;
945         libzfs_handle_t *rn_hdl;
946         nvlist_t *rn_config;
947         avl_tree_t *rn_avl;
948         avl_node_t rn_node;
949         boolean_t rn_nozpool;
950 } rdsk_node_t;
951
952 static int
953 slice_cache_compare(const void *arg1, const void *arg2)
954 {
955         const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
956         const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
957         char *nm1slice, *nm2slice;
958         int rv;
959
960         /*
961          * partitions one and three (slices zero and two) are the most
962          * likely to provide results, so put those first
963          */
964         nm1slice = strstr(nm1, "part1");
965         nm2slice = strstr(nm2, "part1");
966         if (nm1slice && !nm2slice) {
967                 return (-1);
968         }
969         if (!nm1slice && nm2slice) {
970                 return (1);
971         }
972         nm1slice = strstr(nm1, "part3");
973         nm2slice = strstr(nm2, "part3");
974         if (nm1slice && !nm2slice) {
975                 return (-1);
976         }
977         if (!nm1slice && nm2slice) {
978                 return (1);
979         }
980
981         rv = strcmp(nm1, nm2);
982         if (rv == 0)
983                 return (0);
984         return (rv > 0 ? 1 : -1);
985 }
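
/*
 * Editor's note: with this ordering, names containing "part1" are probed
 * first, then those containing "part3", then everything else in plain
 * strcmp() order; e.g. a hypothetical "ata-DISK-part1" sorts ahead of
 * "ata-DISK-part2".
 */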
986
987 #ifndef __linux__
988 static void
989 check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
990     diskaddr_t size, uint_t blksz)
991 {
992         rdsk_node_t tmpnode;
993         rdsk_node_t *node;
994         char sname[MAXNAMELEN];
995
996         tmpnode.rn_name = &sname[0];
997         (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
998             diskname, partno);
999         /* too small to contain a zpool? */
1000         if ((size < (SPA_MINDEVSIZE / blksz)) &&
1001             (node = avl_find(r, &tmpnode, NULL)))
1002                 node->rn_nozpool = B_TRUE;
1003 }
1004 #endif
1005
1006 static void
1007 nozpool_all_slices(avl_tree_t *r, const char *sname)
1008 {
1009 #ifndef __linux__
1010         char diskname[MAXNAMELEN];
1011         char *ptr;
1012         int i;
1013
1014         (void) strncpy(diskname, sname, MAXNAMELEN);
1015         if (((ptr = strrchr(diskname, 's')) == NULL) &&
1016             ((ptr = strrchr(diskname, 'p')) == NULL))
1017                 return;
1018         ptr[0] = 's';
1019         ptr[1] = '\0';
1020         for (i = 0; i < NDKMAP; i++)
1021                 check_one_slice(r, diskname, i, 0, 1);
1022         ptr[0] = 'p';
1023         for (i = 0; i <= FD_NUMPART; i++)
1024                 check_one_slice(r, diskname, i, 0, 1);
1025 #endif
1026 }
1027
1028 static void
1029 check_slices(avl_tree_t *r, int fd, const char *sname)
1030 {
1031 #ifndef __linux__
1032         struct extvtoc vtoc;
1033         struct dk_gpt *gpt;
1034         char diskname[MAXNAMELEN];
1035         char *ptr;
1036         int i;
1037
1038         (void) strncpy(diskname, sname, MAXNAMELEN);
1039         if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
1040                 return;
1041         ptr[1] = '\0';
1042
1043         if (read_extvtoc(fd, &vtoc) >= 0) {
1044                 for (i = 0; i < NDKMAP; i++)
1045                         check_one_slice(r, diskname, i,
1046                             vtoc.v_part[i].p_size, vtoc.v_sectorsz);
1047         } else if (efi_alloc_and_read(fd, &gpt) >= 0) {
1048                 /*
1049                  * on x86 we'll still have leftover links that point
1050                  * to slices s[9-15], so use NDKMAP instead
1051                  */
1052                 for (i = 0; i < NDKMAP; i++)
1053                         check_one_slice(r, diskname, i,
1054                             gpt->efi_parts[i].p_size, gpt->efi_lbasize);
1055                 /* nodes p[1-4] are never used with EFI labels */
1056                 ptr[0] = 'p';
1057                 for (i = 1; i <= FD_NUMPART; i++)
1058                         check_one_slice(r, diskname, i, 0, 1);
1059                 efi_free(gpt);
1060         }
1061 #endif
1062 }
1063
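/*
 * Task queue callback: probe a single directory entry, read its vdev label
 * via zpool_read_label(), and stash the resulting config and label count in
 * the rdsk_node_t for the caller to collect.
 */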
1064 static void
1065 zpool_open_func(void *arg)
1066 {
1067         rdsk_node_t *rn = arg;
1068         struct stat64 statbuf;
1069         nvlist_t *config;
1070         int num_labels;
1071         int fd;
1072
1073         if (rn->rn_nozpool)
1074                 return;
1075 #ifdef __linux__
1076         /*
1077          * Skip devices with well-known prefixes; opening them can have
1078          * side effects that need to be avoided.
1079          *
1080          * core     - Symlink to /proc/kcore
1081          * fd*      - Floppy interface.
1082          * fuse     - Fuse control device.
1083          * hpet     - High Precision Event Timer
1084          * lp*      - Printer interface.
1085          * parport* - Parallel port interface.
1086          * ppp      - Generic PPP driver.
1087          * random   - Random device
1088          * rtc      - Real Time Clock
1089          * tty*     - Generic serial interface.
1090          * urandom  - Random device.
1091          * usbmon*  - USB IO monitor.
1092          * vcs*     - Virtual console memory.
1093          * watchdog - Watchdog must be closed in a special way.
1094          */
1095         if ((strncmp(rn->rn_name, "core", 4) == 0) ||
1096             (strncmp(rn->rn_name, "fd", 2) == 0) ||
1097             (strncmp(rn->rn_name, "fuse", 4) == 0) ||
1098             (strncmp(rn->rn_name, "hpet", 4) == 0) ||
1099             (strncmp(rn->rn_name, "lp", 2) == 0) ||
1100             (strncmp(rn->rn_name, "parport", 7) == 0) ||
1101             (strncmp(rn->rn_name, "ppp", 3) == 0) ||
1102             (strncmp(rn->rn_name, "random", 6) == 0) ||
1103             (strncmp(rn->rn_name, "rtc", 3) == 0) ||
1104             (strncmp(rn->rn_name, "tty", 3) == 0) ||
1105             (strncmp(rn->rn_name, "urandom", 7) == 0) ||
1106             (strncmp(rn->rn_name, "usbmon", 6) == 0) ||
1107             (strncmp(rn->rn_name, "vcs", 3) == 0) ||
1108             (strncmp(rn->rn_name, "watchdog", 8) == 0))
1109                 return;
1110
1111         /*
1112          * Ignore failed stats.  We only want regular files and block devices.
1113          */
1114         if (fstatat64(rn->rn_dfd, rn->rn_name, &statbuf, 0) != 0 ||
1115             (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
1116                 return;
1117
1118         if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
1119                 /* symlink to a device that's no longer there */
1120                 if (errno == ENOENT)
1121                         nozpool_all_slices(rn->rn_avl, rn->rn_name);
1122                 return;
1123         }
1124 #else
1125         if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
1126                 /* symlink to a device that's no longer there */
1127                 if (errno == ENOENT)
1128                         nozpool_all_slices(rn->rn_avl, rn->rn_name);
1129                 return;
1130         }
1131         /*
1132          * Ignore failed stats.  We only want regular
1133          * files, character devs and block devs.
1134          */
1135         if (fstat64(fd, &statbuf) != 0 ||
1136             (!S_ISREG(statbuf.st_mode) &&
1137             !S_ISCHR(statbuf.st_mode) &&
1138             !S_ISBLK(statbuf.st_mode))) {
1139                 (void) close(fd);
1140                 return;
1141         }
1142 #endif
1143         /* this file is too small to hold a zpool */
1144         if (S_ISREG(statbuf.st_mode) &&
1145             statbuf.st_size < SPA_MINDEVSIZE) {
1146                 (void) close(fd);
1147                 return;
1148         } else if (!S_ISREG(statbuf.st_mode)) {
1149                 /*
1150                  * Try to read the disk label first so we don't have to
1151                  * open a bunch of minor nodes that can't have a zpool.
1152                  */
1153                 check_slices(rn->rn_avl, fd, rn->rn_name);
1154         }
1155
1156         if ((zpool_read_label(fd, &config, &num_labels)) != 0) {
1157                 (void) close(fd);
1158                 (void) no_memory(rn->rn_hdl);
1159                 return;
1160         }
1161
1162         if (num_labels == 0) {
1163                 (void) close(fd);
1164                 nvlist_free(config);
1165                 return;
1166         }
1167
1168         (void) close(fd);
1169
1170         rn->rn_config = config;
1171         rn->rn_num_labels = num_labels;
1172 }
1173
1174 /*
1175  * Given a file descriptor, clear (zero) the label information.  This function
1176  * is used in the appliance stack as part of the ZFS sysevent module and
1177  * to implement the "zpool labelclear" command.
1178  */
1179 int
1180 zpool_clear_label(int fd)
1181 {
1182         struct stat64 statbuf;
1183         int l;
1184         vdev_label_t *label;
1185         uint64_t size;
1186
1187         if (fstat64_blk(fd, &statbuf) == -1)
1188                 return (0);
1189         size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
1190
1191         if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
1192                 return (-1);
1193
1194         for (l = 0; l < VDEV_LABELS; l++) {
1195                 if (pwrite64(fd, label, sizeof (vdev_label_t),
1196                     label_offset(size, l)) != sizeof (vdev_label_t)) {
1197                         free(label);
1198                         return (-1);
1199                 }
1200         }
1201
1202         free(label);
1203         return (0);
1204 }
1205
1206 #ifdef HAVE_LIBBLKID
1207 /*
1208  * Use libblkid to quickly search for zfs devices
1209  */
1210 static int
1211 zpool_find_import_blkid(libzfs_handle_t *hdl, pool_list_t *pools)
1212 {
1213         blkid_cache cache;
1214         blkid_dev_iterate iter;
1215         blkid_dev dev;
1216         const char *devname;
1217         nvlist_t *config;
1218         int fd, err, num_labels;
1219
1220         err = blkid_get_cache(&cache, NULL);
1221         if (err != 0) {
1222                 (void) zfs_error_fmt(hdl, EZFS_BADCACHE,
1223                     dgettext(TEXT_DOMAIN, "blkid_get_cache() %d"), err);
1224                 goto err_blkid1;
1225         }
1226
1227         err = blkid_probe_all(cache);
1228         if (err != 0) {
1229                 (void) zfs_error_fmt(hdl, EZFS_BADCACHE,
1230                     dgettext(TEXT_DOMAIN, "blkid_probe_all() %d"), err);
1231                 goto err_blkid2;
1232         }
1233
1234         iter = blkid_dev_iterate_begin(cache);
1235         if (iter == NULL) {
1236                 (void) zfs_error_fmt(hdl, EZFS_BADCACHE,
1237                     dgettext(TEXT_DOMAIN, "blkid_dev_iterate_begin()"));
1238                 goto err_blkid2;
1239         }
1240
1241         err = blkid_dev_set_search(iter, "TYPE", "zfs_member");
1242         if (err != 0) {
1243                 (void) zfs_error_fmt(hdl, EZFS_BADCACHE,
1244                     dgettext(TEXT_DOMAIN, "blkid_dev_set_search() %d"), err);
1245                 goto err_blkid3;
1246         }
1247
1248         while (blkid_dev_next(iter, &dev) == 0) {
1249                 devname = blkid_dev_devname(dev);
1250                 if ((fd = open64(devname, O_RDONLY)) < 0)
1251                         continue;
1252
1253                 err = zpool_read_label(fd, &config, &num_labels);
1254                 (void) close(fd);
1255
1256                 if (err != 0) {
1257                         (void) no_memory(hdl);
1258                         goto err_blkid3;
1259                 }
1260
1261                 if (config != NULL) {
1262                         err = add_config(hdl, pools, devname, 0,
1263                             num_labels, config);
1264                         if (err != 0)
1265                                 goto err_blkid3;
1266                 }
1267         }
1268
1269 err_blkid3:
1270         blkid_dev_iterate_end(iter);
1271 err_blkid2:
1272         blkid_put_cache(cache);
1273 err_blkid1:
1274         return (err);
1275 }
1276 #endif /* HAVE_LIBBLKID */
1277
1278 char *
1279 zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
1280         "/dev/disk/by-vdev",    /* Custom rules, use first if they exist */
1281         "/dev/mapper",          /* Use multipath devices before components */
1282         "/dev/disk/by-uuid",    /* Single unique entry and persistent */
1283         "/dev/disk/by-id",      /* May be multiple entries and persistent */
1284         "/dev/disk/by-path",    /* Encodes physical location and persistent */
1285         "/dev/disk/by-label",   /* Custom persistent labels */
1286         "/dev"                  /* UNSAFE device names will change */
1287 };
1288
1289 /*
1290  * Given a list of directories to search, find all pools stored on disk.  This
1291  * includes partial pools which are not available to import.  If no search
1292  * paths are given, libblkid or the default import paths above are scanned.
1293  * poolname or guid (but not both) are provided by the caller when trying
1294  * to import a specific pool.
1295  */
1296 static nvlist_t *
1297 zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
1298 {
1299         int i, dirs = iarg->paths;
1300         struct dirent64 *dp;
1301         char path[MAXPATHLEN];
1302         char *end, **dir = iarg->path;
1303         size_t pathleft;
1304         nvlist_t *ret = NULL;
1305         pool_list_t pools = { 0 };
1306         pool_entry_t *pe, *penext;
1307         vdev_entry_t *ve, *venext;
1308         config_entry_t *ce, *cenext;
1309         name_entry_t *ne, *nenext;
1310         avl_tree_t slice_cache;
1311         rdsk_node_t *slice;
1312         void *cookie;
1313
1314         verify(iarg->poolname == NULL || iarg->guid == 0);
1315
1316         if (dirs == 0) {
1317 #ifdef HAVE_LIBBLKID
1318                 /* Use libblkid to scan all devices for their type */
1319                 if (zpool_find_import_blkid(hdl, &pools) == 0)
1320                         goto skip_scanning;
1321
1322                 (void) zfs_error_fmt(hdl, EZFS_BADCACHE,
1323                     dgettext(TEXT_DOMAIN, "blkid failure falling back "
1324                     "to manual probing"));
1325 #endif /* HAVE_LIBBLKID */
1326
1327                 dir = zpool_default_import_path;
1328                 dirs = DEFAULT_IMPORT_PATH_SIZE;
1329         }
1330
1331         /*
1332          * Go through and read the label configuration information from every
1333          * possible device, organizing the information according to pool GUID
1334          * and toplevel GUID.
1335          */
1336         for (i = 0; i < dirs; i++) {
1337                 taskq_t *t;
1338                 char *rdsk;
1339                 int dfd;
1340                 boolean_t config_failed = B_FALSE;
1341                 DIR *dirp;
1342
1343                 /* use realpath to normalize the path */
1344                 if (realpath(dir[i], path) == 0) {
1345
1346                         /* it is safe to skip missing search paths */
1347                         if (errno == ENOENT)
1348                                 continue;
1349
1350                         zfs_error_aux(hdl, strerror(errno));
1351                         (void) zfs_error_fmt(hdl, EZFS_BADPATH,
1352                             dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
1353                         goto error;
1354                 }
1355                 end = &path[strlen(path)];
1356                 *end++ = '/';
1357                 *end = 0;
1358                 pathleft = &path[sizeof (path)] - end;
1359
1360                 /*
1361                  * Using raw devices instead of block devices when we're
1362                  * reading the labels skips a bunch of slow operations during
1363                  * close(2) processing, so we replace /dev/dsk with /dev/rdsk.
1364                  */
1365                 if (strcmp(path, "/dev/dsk/") == 0)
1366                         rdsk = "/dev/rdsk/";
1367                 else
1368                         rdsk = path;
1369
1370                 if ((dfd = open64(rdsk, O_RDONLY)) < 0 ||
1371                     (dirp = fdopendir(dfd)) == NULL) {
1372                         if (dfd >= 0)
1373                                 (void) close(dfd);
1374                         zfs_error_aux(hdl, strerror(errno));
1375                         (void) zfs_error_fmt(hdl, EZFS_BADPATH,
1376                             dgettext(TEXT_DOMAIN, "cannot open '%s'"),
1377                             rdsk);
1378                         goto error;
1379                 }
1380
1381                 avl_create(&slice_cache, slice_cache_compare,
1382                     sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
1383
1384                 /*
1385                  * This is not MT-safe, but we have no MT consumers of libzfs
1386                  */
1387                 while ((dp = readdir64(dirp)) != NULL) {
1388                         const char *name = dp->d_name;
1389                         if (name[0] == '.' &&
1390                             (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
1391                                 continue;
1392
1393                         slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1394                         slice->rn_name = zfs_strdup(hdl, name);
1395                         slice->rn_avl = &slice_cache;
1396                         slice->rn_dfd = dfd;
1397                         slice->rn_hdl = hdl;
1398                         slice->rn_nozpool = B_FALSE;
1399                         avl_add(&slice_cache, slice);
1400                 }
1401                 /*
1402                  * create a thread pool to do all of this in parallel;
1403                  * rn_nozpool is not protected, so this is racy in that
1404                  * multiple tasks could decide that the same slice can
1405                  * not hold a zpool, which is benign.  Also choose
1406                  * double the number of processors; we hold a lot of
1407                  * locks in the kernel, so going beyond this doesn't
1408                  * buy us much.
1409                  */
1410                 thread_init();
1411                 t = taskq_create("z_import", 2 * boot_ncpus, defclsyspri,
1412                     2 * boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
1413                 for (slice = avl_first(&slice_cache); slice;
1414                     (slice = avl_walk(&slice_cache, slice,
1415                     AVL_AFTER)))
1416                         (void) taskq_dispatch(t, zpool_open_func, slice,
1417                             TQ_SLEEP);
1418                 taskq_wait(t);
1419                 taskq_destroy(t);
1420                 thread_fini();
1421
1422                 cookie = NULL;
1423                 while ((slice = avl_destroy_nodes(&slice_cache,
1424                     &cookie)) != NULL) {
1425                         if (slice->rn_config != NULL && !config_failed) {
1426                                 nvlist_t *config = slice->rn_config;
1427                                 boolean_t matched = B_TRUE;
1428
1429                                 if (iarg->poolname != NULL) {
1430                                         char *pname;
1431
1432                                         matched = nvlist_lookup_string(config,
1433                                             ZPOOL_CONFIG_POOL_NAME,
1434                                             &pname) == 0 &&
1435                                             strcmp(iarg->poolname, pname) == 0;
1436                                 } else if (iarg->guid != 0) {
1437                                         uint64_t this_guid;
1438
1439                                         matched = nvlist_lookup_uint64(config,
1440                                             ZPOOL_CONFIG_POOL_GUID,
1441                                             &this_guid) == 0 &&
1442                                             iarg->guid == this_guid;
1443                                 }
1444                                 if (!matched) {
1445                                         nvlist_free(config);
1446                                 } else {
1447                                         /*
1448                                          * use the non-raw path for the config
1449                                          */
1450                                         (void) strlcpy(end, slice->rn_name,
1451                                             pathleft);
1452                                         if (add_config(hdl, &pools, path, i+1,
1453                                             slice->rn_num_labels, config) != 0)
1454                                                 config_failed = B_TRUE;
1455                                 }
1456                         }
1457                         free(slice->rn_name);
1458                         free(slice);
1459                 }
1460                 avl_destroy(&slice_cache);
1461
1462                 (void) closedir(dirp);
1463
1464                 if (config_failed)
1465                         goto error;
1466         }
1467
1468 #ifdef HAVE_LIBBLKID
1469 skip_scanning:
1470 #endif
1471         ret = get_configs(hdl, &pools, iarg->can_be_active);
1472
1473 error:
1474         for (pe = pools.pools; pe != NULL; pe = penext) {
1475                 penext = pe->pe_next;
1476                 for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
1477                         venext = ve->ve_next;
1478                         for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
1479                                 cenext = ce->ce_next;
1480                                 if (ce->ce_config)
1481                                         nvlist_free(ce->ce_config);
1482                                 free(ce);
1483                         }
1484                         free(ve);
1485                 }
1486                 free(pe);
1487         }
1488
1489         for (ne = pools.names; ne != NULL; ne = nenext) {
1490                 nenext = ne->ne_next;
1491                 free(ne->ne_name);
1492                 free(ne);
1493         }
1494
1495         return (ret);
1496 }
1497
1498 nvlist_t *
1499 zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
1500 {
1501         importargs_t iarg = { 0 };
1502
1503         iarg.paths = argc;
1504         iarg.path = argv;
1505
1506         return (zpool_find_import_impl(hdl, &iarg));
1507 }
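
/*
 * Illustrative sketch, not part of this file: a minimal caller of
 * zpool_find_import() that walks the returned nvlist and prints each
 * candidate pool name.  It assumes the usual libzfs_init()/libzfs_fini()
 * entry points plus <stdio.h>, and relies on argc == 0 meaning "search
 * the default device directory".
 */
static void
print_importable_pools(void)
{
        libzfs_handle_t *hdl;
        nvlist_t *pools, *config;
        nvpair_t *elem = NULL;
        char *name;

        if ((hdl = libzfs_init()) == NULL)
                return;

        /* Scan for exported pools; each nvpair is one pool config. */
        pools = zpool_find_import(hdl, 0, NULL);
        while (pools != NULL &&
            (elem = nvlist_next_nvpair(pools, elem)) != NULL) {
                config = fnvpair_value_nvlist(elem);
                name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
                (void) printf("importable pool: %s\n", name);
        }

        nvlist_free(pools);
        libzfs_fini(hdl);
}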
1508
1509 /*
1510  * Given a cache file, return the contents as a list of importable pools.
1511  * The poolname or guid (but not both) is provided by the caller when
1512  * trying to import a specific pool.
1513  */
1514 nvlist_t *
1515 zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
1516     char *poolname, uint64_t guid)
1517 {
1518         char *buf;
1519         int fd;
1520         struct stat64 statbuf;
1521         nvlist_t *raw, *src, *dst;
1522         nvlist_t *pools;
1523         nvpair_t *elem;
1524         char *name;
1525         uint64_t this_guid;
1526         boolean_t active;
1527
1528         verify(poolname == NULL || guid == 0);
1529
1530         if ((fd = open(cachefile, O_RDONLY)) < 0) {
1531                 zfs_error_aux(hdl, "%s", strerror(errno));
1532                 (void) zfs_error(hdl, EZFS_BADCACHE,
1533                     dgettext(TEXT_DOMAIN, "failed to open cache file"));
1534                 return (NULL);
1535         }
1536
1537         if (fstat64(fd, &statbuf) != 0) {
1538                 zfs_error_aux(hdl, "%s", strerror(errno));
1539                 (void) close(fd);
1540                 (void) zfs_error(hdl, EZFS_BADCACHE,
1541                     dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
1542                 return (NULL);
1543         }
1544
1545         if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
1546                 (void) close(fd);
1547                 return (NULL);
1548         }
1549
1550         if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
1551                 (void) close(fd);
1552                 free(buf);
1553                 (void) zfs_error(hdl, EZFS_BADCACHE,
1554                     dgettext(TEXT_DOMAIN,
1555                     "failed to read cache file contents"));
1556                 return (NULL);
1557         }
1558
1559         (void) close(fd);
1560
1561         if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
1562                 free(buf);
1563                 (void) zfs_error(hdl, EZFS_BADCACHE,
1564                     dgettext(TEXT_DOMAIN,
1565                     "invalid or corrupt cache file contents"));
1566                 return (NULL);
1567         }
1568
1569         free(buf);
1570
1571         /*
1572          * Go through the cached configs, refreshing the state of each pool
1573          * that is not already active.
1574          */
1575         if (nvlist_alloc(&pools, 0, 0) != 0) {
1576                 (void) no_memory(hdl);
1577                 nvlist_free(raw);
1578                 return (NULL);
1579         }
1580
1581         elem = NULL;
1582         while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
1583                 src = fnvpair_value_nvlist(elem);
1584
1585                 name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
1586                 if (poolname != NULL && strcmp(poolname, name) != 0)
1587                         continue;
1588
1589                 this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
1590                 if (guid != 0 && guid != this_guid)
1591                         continue;
1592
1593                 if (pool_active(hdl, name, this_guid, &active) != 0) {
1594                         nvlist_free(raw);
1595                         nvlist_free(pools);
1596                         return (NULL);
1597                 }
1598
1599                 if (active)
1600                         continue;
1601
1602                 if ((dst = refresh_config(hdl, src)) == NULL) {
1603                         nvlist_free(raw);
1604                         nvlist_free(pools);
1605                         return (NULL);
1606                 }
1607
1608                 if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
1609                         (void) no_memory(hdl);
1610                         nvlist_free(dst);
1611                         nvlist_free(raw);
1612                         nvlist_free(pools);
1613                         return (NULL);
1614                 }
1615                 nvlist_free(dst);
1616         }
1617
1618         nvlist_free(raw);
1619         return (pools);
1620 }
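
/*
 * Illustrative sketch, not part of this file: enumerating the pools
 * recorded in a cache file.  The "/etc/zfs/zpool.cache" path is an
 * assumption for the example; the actual location is platform policy.
 */
static int
count_cached_pools(libzfs_handle_t *hdl)
{
        nvlist_t *pools;
        nvpair_t *elem = NULL;
        int n = 0;

        /* NULL poolname and guid == 0 mean "return every cached pool". */
        pools = zpool_find_import_cached(hdl, "/etc/zfs/zpool.cache",
            NULL, 0);
        if (pools == NULL)
                return (-1);

        while ((elem = nvlist_next_nvpair(pools, elem)) != NULL)
                n++;

        nvlist_free(pools);
        return (n);
}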
1621
1622 static int
1623 name_or_guid_exists(zpool_handle_t *zhp, void *data)
1624 {
1625         importargs_t *import = data;
1626         int found = 0;
1627
1628         if (import->poolname != NULL) {
1629                 char *pool_name;
1630
1631                 verify(nvlist_lookup_string(zhp->zpool_config,
1632                     ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
1633                 if (strcmp(pool_name, import->poolname) == 0)
1634                         found = 1;
1635         } else {
1636                 uint64_t pool_guid;
1637
1638                 verify(nvlist_lookup_uint64(zhp->zpool_config,
1639                     ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
1640                 if (pool_guid == import->guid)
1641                         found = 1;
1642         }
1643
1644         zpool_close(zhp);
1645         return (found);
1646 }
1647
1648 nvlist_t *
1649 zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
1650 {
1651         verify(import->poolname == NULL || import->guid == 0);
1652
1653         if (import->unique)
1654                 import->exists = zpool_iter(hdl, name_or_guid_exists, import);
1655
1656         if (import->cachefile != NULL)
1657                 return (zpool_find_import_cached(hdl, import->cachefile,
1658                     import->poolname, import->guid));
1659
1660         return (zpool_find_import_impl(hdl, import));
1661 }
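
/*
 * Illustrative sketch, not part of this file: how a caller such as
 * "zpool import <poolname>" might fill in importargs_t before handing
 * it to zpool_search_import().  Only fields used in this file
 * (poolname, unique/exists) are set; everything else stays zeroed, so
 * the device directories are scanned rather than a cache file.
 */
static nvlist_t *
search_for_pool_by_name(libzfs_handle_t *hdl, char *poolname)
{
        importargs_t args = { 0 };

        args.poolname = poolname;       /* match by name, so guid stays 0 */
        args.unique = B_TRUE;           /* sets args.exists if already imported */

        return (zpool_search_import(hdl, &args));
}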
1662
1663 boolean_t
1664 find_guid(nvlist_t *nv, uint64_t guid)
1665 {
1666         uint64_t tmp;
1667         nvlist_t **child;
1668         uint_t c, children;
1669
1670         verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
1671         if (tmp == guid)
1672                 return (B_TRUE);
1673
1674         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1675             &child, &children) == 0) {
1676                 for (c = 0; c < children; c++)
1677                         if (find_guid(child[c], guid))
1678                                 return (B_TRUE);
1679         }
1680
1681         return (B_FALSE);
1682 }
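
/*
 * Illustrative sketch, not part of this file: find_guid() is normally
 * handed the ZPOOL_CONFIG_VDEV_TREE nvlist of a pool config, as
 * zpool_in_use() does below.  This hypothetical helper wraps that
 * lookup for an already-open pool handle.
 */
static boolean_t
pool_contains_vdev(zpool_handle_t *zhp, uint64_t vdev_guid)
{
        nvlist_t *config = zpool_get_config(zhp, NULL);
        nvlist_t *nvroot;

        verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
        return (find_guid(nvroot, vdev_guid));
}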
1683
1684 typedef struct aux_cbdata {
1685         const char      *cb_type;
1686         uint64_t        cb_guid;
1687         zpool_handle_t  *cb_zhp;
1688 } aux_cbdata_t;
1689
1690 static int
1691 find_aux(zpool_handle_t *zhp, void *data)
1692 {
1693         aux_cbdata_t *cbp = data;
1694         nvlist_t **list;
1695         uint_t i, count;
1696         uint64_t guid;
1697         nvlist_t *nvroot;
1698
1699         verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
1700             &nvroot) == 0);
1701
1702         if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
1703             &list, &count) == 0) {
1704                 for (i = 0; i < count; i++) {
1705                         verify(nvlist_lookup_uint64(list[i],
1706                             ZPOOL_CONFIG_GUID, &guid) == 0);
1707                         if (guid == cbp->cb_guid) {
1708                                 cbp->cb_zhp = zhp;
1709                                 return (1);
1710                         }
1711                 }
1712         }
1713
1714         zpool_close(zhp);
1715         return (0);
1716 }
1717
1718 /*
1719  * Determines if the device is in use by a pool.  If so, *inuse is set to
1720  * B_TRUE and the pool's state and name are returned.  The name string is
1721  * allocated and must be freed by the caller.
1722  */
1723 int
1724 zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
1725     boolean_t *inuse)
1726 {
1727         nvlist_t *config;
1728         char *name;
1729         boolean_t ret;
1730         uint64_t guid, vdev_guid;
1731         zpool_handle_t *zhp;
1732         nvlist_t *pool_config;
1733         uint64_t stateval, isspare;
1734         aux_cbdata_t cb = { 0 };
1735         boolean_t isactive;
1736
1737         *inuse = B_FALSE;
1738
1739         if (zpool_read_label(fd, &config, NULL) != 0) {
1740                 (void) no_memory(hdl);
1741                 return (-1);
1742         }
1743
1744         if (config == NULL)
1745                 return (0);
1746
1747         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
1748             &stateval) == 0);
1749         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
1750             &vdev_guid) == 0);
1751
1752         if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
1753                 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1754                     &name) == 0);
1755                 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1756                     &guid) == 0);
1757         }
1758
1759         switch (stateval) {
1760         case POOL_STATE_EXPORTED:
1761                 /*
1762                  * A pool with an exported state may in fact be imported
1763                  * read-only, so check the in-core state to see if it's
1764                  * active and imported read-only.  If it is, set
1765                  * its state to active.
1766                  */
1767                 if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
1768                     (zhp = zpool_open_canfail(hdl, name)) != NULL) {
1769                         if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
1770                                 stateval = POOL_STATE_ACTIVE;
1771
1772                         /*
1773                          * All we needed the zpool handle for is the
1774                          * readonly prop check.
1775                          */
1776                         zpool_close(zhp);
1777                 }
1778
1779                 ret = B_TRUE;
1780                 break;
1781
1782         case POOL_STATE_ACTIVE:
1783                 /*
1784                  * For an active pool, we have to determine if it's really part
1785                  * of a currently active pool (in which case the pool will exist
1786                  * and the guid will be the same), or whether it's part of an
1787                  * active pool that was disconnected without being explicitly
1788                  * exported.
1789                  */
1790                 if (pool_active(hdl, name, guid, &isactive) != 0) {
1791                         nvlist_free(config);
1792                         return (-1);
1793                 }
1794
1795                 if (isactive) {
1796                         /*
1797                          * Because the device may have been removed while
1798                          * offlined, we only report it as active if the vdev is
1799                          * still present in the config.  Otherwise, pretend like
1800                          * it's not in use.
1801                          */
1802                         if ((zhp = zpool_open_canfail(hdl, name)) != NULL &&
1803                             (pool_config = zpool_get_config(zhp, NULL))
1804                             != NULL) {
1805                                 nvlist_t *nvroot;
1806
1807                                 verify(nvlist_lookup_nvlist(pool_config,
1808                                     ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1809                                 ret = find_guid(nvroot, vdev_guid);
1810                         } else {
1811                                 ret = B_FALSE;
1812                         }
1813
1814                         /*
1815                          * If this is an active spare within another pool, we
1816                          * treat it like an unused hot spare.  This allows the user
1817                          * to create a pool with a hot spare that is currently in
1818                          * use within another pool.  Since we return B_TRUE,
1819                          * libdiskmgt will continue to prevent generic consumers
1820                          * from using the device.
1821                          */
1822                         if (ret && nvlist_lookup_uint64(config,
1823                             ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
1824                                 stateval = POOL_STATE_SPARE;
1825
1826                         if (zhp != NULL)
1827                                 zpool_close(zhp);
1828                 } else {
1829                         stateval = POOL_STATE_POTENTIALLY_ACTIVE;
1830                         ret = B_TRUE;
1831                 }
1832                 break;
1833
1834         case POOL_STATE_SPARE:
1835                 /*
1836                  * For a hot spare, it can be either definitively in use, or
1837                  * potentially active.  To determine if it's in use, we iterate
1838                  * over all pools in the system and search for one with a spare
1839                  * with a matching guid.
1840                  *
1841                  * Due to the shared nature of spares, we don't actually report
1842                  * the potentially active case as in use.  This means the user
1843                  * can freely create pools on the hot spares of exported pools,
1844                  * but to do otherwise makes the resulting code complicated, and
1845                  * we end up having to deal with this case anyway.
1846                  */
1847                 cb.cb_zhp = NULL;
1848                 cb.cb_guid = vdev_guid;
1849                 cb.cb_type = ZPOOL_CONFIG_SPARES;
1850                 if (zpool_iter(hdl, find_aux, &cb) == 1) {
1851                         name = (char *)zpool_get_name(cb.cb_zhp);
1852                         ret = B_TRUE;
1853                 } else {
1854                         ret = B_FALSE;
1855                 }
1856                 break;
1857
1858         case POOL_STATE_L2CACHE:
1859
1860                 /*
1861                  * Check if any pool is currently using this l2cache device.
1862                  */
1863                 cb.cb_zhp = NULL;
1864                 cb.cb_guid = vdev_guid;
1865                 cb.cb_type = ZPOOL_CONFIG_L2CACHE;
1866                 if (zpool_iter(hdl, find_aux, &cb) == 1) {
1867                         name = (char *)zpool_get_name(cb.cb_zhp);
1868                         ret = B_TRUE;
1869                 } else {
1870                         ret = B_FALSE;
1871                 }
1872                 break;
1873
1874         default:
1875                 ret = B_FALSE;
1876         }
1877
1878
1879         if (ret) {
1880                 if ((*namestr = zfs_strdup(hdl, name)) == NULL) {
1881                         if (cb.cb_zhp)
1882                                 zpool_close(cb.cb_zhp);
1883                         nvlist_free(config);
1884                         return (-1);
1885                 }
1886                 *state = (pool_state_t)stateval;
1887         }
1888
1889         if (cb.cb_zhp)
1890                 zpool_close(cb.cb_zhp);
1891
1892         nvlist_free(config);
1893         *inuse = ret;
1894         return (0);
1895 }
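
/*
 * Illustrative sketch, not part of this file: a hypothetical caller that
 * checks whether the device open on "fd" already belongs to a pool before
 * reusing it.  Only *namestr is allocated by zpool_in_use(), so only the
 * name needs to be freed.
 */
static int
device_is_safe_to_use(libzfs_handle_t *hdl, int fd)
{
        pool_state_t state;
        char *name = NULL;
        boolean_t inuse;

        if (zpool_in_use(hdl, fd, &state, &name, &inuse) != 0)
                return (-1);            /* internal error, e.g. out of memory */

        if (!inuse)
                return (1);             /* no pool claims this device */

        /* The device is part of pool "name", which is in state "state". */
        free(name);
        return (0);
}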