/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 */
/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device. If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded. Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed. Finally, we attempt to import the pool
 * using our derived config, and record the results.
 */
61 #include <sys/dktp/fdisk.h>
62 #include <sys/vdev_impl.h>
63 #include <sys/fs/zfs.h>
64 #include <sys/vdev_impl.h>
66 #include <thread_pool.h>
68 #include <libnvpair.h>
70 #include "zutil_import.h"
76 #include <blkid/blkid.h>
78 #define DEFAULT_IMPORT_PATH_SIZE 9
79 #define DEV_BYID_PATH "/dev/disk/by-id/"
/*
 * Return non-zero when 'dev' (a basename, e.g. "watchdog" or "watchdog0")
 * names a Linux watchdog device node.  Opening a watchdog node arms the
 * watchdog timer as a side effect, so these must never be probed for
 * ZFS labels.
 */
static int
is_watchdog_dev(char *dev)
{
	/* For 'watchdog' dev */
	if (strcmp(dev, "watchdog") == 0)
		return (1);

	/* For 'watchdog<digit><whatever>' (e.g. watchdog0, watchdog15) */
	if (strstr(dev, "watchdog") == dev && isdigit((unsigned char)dev[8]))
		return (1);

	return (0);
}
/*
 * NOTE(review): orphaned fragment — the enclosing function definition is
 * not visible in this view.  The visible statement issues the Linux
 * BLKFLSBUF ioctl, which flushes the kernel buffer cache for the open
 * block device fd; presumably a small flush helper — confirm against the
 * full source.
 */
98 return (ioctl(fd, BLKFLSBUF));
/*
 * Thread-pool worker: probe one candidate device node, described by an
 * rdsk_node_t, for a valid ZFS vdev label.  On success the parsed label
 * config and label count are stored back on the node, and follow-up
 * candidate entries may be queued for the path/devid names recorded in
 * the label itself.
 *
 * NOTE(review): this view of the file is missing interior lines
 * (declarations, braces, and cleanup/return paths are not all visible);
 * the comments below describe only what the visible statements show.
 */
102 zpool_open_func(void *arg)
104 rdsk_node_t *rn = arg;
105 libpc_handle_t *hdl = rn->rn_hdl;
106 struct stat64 statbuf;
108 char *bname, *dupname;
109 uint64_t vdev_guid = 0;
/*
 * Skip devices with well known prefixes; there can be side effects
 * when opening devices which need to be avoided.
 *
 * hpet     - High Precision Event Timer
 * watchdog - Watchdog must be closed in a special way.
 */
/* basename(3) may modify its argument, so operate on a private copy */
121 dupname = zutil_strdup(hdl, rn->rn_name);
122 bname = basename(dupname);
123 error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
/*
 * Ignore failed stats. We only want regular files and block devices.
 */
131 if (stat64(rn->rn_name, &statbuf) != 0 ||
132 (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
/*
 * Preferentially open using O_DIRECT to bypass the block device
 * cache which may be stale for multipath devices. An EINVAL errno
 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
 */
140 fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
141 if ((fd < 0) && (errno == EINVAL))
142 fd = open(rn->rn_name, O_RDONLY);
/* record EACCES so the caller can report a permissions problem */
143 if ((fd < 0) && (errno == EACCES))
144 hdl->lpc_open_access_error = B_TRUE;
/*
 * This file is too small to hold a zpool.
 */
151 if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
/* read and unpack the vdev labels (best effort) */
156 error = zpool_read_label(fd, &config, &num_labels);
162 if (num_labels == 0) {
/*
 * Check that the vdev is for the expected guid. Additional entries
 * are speculatively added based on the paths stored in the labels.
 * Entries with valid paths but incorrect guids must be removed.
 */
173 error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
174 if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
/* success: publish the label config on this node */
182 rn->rn_config = config;
183 rn->rn_num_labels = num_labels;
/*
 * Add additional entries for paths described by this label.
 */
188 if (rn->rn_labelpaths) {
197 if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
/* udev settle timeout is tunable via the environment */
200 env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS");
201 if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 ||
203 timeout = DISK_LABEL_WAIT;
/*
 * Allow devlinks to stabilize so all paths are available.
 */
209 zpool_label_disk_wait(rn->rn_name, timeout);
/* queue a follow-up probe for the path stored in the label */
212 slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
213 slice->rn_name = zutil_strdup(hdl, path);
214 slice->rn_vdev_guid = vdev_guid;
215 slice->rn_avl = rn->rn_avl;
217 slice->rn_order = IMPORT_ORDER_PREFERRED_1;
218 slice->rn_labelpaths = B_FALSE;
/* de-dup against the shared AVL cache under the tree lock */
219 pthread_mutex_lock(rn->rn_lock);
220 if (avl_find(rn->rn_avl, slice, &where)) {
221 pthread_mutex_unlock(rn->rn_lock);
222 free(slice->rn_name);
225 avl_insert(rn->rn_avl, slice, where);
226 pthread_mutex_unlock(rn->rn_lock);
/* probe the new entry; rn_labelpaths is B_FALSE so this cannot recurse */
227 zpool_open_func(slice);
/* likewise queue a probe for the /dev/disk/by-id/<devid> name */
232 slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
233 error = asprintf(&slice->rn_name, "%s%s",
234 DEV_BYID_PATH, devid);
240 slice->rn_vdev_guid = vdev_guid;
241 slice->rn_avl = rn->rn_avl;
243 slice->rn_order = IMPORT_ORDER_PREFERRED_2;
244 slice->rn_labelpaths = B_FALSE;
245 pthread_mutex_lock(rn->rn_lock);
246 if (avl_find(rn->rn_avl, slice, &where)) {
247 pthread_mutex_unlock(rn->rn_lock);
248 free(slice->rn_name);
251 avl_insert(rn->rn_avl, slice, where);
252 pthread_mutex_unlock(rn->rn_lock);
253 zpool_open_func(slice);
260 zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
261 "/dev/disk/by-vdev", /* Custom rules, use first if they exist */
262 "/dev/mapper", /* Use multipath devices before components */
263 "/dev/disk/by-partlabel", /* Single unique entry set by user */
264 "/dev/disk/by-partuuid", /* Generated partition uuid */
265 "/dev/disk/by-label", /* Custom persistent labels */
266 "/dev/disk/by-uuid", /* Single unique entry and persistent */
267 "/dev/disk/by-id", /* May be multiple entries and persistent */
268 "/dev/disk/by-path", /* Encodes physical location and persistent */
269 "/dev" /* UNSAFE device names will change */
273 zpool_default_search_paths(size_t *count)
275 *count = DEFAULT_IMPORT_PATH_SIZE;
276 return ((const char * const *)zpool_default_import_path);
280 * Given a full path to a device determine if that device appears in the
281 * import search path. If it does return the first match and store the
282 * index in the passed 'order' variable, otherwise return an error.
285 zfs_path_order(char *name, int *order)
287 int i = 0, error = ENOENT;
288 char *dir, *env, *envdup;
290 env = getenv("ZPOOL_IMPORT_PATH");
292 envdup = strdup(env);
293 dir = strtok(envdup, ":");
295 if (strncmp(name, dir, strlen(dir)) == 0) {
300 dir = strtok(NULL, ":");
305 for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
306 if (strncmp(name, zpool_default_import_path[i],
307 strlen(zpool_default_import_path[i])) == 0) {
/*
 * Use libblkid to quickly enumerate all known zfs devices.
 *
 * NOTE(review): interior lines (error-return paths, closing braces) are
 * missing from this view; the comments below describe only the visible
 * statements.
 */
322 zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
323 avl_tree_t **slice_cache)
327 blkid_dev_iterate iter;
/* load and refresh the blkid cache so new devices are included */
334 error = blkid_get_cache(&cache, NULL);
338 error = blkid_probe_all_new(cache);
340 blkid_put_cache(cache);
344 iter = blkid_dev_iterate_begin(cache);
346 blkid_put_cache(cache);
/* restrict iteration to devices blkid has typed as ZFS members */
350 error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
352 blkid_dev_iterate_end(iter);
353 blkid_put_cache(cache);
/* build the AVL cache of candidate device nodes (keyed for de-dup) */
357 *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t));
358 avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
359 offsetof(rdsk_node_t, rn_node));
361 while (blkid_dev_next(iter, &dev) == 0) {
362 slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
363 slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev));
364 slice->rn_vdev_guid = 0;
365 slice->rn_lock = lock;
366 slice->rn_avl = *slice_cache;
/* also probe the extra paths recorded in the label (see zpool_open_func) */
368 slice->rn_labelpaths = B_TRUE;
/* rank the name by its position in the import search path */
370 error = zfs_path_order(slice->rn_name, &slice->rn_order);
372 slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
374 slice->rn_order = IMPORT_ORDER_DEFAULT;
376 pthread_mutex_lock(lock);
377 if (avl_find(*slice_cache, slice, &where)) {
378 free(slice->rn_name);
381 avl_insert(*slice_cache, slice, where);
383 pthread_mutex_unlock(lock);
386 blkid_dev_iterate_end(iter);
387 blkid_put_cache(cache);
/*
 * Linux persistent device strings for vdev labels,
 * based on libudev for consistency with libudev disk add/remove events.
 */

typedef struct vdev_dev_strs {
	char vds_devid[128];	/* persistent device id ("what" the disk is) */
	char vds_devphys[128];	/* physical location path ("where" it is) */
} vdev_dev_strs_t;
/*
 * Obtain the persistent device id string (describes "what" the device is);
 * used by ZED vdev matching for auto-{online,expand,replace}.
 *
 * Preference order visible below: multipath DM_UUID, then a /dev/zvol
 * devlink, then the bus- (or nvme-) specific /dev/disk/by-id symlink.
 *
 * NOTE(review): interior lines (declarations, returns, braces) are
 * missing from this view; comments describe only the visible statements.
 */
411 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
413 struct udev_list_entry *entry;
415 char devbyid[MAXPATHLEN];
417 /* The bus based by-id path is preferred */
418 bus = udev_device_get_property_value(dev, "ID_BUS");
/*
 * For multipath nodes use the persistent uuid based identifier.
 *
 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
 */
428 dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
429 if (dm_uuid != NULL) {
430 (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
/*
 * For volumes use the persistent /dev/zvol/dataset identifier.
 */
437 entry = udev_device_get_devlinks_list_entry(dev);
438 while (entry != NULL) {
441 name = udev_list_entry_get_name(entry);
442 if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
443 (void) strlcpy(bufptr, name, buflen);
446 entry = udev_list_entry_get_next(entry);
/*
 * NVME 'by-id' symlinks are similar to the bus case.
 */
452 struct udev_device *parent;
454 parent = udev_device_get_parent_with_subsystem_devtype(dev,
457 bus = "nvme"; /* continue with bus symlink search */
/*
 * Locate the bus specific by-id link.
 */
465 (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
466 entry = udev_device_get_devlinks_list_entry(dev);
467 while (entry != NULL) {
470 name = udev_list_entry_get_name(entry);
471 if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
/* store only the leaf name, without the "/dev/disk/by-id/" prefix */
472 name += strlen(DEV_BYID_PATH);
473 (void) strlcpy(bufptr, name, buflen);
476 entry = udev_list_entry_get_next(entry);
/*
 * Obtain the persistent physical location string (describes "where" the
 * device is); used by ZED vdev matching for auto-{online,expand,replace}.
 *
 * Preference order visible below: ID_PATH, then ID_VDEV (vdev_id.conf),
 * then a /dev/zvol devlink, then a /dev/disk/by-uuid devlink.
 *
 * NOTE(review): interior lines (returns, braces) are missing from this
 * view; comments describe only the visible statements.
 */
488 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
490 const char *physpath = NULL;
491 struct udev_list_entry *entry;
/*
 * Normal disks use ID_PATH for their physical path.
 */
496 physpath = udev_device_get_property_value(dev, "ID_PATH");
497 if (physpath != NULL && strlen(physpath) > 0) {
498 (void) strlcpy(bufptr, physpath, buflen);
/*
 * Device mapper devices are virtual and don't have a physical
 * path. For them we use ID_VDEV instead, which is setup via the
 * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
 * to a virtual device. If you don't have vdev_id.conf setup,
 * you cannot use multipath autoreplace with device mapper.
 */
509 physpath = udev_device_get_property_value(dev, "ID_VDEV");
510 if (physpath != NULL && strlen(physpath) > 0) {
511 (void) strlcpy(bufptr, physpath, buflen);
/*
 * For ZFS volumes use the persistent /dev/zvol/dataset identifier.
 */
518 entry = udev_device_get_devlinks_list_entry(dev);
519 while (entry != NULL) {
520 physpath = udev_list_entry_get_name(entry);
521 if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
522 (void) strlcpy(bufptr, physpath, buflen);
525 entry = udev_list_entry_get_next(entry);
/*
 * For all other devices fallback to using the by-uuid name.
 */
531 entry = udev_device_get_devlinks_list_entry(dev);
532 while (entry != NULL) {
533 physpath = udev_list_entry_get_name(entry);
534 if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
535 (void) strlcpy(bufptr, physpath, buflen);
538 entry = udev_list_entry_get_next(entry);
/*
 * A disk is considered a multipath whole disk when:
 *	DEVNAME key value has "dm-"
 *	DM_NAME key value has "mpath" prefix
 *	ID_PART_TABLE_TYPE key does not exist or is not gpt
 *
 * NOTE(review): the visible condition tests DM_UUID for presence rather
 * than a DM_NAME "mpath" prefix (see the 'uuid' lookup below); the tail
 * of the condition and the returns are missing from this view — confirm
 * against the full source.
 */
552 udev_mpath_whole_disk(struct udev_device *dev)
554 const char *devname, *type, *uuid;
556 devname = udev_device_get_property_value(dev, "DEVNAME");
557 type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
558 uuid = udev_device_get_property_value(dev, "DM_UUID");
560 if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
561 ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
570 udev_device_is_ready(struct udev_device *dev)
572 #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
573 return (udev_device_get_is_initialized(dev));
575 /* wait for DEVLINKS property to be initialized */
576 return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
/*
 * NOTE(review): non-libudev (#else) stubs of the two getters above.
 * With HAVE_LIBUDEV unset these presumably just report "no data
 * available"; their bodies are missing from this view — confirm against
 * the full source.
 */
584 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
591 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
596 #endif /* HAVE_LIBUDEV */
/*
 * Wait up to timeout_ms for udev to set up the device node. The device is
 * considered ready when libudev determines it has been initialized, all of
 * the device links have been verified to exist, and it has been allowed to
 * settle. At this point the device can be accessed reliably.
 * Depending on the complexity of the udev rules this process could take
 * several seconds.
 *
 * NOTE(review): interior lines (returns, brace closures, settle-delay
 * bookkeeping) are missing from this view; comments describe only the
 * visible statements.
 */
607 zpool_label_disk_wait(const char *path, int timeout_ms)
611 struct udev_device *dev = NULL;
612 char nodepath[MAXPATHLEN];
613 char *sysname = NULL;
617 hrtime_t start, settle;
619 if ((udev = udev_new()) == NULL)
/* derive the kernel sysname from the resolved /dev node path */
626 if (sysname == NULL) {
627 if (realpath(path, nodepath) != NULL) {
628 sysname = strrchr(nodepath, '/') + 1;
630 (void) usleep(sleep_ms * MILLISEC);
635 dev = udev_device_new_from_subsystem_sysname(udev,
/* once udev reports the device initialized, verify each devlink exists */
637 if ((dev != NULL) && udev_device_is_ready(dev)) {
638 struct udev_list_entry *links, *link = NULL;
641 links = udev_device_get_devlinks_list_entry(dev);
643 udev_list_entry_foreach(link, links) {
644 struct stat64 statbuf;
647 name = udev_list_entry_get_name(link);
649 if (stat64(name, &statbuf) == 0 && errno == 0)
/* settle timestamp bookkeeping — see full source for the exact rule */
659 settle = gethrtime();
660 } else if (NSEC2MSEC(gethrtime() - settle) >=
662 udev_device_unref(dev);
668 udev_device_unref(dev);
669 (void) usleep(sleep_ms * MILLISEC);
671 } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
/*
 * NOTE(review): non-libudev (#else) fallback of zpool_label_disk_wait():
 * polls with stat64() until the node appears and a settle delay elapses.
 * Interior lines (function header, returns, braces) are missing from
 * this view; comments describe only the visible statements.
 */
679 hrtime_t start, settle;
680 struct stat64 statbuf;
687 if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
689 settle = gethrtime();
690 else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
/* any error other than "not yet present" aborts the wait */
692 } else if (errno != ENOENT) {
696 usleep(sleep_ms * MILLISEC);
697 } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
700 #endif /* HAVE_LIBUDEV */
/*
 * Encode the persistent device strings used for the vdev disk label
 * into *ds for the device node at 'path'.
 *
 * NOTE(review): interior lines (declarations, returns/gotos, braces) are
 * missing from this view; comments describe only the visible statements.
 */
708 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
713 struct udev_device *dev = NULL;
714 char nodepath[MAXPATHLEN];
719 if ((udev = udev_new()) == NULL)
722 /* resolve path to a runtime device node instance */
723 if (realpath(path, nodepath) == NULL)
726 sysname = strrchr(nodepath, '/') + 1;
/*
 * Wait up to 3 seconds for udev to set up the device node context.
 */
733 dev = udev_device_new_from_subsystem_sysname(udev, "block",
737 if (udev_device_is_ready(dev))
738 break; /* udev ready */
740 udev_device_unref(dev);
743 if (NSEC2MSEC(gethrtime() - start) < 10)
744 (void) sched_yield(); /* yield/busy wait up to 10ms */
746 (void) usleep(10 * MILLISEC);
748 } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
/*
 * Only whole disks require extra device strings.
 */
756 if (!wholedisk && !udev_mpath_whole_disk(dev))
/* devid ("what") is required; the physical path ("where") is optional */
759 ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
763 /* physical location string (optional) */
764 if (zfs_device_get_physical(dev, ds->vds_devphys,
765 sizeof (ds->vds_devphys)) != 0) {
766 ds->vds_devphys[0] = '\0'; /* empty string --> not available */
770 udev_device_unref(dev);
/*
 * Update a leaf vdev's persistent device strings:
 *
 * - only applies for a dedicated leaf vdev (aka whole disk)
 * - updated during pool create|add|attach|import
 * - used for device matching during auto-{online,expand,replace}
 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
 * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
 *
 * single device node example:
 *	devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
 *	phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
 *
 * multipath device node example:
 *	devid: 'dm-uuid-mpath-35000c5006304de3f'
 *
 * We also store the enclosure sysfs path for turning on enclosure LEDs
 * (if applicable):
 *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
 *
 * NOTE(review): interior lines (declarations, returns, braces) are
 * missing from this view; comments describe only the visible statements.
 */
801 update_vdev_config_dev_strs(nvlist_t *nv)
804 char *env, *type, *path;
805 uint64_t wholedisk = 0;
/*
 * For the benefit of legacy ZFS implementations, allow
 * for opting out of devid strings in the vdev label, e.g.:
 *
 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
 *
 * Older OpenZFS implementations had issues when attempting to
 * display pool config VDEV names if a "devid" NVP value is
 * present in the pool's config.
 *
 * For example, a pool that originated on illumos platform would
 * have a devid value in the config and "zpool status" would fail
 * when listing the config.
 *
 * A pool can be stripped of any "devid" values on import or
 * prevented from adding them on zpool create|add by setting
 * ZFS_VDEV_DEVID_OPT_OUT.
 */
828 env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
829 if (env && (strtoul(env, NULL, 0) > 0 ||
830 !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
831 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
832 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
/* only dedicated leaf "disk" vdevs carry these strings */
836 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
837 strcmp(type, VDEV_TYPE_DISK) != 0) {
840 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
842 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
/*
 * Update device string values in the config nvlist.
 */
847 if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
848 (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
/* empty vds_devphys means "not available" (see encode_device_strings) */
849 if (vds.vds_devphys[0] != '\0') {
850 (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
854 /* Add enclosure sysfs path (if disk is in an enclosure). */
855 upath = zfs_get_underlying_path(path);
856 spath = zfs_get_enclosure_sysfs_path(upath);
858 nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
861 nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
866 /* Clear out any stale entries. */
867 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
868 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
869 (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);