/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * Functions to convert between a list of vdevs and an nvlist representing the * configuration. Each entry in the list can be one of: * * Device vdevs * disk=(path=..., devid=...) * file=(path=...) * * Group vdevs * raidz[1|2]=(...) * mirror=(...) * * Hot spares * * While the underlying implementation supports it, group vdevs cannot contain * other group vdevs. All userland verification of devices is contained within * this file. If successful, the nvlist returned can be passed directly to the * kernel; we've done as much verification as possible in userland. * * Hot spares are a special case, and passed down as an array of disk vdevs, at * the same level as the root of the vdev tree. * * The only function exported by this file is 'get_vdev_spec'. The function * performs several passes: * * 1. Construct the vdev specification. Performs syntax validation and * makes sure each device is valid. * 2. Check for devices in use. Using libdiskmgt, makes sure that no * devices are also in use. Some can be overridden using the 'force' * flag, others cannot. * 3. Check for replication errors if the 'force' flag is not specified. * validates that the replication level is consistent across the * entire pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" /* * For any given vdev specification, we can have multiple errors. The * vdev_error() function keeps track of whether we have seen an error yet, and * prints out a header if its the first error we've seen. */ boolean_t error_seen; boolean_t is_force; /*PRINTFLIKE1*/ static void vdev_error(const char *fmt, ...) { va_list ap; if (!error_seen) { (void) fprintf(stderr, gettext("invalid vdev specification\n")); if (!is_force) (void) fprintf(stderr, gettext("use '-f' to override " "the following errors:\n")); else (void) fprintf(stderr, gettext("the following errors " "must be manually repaired:\n")); error_seen = B_TRUE; } va_start(ap, fmt); (void) vfprintf(stderr, fmt, ap); va_end(ap); } /* * Validate a GEOM provider. */ static int check_provider(const char *name, boolean_t force, boolean_t isspare) { struct gmesh mesh; struct gclass *mp; struct ggeom *gp; struct gprovider *pp; int rv; /* XXX: What to do with isspare? */ if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) name += sizeof(_PATH_DEV) - 1; rv = geom_gettree(&mesh); assert(rv == 0); pp = NULL; LIST_FOREACH(mp, &mesh.lg_class, lg_class) { LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { if (strcmp(pp->lg_name, name) == 0) goto out; } } } out: rv = -1; if (pp == NULL) vdev_error("no such provider %s\n", name); else { int acr, acw, ace; VERIFY(sscanf(pp->lg_mode, "r%dw%de%d", &acr, &acw, &ace) == 3); if (acw == 0 && ace == 0) rv = 0; else vdev_error("%s is in use (%s)\n", name, pp->lg_mode); } geom_deletetree(&mesh); return (rv); } static boolean_t is_provider(const char *name) { int fd; fd = g_open(name, 0); if (fd >= 0) { g_close(fd); return (B_TRUE); } return (B_FALSE); } /* * Create a leaf vdev. Determine if this is a GEOM provider. * Valid forms for a leaf vdev are: * * /dev/xxx Complete path to a GEOM provider * xxx Shorthand for /dev/xxx */ nvlist_t * make_leaf_vdev(const char *arg) { char ident[DISK_IDENT_SIZE], path[MAXPATHLEN]; struct stat64 statbuf; nvlist_t *vdev = NULL; char *type = NULL; boolean_t wholedisk = B_FALSE; if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) strlcpy(path, arg, sizeof (path)); else snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg); if (is_provider(path)) type = VDEV_TYPE_DISK; else { (void) fprintf(stderr, gettext("cannot use '%s': must be a " "GEOM provider\n"), path); return (NULL); } /* * Finally, we have the complete device or file, and we know that it is * acceptable to use. Construct the nvlist to describe this vdev. All * vdevs have a 'path' element, and devices also have a 'devid' element. */ verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)B_FALSE) == 0); /* * For a whole disk, defer getting its devid until after labeling it. */ if (1 || (S_ISBLK(statbuf.st_mode) && !wholedisk)) { /* * Get the devid for the device. */ int fd; ddi_devid_t devid; char *minor = NULL, *devid_str = NULL; if ((fd = open(path, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': " "%s\n"), path, strerror(errno)); nvlist_free(vdev); return (NULL); } if (devid_get(fd, &devid) == 0) { if (devid_get_minor_name(fd, &minor) == 0 && (devid_str = devid_str_encode(devid, minor)) != NULL) { verify(nvlist_add_string(vdev, ZPOOL_CONFIG_DEVID, devid_str) == 0); } if (devid_str != NULL) devid_str_free(devid_str); if (minor != NULL) devid_str_free(minor); devid_free(devid); } (void) close(fd); } return (vdev); } /* * Go through and verify the replication level of the pool is consistent. * Performs the following checks: * * For the new spec, verifies that devices in mirrors and raidz are the * same size. * * If the current configuration already has inconsistent replication * levels, ignore any other potential problems in the new spec. * * Otherwise, make sure that the current spec (if there is one) and the new * spec have consistent replication levels. */ typedef struct replication_level { char *zprl_type; uint64_t zprl_children; uint64_t zprl_parity; } replication_level_t; /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. */ replication_level_t * get_replication(nvlist_t *nvroot, boolean_t fatal) { nvlist_t **top; uint_t t, toplevels; nvlist_t **child; uint_t c, children; nvlist_t *nv; char *type; replication_level_t lastrep, rep, *ret; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); lastrep.zprl_type = NULL; for (t = 0; t < toplevels; t++) { nv = top[t]; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { /* * This is a 'file' or 'disk' vdev. */ rep.zprl_type = type; rep.zprl_children = 1; rep.zprl_parity = 0; } else { uint64_t vdev_size; /* * This is a mirror or RAID-Z vdev. Go through and make * sure the contents are all the same (files vs. disks), * keeping track of the number of elements in the * process. * * We also check that the size of each vdev (if it can * be determined) is the same. */ rep.zprl_type = type; rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); assert(rep.zprl_parity != 0); } else { rep.zprl_parity = 0; } /* * The 'dontreport' variable indicatest that we've * already reported an error for this spec, so don't * bother doing it again. */ type = NULL; dontreport = 0; vdev_size = -1ULL; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; char *path; struct stat64 statbuf; uint64_t size = -1ULL; char *childtype; int fd, err; rep.zprl_children++; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); /* * If this is a a replacing or spare vdev, then * get the real first child of the vdev. */ if (strcmp(childtype, VDEV_TYPE_REPLACING) == 0 || strcmp(childtype, VDEV_TYPE_SPARE) == 0) { nvlist_t **rchild; uint_t rchildren; verify(nvlist_lookup_nvlist_array(cnv, ZPOOL_CONFIG_CHILDREN, &rchild, &rchildren) == 0); assert(rchildren == 2); cnv = rchild[0]; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); } verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &path) == 0); /* * If we have a raidz/mirror that combines disks * with files, report it as an error. */ if (!dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s contains both " "files and devices\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } /* * According to stat(2), the value of 'st_size' * is undefined for block devices and character * devices. But there is no effective way to * determine the real size in userland. * * Instead, we'll take advantage of an * implementation detail of spec_size(). If the * device is currently open, then we (should) * return a valid size. * * If we still don't get a valid size (indicated * by a size of 0 or MAXOFFSET_T), then ignore * this device altogether. */ if ((fd = open(path, O_RDONLY)) >= 0) { err = fstat64(fd, &statbuf); (void) close(fd); } else { err = stat64(path, &statbuf); } if (err != 0 || statbuf.st_size == 0) continue; size = statbuf.st_size; /* * Also check the size of each device. If they * differ, then report an error. */ if (!dontreport && vdev_size != -1ULL && size != vdev_size) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "%s contains devices of " "different sizes\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } type = childtype; vdev_size = size; } } /* * At this point, we have the replication of the last toplevel * vdev in 'rep'. Compare it to 'lastrep' to see if its * different. */ if (lastrep.zprl_type != NULL) { if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %s and %s vdevs are " "present\n"), lastrep.zprl_type, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_parity != rep.zprl_parity) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu and %llu device parity " "%s vdevs are present\n"), lastrep.zprl_parity, rep.zprl_parity, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_children != rep.zprl_children) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu-way and %llu-way %s " "vdevs are present\n"), lastrep.zprl_children, rep.zprl_children, rep.zprl_type); else return (NULL); } } lastrep = rep; } if (ret != NULL) *ret = rep; return (ret); } /* * Check the replication level of the vdev spec against the current pool. Calls * get_replication() to make sure the new spec is self-consistent. If the pool * has a consistent replication level, then we ignore any errors. Otherwise, * report any difference between the two. */ int check_replication(nvlist_t *config, nvlist_t *newroot) { replication_level_t *current = NULL, *new; int ret; /* * If we have a current pool configuration, check to see if it's * self-consistent. If not, simply return success. */ if (config != NULL) { nvlist_t *nvroot; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if ((current = get_replication(nvroot, B_FALSE)) == NULL) return (0); } /* * Get the replication level of the new vdev spec, reporting any * inconsistencies found. */ if ((new = get_replication(newroot, B_TRUE)) == NULL) { free(current); return (-1); } /* * Check to see if the new vdev spec matches the replication level of * the current pool. */ ret = 0; if (current != NULL) { if (strcmp(current->zprl_type, new->zprl_type) != 0) { vdev_error(gettext( "mismatched replication level: pool uses %s " "and new vdev is %s\n"), current->zprl_type, new->zprl_type); ret = -1; } else if (current->zprl_parity != new->zprl_parity) { vdev_error(gettext( "mismatched replication level: pool uses %llu " "device parity and new vdev uses %llu\n"), current->zprl_parity, new->zprl_parity); ret = -1; } else if (current->zprl_children != new->zprl_children) { vdev_error(gettext( "mismatched replication level: pool uses %llu-way " "%s and new vdev uses %llu-way %s\n"), current->zprl_children, current->zprl_type, new->zprl_children, new->zprl_type); ret = -1; } } free(new); if (current != NULL) free(current); return (ret); } /* * Determine if the given path is a hot spare within the given configuration. */ static boolean_t is_spare(nvlist_t *config, const char *path) { int fd; pool_state_t state; char *name = NULL; nvlist_t *label; uint64_t guid, spareguid; nvlist_t *nvroot; nvlist_t **spares; uint_t i, nspares; boolean_t inuse; if ((fd = open(path, O_RDONLY)) < 0) return (B_FALSE); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || !inuse || state != POOL_STATE_SPARE || zpool_read_label(fd, &label) != 0) { free(name); (void) close(fd); return (B_FALSE); } free(name); (void) close(fd); verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); nvlist_free(label); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { for (i = 0; i < nspares; i++) { verify(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &spareguid) == 0); if (spareguid == guid) return (B_TRUE); } } return (B_FALSE); } /* * Go through and find any devices that are in use. We rely on libdiskmgt for * the majority of this task. */ int check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, int isspare) { nvlist_t **child; uint_t c, children; char *type, *path; int ret; char buf[MAXPATHLEN]; uint64_t wholedisk; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); /* * As a generic check, we look to see if this is a replace of a * hot spare within the same pool. If so, we allow it * regardless of what libdiskmgt or zpool_in_use() says. */ if (isreplacing) { (void) strlcpy(buf, path, sizeof (buf)); if (is_spare(config, buf)) return (0); } if (strcmp(type, VDEV_TYPE_DISK) == 0) ret = check_provider(path, force, isspare); return (ret); } for (c = 0; c < children; c++) if ((ret = check_in_use(config, child[c], force, isreplacing, B_FALSE)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = check_in_use(config, child[c], force, isreplacing, B_TRUE)) != 0) return (ret); return (0); } const char * is_grouping(const char *type, int *mindev) { if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { if (mindev != NULL) *mindev = 2; return (VDEV_TYPE_RAIDZ); } if (strcmp(type, "raidz2") == 0) { if (mindev != NULL) *mindev = 3; return (VDEV_TYPE_RAIDZ); } if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; return (VDEV_TYPE_MIRROR); } if (strcmp(type, "spare") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_SPARE); } return (NULL); } /* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths * because the program is just going to exit anyway. */ nvlist_t * construct_spec(int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares; int t, toplevels, mindev, nspares; const char *type; top = NULL; toplevels = 0; spares = NULL; nspares = 0; while (argc > 0) { nv = NULL; /* * If it's a mirror or raidz, the subsequent arguments are * its leaves -- until we encounter the next mirror or raidz. */ if ((type = is_grouping(argv[0], &mindev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; if (strcmp(type, VDEV_TYPE_SPARE) == 0 && spares != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'spare' can be " "specified only once\n")); return (NULL); } for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL) != NULL) break; children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(argv[c])) == NULL) return (NULL); child[children - 1] = nv; } if (children < mindev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s requires at least %d " "devices\n"), argv[0], mindev); return (NULL); } argc -= c; argv += c; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { spares = child; nspares = children; continue; } else { verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, type) == 0); if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, children) == 0); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); } } else { /* * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. */ if ((nv = make_leaf_vdev(argv[0])) == NULL) return (NULL); argc--; argv++; } toplevels++; top = realloc(top, toplevels * sizeof (nvlist_t *)); if (top == NULL) zpool_no_memory(); top[toplevels - 1] = nv; } if (toplevels == 0 && nspares == 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); return (NULL); } /* * Finally, create nvroot and add all top-level vdevs to it. */ verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, top, toplevels) == 0); if (nspares != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); for (t = 0; t < toplevels; t++) nvlist_free(top[t]); for (t = 0; t < nspares; t++) nvlist_free(spares[t]); if (spares) free(spares); free(top); return (nvroot); } /* * Get and validate the contents of the given vdev specification. This ensures * that the nvlist returned is well-formed, that all the devices exist, and that * they are not currently in use by any other known consumer. The 'poolconfig' * parameter is the current configuration of the pool when adding devices * existing pool, and is used to perform additional checks, such as changing the * replication level of the pool. It can be 'NULL' to indicate that this is a * new pool. The 'force' flag controls whether devices should be forcefully * added, even if they appear in use. */ nvlist_t * make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, boolean_t isreplacing, int argc, char **argv) { nvlist_t *newroot; is_force = force; /* * Construct the vdev specification. If this is successful, we know * that we have a valid specification, and that all devices can be * opened. */ if ((newroot = construct_spec(argc, argv)) == NULL) return (NULL); /* * Validate each device to make sure that its not shared with another * subsystem. We do this even if 'force' is set, because there are some * uses (such as a dedicated dump device) that even '-f' cannot * override. */ if (check_in_use(poolconfig, newroot, force, isreplacing, B_FALSE) != 0) { nvlist_free(newroot); return (NULL); } /* * Check the replication level of the given vdevs and report any errors * found. We include the existing pool spec, if any, as we need to * catch changes against the existing replication level. */ if (check_rep && check_replication(poolconfig, newroot) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); }