CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - cmd/zpool/zpool_vdev.c
Default ashift for Amazon EC2 NVMe devices
[FreeBSD/FreeBSD.git] / cmd / zpool / zpool_vdev.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
25  * Copyright (c) 2016 Intel Corporation.
26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27  */
28
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  *      Device vdevs
34  *              disk=(path=..., devid=...)
35  *              file=(path=...)
36  *
37  *      Group vdevs
38  *              raidz[1|2]=(...)
39  *              mirror=(...)
40  *
41  *      Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'make_root_vdev'.  The
52  * function performs several passes:
53  *
54  *      1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
56  *      2. Check for devices in use.  Uses libblkid to make sure that none of
57  *         the devices are already in use.  Some conflicts can be overridden
58  *         using the 'force' flag, others cannot.
59  *      3. Check for replication errors if the 'force' flag is not specified.
60  *         Validates that the replication level is consistent across the
61  *         entire pool.
62  *      4. Call libzfs to label any whole disks with an EFI label.
63  */
64
65 #include <assert.h>
66 #include <ctype.h>
67 #include <devid.h>
68 #include <errno.h>
69 #include <fcntl.h>
70 #include <libintl.h>
71 #include <libnvpair.h>
72 #include <limits.h>
73 #include <sys/spa.h>
74 #include <scsi/scsi.h>
75 #include <scsi/sg.h>
76 #include <stdio.h>
77 #include <string.h>
78 #include <unistd.h>
79 #include <sys/efi_partition.h>
80 #include <sys/stat.h>
81 #include <sys/vtoc.h>
82 #include <sys/mntent.h>
83 #include <uuid/uuid.h>
84 #include <blkid/blkid.h>
85 #include "zpool_util.h"
86 #include <sys/zfs_context.h>
87
88 /*
89  * For any given vdev specification, we can have multiple errors.  The
90  * vdev_error() function keeps track of whether we have seen an error yet, and
91  * prints out a header if it's the first error we've seen.
92  */
93 boolean_t error_seen;	/* set by vdev_error() once the header has been printed */
94 boolean_t is_force;	/* read by vdev_error() to choose which header hint to print */
95
/*
 * One row of vdev_disk_database: an identification string paired with the
 * sector size to use for devices that report that identification.
 */
96 typedef struct vdev_disk_db_entry
97 {
98         char id[24];		/* compared against 24 bytes at offset 8 of the INQUIRY reply */
99         int sector_size;	/* bytes; make_leaf_vdev() converts this into an ashift */
100 } vdev_disk_db_entry_t;
101
102 /*
103  * Database of block devices that lie about physical sector sizes.  The
104  * identification string must be precisely 24 characters to avoid false
105  * negatives
 *
 * Each id is meant to fill the 24-byte 'id' array exactly, which leaves no
 * room for a terminating NUL.  check_sector_size_database() compares all 24
 * bytes with memcmp(), so the space padding inside the strings is
 * significant and must not be trimmed or re-aligned.
106  */
107 static vdev_disk_db_entry_t vdev_disk_database[] = {
108         {"ATA     ADATA SSD S396 3", 8192},
109         {"ATA     APPLE SSD SM128E", 8192},
110         {"ATA     APPLE SSD SM256E", 8192},
111         {"ATA     APPLE SSD SM512E", 8192},
112         {"ATA     APPLE SSD SM768E", 8192},
113         {"ATA     C400-MTFDDAC064M", 8192},
114         {"ATA     C400-MTFDDAC128M", 8192},
115         {"ATA     C400-MTFDDAC256M", 8192},
116         {"ATA     C400-MTFDDAC512M", 8192},
117         {"ATA     Corsair Force 3 ", 8192},
118         {"ATA     Corsair Force GS", 8192},
119         {"ATA     INTEL SSDSA2CT04", 8192},
120         {"ATA     INTEL SSDSA2BZ10", 8192},
121         {"ATA     INTEL SSDSA2BZ20", 8192},
122         {"ATA     INTEL SSDSA2BZ30", 8192},
123         {"ATA     INTEL SSDSA2CW04", 8192},
124         {"ATA     INTEL SSDSA2CW08", 8192},
125         {"ATA     INTEL SSDSA2CW12", 8192},
126         {"ATA     INTEL SSDSA2CW16", 8192},
127         {"ATA     INTEL SSDSA2CW30", 8192},
128         {"ATA     INTEL SSDSA2CW60", 8192},
129         {"ATA     INTEL SSDSC2CT06", 8192},
130         {"ATA     INTEL SSDSC2CT12", 8192},
131         {"ATA     INTEL SSDSC2CT18", 8192},
132         {"ATA     INTEL SSDSC2CT24", 8192},
133         {"ATA     INTEL SSDSC2CW06", 8192},
134         {"ATA     INTEL SSDSC2CW12", 8192},
135         {"ATA     INTEL SSDSC2CW18", 8192},
136         {"ATA     INTEL SSDSC2CW24", 8192},
137         {"ATA     INTEL SSDSC2CW48", 8192},
138         {"ATA     KINGSTON SH100S3", 8192},
139         {"ATA     KINGSTON SH103S3", 8192},
140         {"ATA     M4-CT064M4SSD2  ", 8192},
141         {"ATA     M4-CT128M4SSD2  ", 8192},
142         {"ATA     M4-CT256M4SSD2  ", 8192},
143         {"ATA     M4-CT512M4SSD2  ", 8192},
144         {"ATA     OCZ-AGILITY2    ", 8192},
145         {"ATA     OCZ-AGILITY3    ", 8192},
146         {"ATA     OCZ-VERTEX2 3.5 ", 8192},
147         {"ATA     OCZ-VERTEX3     ", 8192},
148         {"ATA     OCZ-VERTEX3 LT  ", 8192},
149         {"ATA     OCZ-VERTEX3 MI  ", 8192},
150         {"ATA     OCZ-VERTEX4     ", 8192},
151         {"ATA     SAMSUNG MZ7WD120", 8192},
152         {"ATA     SAMSUNG MZ7WD240", 8192},
153         {"ATA     SAMSUNG MZ7WD480", 8192},
154         {"ATA     SAMSUNG MZ7WD960", 8192},
155         {"ATA     SAMSUNG SSD 830 ", 8192},
156         {"ATA     Samsung SSD 840 ", 8192},
157         {"ATA     SanDisk SSD U100", 8192},
158         {"ATA     TOSHIBA THNSNH06", 8192},
159         {"ATA     TOSHIBA THNSNH12", 8192},
160         {"ATA     TOSHIBA THNSNH25", 8192},
161         {"ATA     TOSHIBA THNSNH51", 8192},
162         {"ATA     APPLE SSD TS064C", 4096},
163         {"ATA     APPLE SSD TS128C", 4096},
164         {"ATA     APPLE SSD TS256C", 4096},
165         {"ATA     APPLE SSD TS512C", 4096},
166         {"ATA     INTEL SSDSA2M040", 4096},
167         {"ATA     INTEL SSDSA2M080", 4096},
168         {"ATA     INTEL SSDSA2M160", 4096},
169         {"ATA     INTEL SSDSC2MH12", 4096},
170         {"ATA     INTEL SSDSC2MH25", 4096},
171         {"ATA     OCZ CORE_SSD    ", 4096},
172         {"ATA     OCZ-VERTEX      ", 4096},
173         {"ATA     SAMSUNG MCCOE32G", 4096},
174         {"ATA     SAMSUNG MCCOE64G", 4096},
175         {"ATA     SAMSUNG SSD PM80", 4096},
176         /* Flash drives optimized for 4KB IOs on larger pages */
177         {"ATA     INTEL SSDSC2BA10", 4096},
178         {"ATA     INTEL SSDSC2BA20", 4096},
179         {"ATA     INTEL SSDSC2BA40", 4096},
180         {"ATA     INTEL SSDSC2BA80", 4096},
181         {"ATA     INTEL SSDSC2BB08", 4096},
182         {"ATA     INTEL SSDSC2BB12", 4096},
183         {"ATA     INTEL SSDSC2BB16", 4096},
184         {"ATA     INTEL SSDSC2BB24", 4096},
185         {"ATA     INTEL SSDSC2BB30", 4096},
186         {"ATA     INTEL SSDSC2BB40", 4096},
187         {"ATA     INTEL SSDSC2BB48", 4096},
188         {"ATA     INTEL SSDSC2BB60", 4096},
189         {"ATA     INTEL SSDSC2BB80", 4096},
190         {"ATA     INTEL SSDSC2BW24", 4096},
191         {"ATA     INTEL SSDSC2BW48", 4096},
192         {"ATA     INTEL SSDSC2BP24", 4096},
193         {"ATA     INTEL SSDSC2BP48", 4096},
194         {"NA      SmrtStorSDLKAE9W", 4096},
195         {"NVMe    Amazon EC2 NVMe ", 4096},
196         /* Imported from Open Solaris */
197         {"ATA     MARVELL SD88SA02", 4096},
198         /* Advanced format Hard drives */
199         {"ATA     Hitachi HDS5C303", 4096},
200         {"ATA     SAMSUNG HD204UI ", 4096},
201         {"ATA     ST2000DL004 HD20", 4096},
202         {"ATA     WDC WD10EARS-00M", 4096},
203         {"ATA     WDC WD10EARS-00S", 4096},
204         {"ATA     WDC WD10EARS-00Z", 4096},
205         {"ATA     WDC WD15EARS-00M", 4096},
206         {"ATA     WDC WD15EARS-00S", 4096},
207         {"ATA     WDC WD15EARS-00Z", 4096},
208         {"ATA     WDC WD20EARS-00M", 4096},
209         {"ATA     WDC WD20EARS-00S", 4096},
210         {"ATA     WDC WD20EARS-00Z", 4096},
211         {"ATA     WDC WD1600BEVT-0", 4096},
212         {"ATA     WDC WD2500BEVT-0", 4096},
213         {"ATA     WDC WD3200BEVT-0", 4096},
214         {"ATA     WDC WD5000BEVT-0", 4096},
215         /* Virtual disks: Assume zvols with default volblocksize */
216 #if 0	/* the virtual-disk entries below are compiled out */
217         {"ATA     QEMU HARDDISK   ", 8192},
218         {"IET     VIRTUAL-DISK    ", 8192},
219         {"OI      COMSTAR         ", 8192},
220         {"SUN     COMSTAR         ", 8192},
221         {"NETAPP  LUN             ", 8192},
222 #endif
223 };
224
/* Number of entries in vdev_disk_database, computed at compile time. */
225 static const int vdev_disk_database_size =
226         sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
227
228 #define INQ_REPLY_LEN   96
229 #define INQ_CMD_LEN     6
230
231 static boolean_t
232 check_sector_size_database(char *path, int *sector_size)
233 {
234         unsigned char inq_buff[INQ_REPLY_LEN];
235         unsigned char sense_buffer[32];
236         unsigned char inq_cmd_blk[INQ_CMD_LEN] =
237             {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
238         sg_io_hdr_t io_hdr;
239         int error;
240         int fd;
241         int i;
242
243         /* Prepare INQUIRY command */
244         memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
245         io_hdr.interface_id = 'S';
246         io_hdr.cmd_len = sizeof (inq_cmd_blk);
247         io_hdr.mx_sb_len = sizeof (sense_buffer);
248         io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
249         io_hdr.dxfer_len = INQ_REPLY_LEN;
250         io_hdr.dxferp = inq_buff;
251         io_hdr.cmdp = inq_cmd_blk;
252         io_hdr.sbp = sense_buffer;
253         io_hdr.timeout = 10;            /* 10 milliseconds is ample time */
254
255         if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
256                 return (B_FALSE);
257
258         error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
259
260         (void) close(fd);
261
262         if (error < 0)
263                 return (B_FALSE);
264
265         if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
266                 return (B_FALSE);
267
268         for (i = 0; i < vdev_disk_database_size; i++) {
269                 if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
270                         continue;
271
272                 *sector_size = vdev_disk_database[i].sector_size;
273                 return (B_TRUE);
274         }
275
276         return (B_FALSE);
277 }
278
279 /*PRINTFLIKE1*/
280 static void
281 vdev_error(const char *fmt, ...)
282 {
283         va_list ap;
284
285         if (!error_seen) {
286                 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
287                 if (!is_force)
288                         (void) fprintf(stderr, gettext("use '-f' to override "
289                             "the following errors:\n"));
290                 else
291                         (void) fprintf(stderr, gettext("the following errors "
292                             "must be manually repaired:\n"));
293                 error_seen = B_TRUE;
294         }
295
296         va_start(ap, fmt);
297         (void) vfprintf(stderr, fmt, ap);
298         va_end(ap);
299 }
300
301 /*
302  * Check that a file is valid.  All we can do in this case is check that it's
303  * not in use by another pool, and not in use by swap.
304  */
305 static int
306 check_file(const char *file, boolean_t force, boolean_t isspare)
307 {
308         char  *name;
309         int fd;
310         int ret = 0;
311         pool_state_t state;
312         boolean_t inuse;
313
314         if ((fd = open(file, O_RDONLY)) < 0)
315                 return (0);
316
317         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
318                 const char *desc;
319
320                 switch (state) {
321                 case POOL_STATE_ACTIVE:
322                         desc = gettext("active");
323                         break;
324
325                 case POOL_STATE_EXPORTED:
326                         desc = gettext("exported");
327                         break;
328
329                 case POOL_STATE_POTENTIALLY_ACTIVE:
330                         desc = gettext("potentially active");
331                         break;
332
333                 default:
334                         desc = gettext("unknown");
335                         break;
336                 }
337
338                 /*
339                  * Allow hot spares to be shared between pools.
340                  */
341                 if (state == POOL_STATE_SPARE && isspare) {
342                         free(name);
343                         (void) close(fd);
344                         return (0);
345                 }
346
347                 if (state == POOL_STATE_ACTIVE ||
348                     state == POOL_STATE_SPARE || !force) {
349                         switch (state) {
350                         case POOL_STATE_SPARE:
351                                 vdev_error(gettext("%s is reserved as a hot "
352                                     "spare for pool %s\n"), file, name);
353                                 break;
354                         default:
355                                 vdev_error(gettext("%s is part of %s pool "
356                                     "'%s'\n"), file, desc, name);
357                                 break;
358                         }
359                         ret = -1;
360                 }
361
362                 free(name);
363         }
364
365         (void) close(fd);
366         return (ret);
367 }
368
369 static int
370 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
371 {
372         int err;
373         char *value;
374
375         /* No valid type detected device is safe to use */
376         value = blkid_get_tag_value(cache, "TYPE", path);
377         if (value == NULL)
378                 return (0);
379
380         /*
381          * If libblkid detects a ZFS device, we check the device
382          * using check_file() to see if it's safe.  The one safe
383          * case is a spare device shared between multiple pools.
384          */
385         if (strcmp(value, "zfs_member") == 0) {
386                 err = check_file(path, force, isspare);
387         } else {
388                 if (force) {
389                         err = 0;
390                 } else {
391                         err = -1;
392                         vdev_error(gettext("%s contains a filesystem of "
393                             "type '%s'\n"), path, value);
394                 }
395         }
396
397         free(value);
398
399         return (err);
400 }
401
402 /*
403  * Validate that a disk including all partitions are safe to use.
404  *
405  * For EFI labeled disks this can done relatively easily with the libefi
406  * library.  The partition numbers are extracted from the label and used
407  * to generate the expected /dev/ paths.  Each partition can then be
408  * checked for conflicts.
409  *
410  * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
411  * but due to the lack of a readily available libraries this scanning is
412  * not implemented.  Instead only the device path as given is checked.
413  */
414 static int
415 check_disk(const char *path, blkid_cache cache, int force,
416     boolean_t isspare, boolean_t iswholedisk)
417 {
418         struct dk_gpt *vtoc;
419         char slice_path[MAXPATHLEN];
420         int err = 0;
421         int fd, i;
422
423         if (!iswholedisk)
424                 return (check_slice(path, cache, force, isspare));
425
426         if ((fd = open(path, O_RDONLY|O_DIRECT|O_EXCL)) < 0) {
427                 char *value = blkid_get_tag_value(cache, "TYPE", path);
428                 (void) fprintf(stderr, gettext("%s is in use and contains "
429                     "a %s filesystem.\n"), path, value ? value : "unknown");
430                 return (-1);
431         }
432
433         /*
434          * Expected to fail for non-EFI labled disks.  Just check the device
435          * as given and do not attempt to detect and scan partitions.
436          */
437         err = efi_alloc_and_read(fd, &vtoc);
438         if (err) {
439                 (void) close(fd);
440                 return (check_slice(path, cache, force, isspare));
441         }
442
443         /*
444          * The primary efi partition label is damaged however the secondary
445          * label at the end of the device is intact.  Rather than use this
446          * label we should play it safe and treat this as a non efi device.
447          */
448         if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
449                 efi_free(vtoc);
450                 (void) close(fd);
451
452                 if (force) {
453                         /* Partitions will now be created using the backup */
454                         return (0);
455                 } else {
456                         vdev_error(gettext("%s contains a corrupt primary "
457                             "EFI label.\n"), path);
458                         return (-1);
459                 }
460         }
461
462         for (i = 0; i < vtoc->efi_nparts; i++) {
463
464                 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
465                     uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
466                         continue;
467
468                 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
469                         (void) snprintf(slice_path, sizeof (slice_path),
470                             "%s%s%d", path, "-part", i+1);
471                 else
472                         (void) snprintf(slice_path, sizeof (slice_path),
473                             "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
474                             "p" : "", i+1);
475
476                 err = check_slice(slice_path, cache, force, isspare);
477                 if (err)
478                         break;
479         }
480
481         efi_free(vtoc);
482         (void) close(fd);
483
484         return (err);
485 }
486
487 static int
488 check_device(const char *path, boolean_t force,
489     boolean_t isspare, boolean_t iswholedisk)
490 {
491         blkid_cache cache;
492         int error;
493
494         error = blkid_get_cache(&cache, NULL);
495         if (error != 0) {
496                 (void) fprintf(stderr, gettext("unable to access the blkid "
497                     "cache.\n"));
498                 return (-1);
499         }
500
501         error = check_disk(path, cache, force, isspare, iswholedisk);
502         blkid_put_cache(cache);
503
504         return (error);
505 }
506
507 /*
508  * This may be a shorthand device path or it could be total gibberish.
509  * Check to see if it is a known device available in zfs_vdev_paths.
510  * As part of this check, see if we've been given an entire disk
511  * (minus the slice number).
512  */
513 static int
514 is_shorthand_path(const char *arg, char *path, size_t path_size,
515     struct stat64 *statbuf, boolean_t *wholedisk)
516 {
517         int error;
518
519         error = zfs_resolve_shortname(arg, path, path_size);
520         if (error == 0) {
521                 *wholedisk = zfs_dev_is_whole_disk(path);
522                 if (*wholedisk || (stat64(path, statbuf) == 0))
523                         return (0);
524         }
525
526         strlcpy(path, arg, path_size);
527         memset(statbuf, 0, sizeof (*statbuf));
528         *wholedisk = B_FALSE;
529
530         return (error);
531 }
532
533 /*
534  * Determine if the given path is a hot spare within the given configuration.
535  * If no configuration is given we rely solely on the label.
536  */
537 static boolean_t
538 is_spare(nvlist_t *config, const char *path)
539 {
540         int fd;
541         pool_state_t state;
542         char *name = NULL;
543         nvlist_t *label;
544         uint64_t guid, spareguid;
545         nvlist_t *nvroot;
546         nvlist_t **spares;
547         uint_t i, nspares;
548         boolean_t inuse;
549
550         if ((fd = open(path, O_RDONLY)) < 0)
551                 return (B_FALSE);
552
553         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
554             !inuse ||
555             state != POOL_STATE_SPARE ||
556             zpool_read_label(fd, &label, NULL) != 0) {
557                 free(name);
558                 (void) close(fd);
559                 return (B_FALSE);
560         }
561         free(name);
562         (void) close(fd);
563
564         if (config == NULL) {
565                 nvlist_free(label);
566                 return (B_TRUE);
567         }
568
569         verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
570         nvlist_free(label);
571
572         verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
573             &nvroot) == 0);
574         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
575             &spares, &nspares) == 0) {
576                 for (i = 0; i < nspares; i++) {
577                         verify(nvlist_lookup_uint64(spares[i],
578                             ZPOOL_CONFIG_GUID, &spareguid) == 0);
579                         if (spareguid == guid)
580                                 return (B_TRUE);
581                 }
582         }
583
584         return (B_FALSE);
585 }
586
587 /*
588  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
589  * device, fill in the device id to make a complete nvlist.  Valid forms for a
590  * leaf vdev are:
591  *
592  *      /dev/xxx        Complete disk path
593  *      /xxx            Full path to file
594  *      xxx             Shorthand for <zfs_vdev_paths>/xxx
595  */
596 static nvlist_t *
597 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
598 {
599         char path[MAXPATHLEN];
600         struct stat64 statbuf;
601         nvlist_t *vdev = NULL;
602         char *type = NULL;
603         boolean_t wholedisk = B_FALSE;
604         uint64_t ashift = 0;
605         int err;
606
607         /*
608          * Determine what type of vdev this is, and put the full path into
609          * 'path'.  We detect whether this is a device of file afterwards by
610          * checking the st_mode of the file.
611          */
612         if (arg[0] == '/') {
613                 /*
614                  * Complete device or file path.  Exact type is determined by
615                  * examining the file descriptor afterwards.  Symbolic links
616                  * are resolved to their real paths to determine whole disk
617                  * and S_ISBLK/S_ISREG type checks.  However, we are careful
618                  * to store the given path as ZPOOL_CONFIG_PATH to ensure we
619                  * can leverage udev's persistent device labels.
620                  */
621                 if (realpath(arg, path) == NULL) {
622                         (void) fprintf(stderr,
623                             gettext("cannot resolve path '%s'\n"), arg);
624                         return (NULL);
625                 }
626
627                 wholedisk = zfs_dev_is_whole_disk(path);
628                 if (!wholedisk && (stat64(path, &statbuf) != 0)) {
629                         (void) fprintf(stderr,
630                             gettext("cannot open '%s': %s\n"),
631                             path, strerror(errno));
632                         return (NULL);
633                 }
634
635                 /* After whole disk check restore original passed path */
636                 strlcpy(path, arg, sizeof (path));
637         } else {
638                 err = is_shorthand_path(arg, path, sizeof (path),
639                     &statbuf, &wholedisk);
640                 if (err != 0) {
641                         /*
642                          * If we got ENOENT, then the user gave us
643                          * gibberish, so try to direct them with a
644                          * reasonable error message.  Otherwise,
645                          * regurgitate strerror() since it's the best we
646                          * can do.
647                          */
648                         if (err == ENOENT) {
649                                 (void) fprintf(stderr,
650                                     gettext("cannot open '%s': no such "
651                                     "device in %s\n"), arg, DISK_ROOT);
652                                 (void) fprintf(stderr,
653                                     gettext("must be a full path or "
654                                     "shorthand device name\n"));
655                                 return (NULL);
656                         } else {
657                                 (void) fprintf(stderr,
658                                     gettext("cannot open '%s': %s\n"),
659                                     path, strerror(errno));
660                                 return (NULL);
661                         }
662                 }
663         }
664
665         /*
666          * Determine whether this is a device or a file.
667          */
668         if (wholedisk || S_ISBLK(statbuf.st_mode)) {
669                 type = VDEV_TYPE_DISK;
670         } else if (S_ISREG(statbuf.st_mode)) {
671                 type = VDEV_TYPE_FILE;
672         } else {
673                 (void) fprintf(stderr, gettext("cannot use '%s': must be a "
674                     "block device or regular file\n"), path);
675                 return (NULL);
676         }
677
678         /*
679          * Finally, we have the complete device or file, and we know that it is
680          * acceptable to use.  Construct the nvlist to describe this vdev.  All
681          * vdevs have a 'path' element, and devices also have a 'devid' element.
682          */
683         verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
684         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
685         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
686         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
687         if (strcmp(type, VDEV_TYPE_DISK) == 0)
688                 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
689                     (uint64_t)wholedisk) == 0);
690
691         /*
692          * Override defaults if custom properties are provided.
693          */
694         if (props != NULL) {
695                 char *value = NULL;
696
697                 if (nvlist_lookup_string(props,
698                     zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
699                         if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
700                                 (void) fprintf(stderr,
701                                     gettext("ashift must be a number.\n"));
702                                 return (NULL);
703                         }
704                         if (ashift != 0 &&
705                             (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
706                                 (void) fprintf(stderr,
707                                     gettext("invalid 'ashift=%" PRIu64 "' "
708                                     "property: only values between %" PRId32 " "
709                                     "and %" PRId32 " are allowed.\n"),
710                                     ashift, ASHIFT_MIN, ASHIFT_MAX);
711                                 return (NULL);
712                         }
713                 }
714         }
715
716         /*
717          * If the device is known to incorrectly report its physical sector
718          * size explicitly provide the known correct value.
719          */
720         if (ashift == 0) {
721                 int sector_size;
722
723                 if (check_sector_size_database(path, &sector_size) == B_TRUE)
724                         ashift = highbit64(sector_size) - 1;
725         }
726
727         if (ashift > 0)
728                 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
729
730         return (vdev);
731 }
732
733 /*
734  * Go through and verify the replication level of the pool is consistent.
735  * Performs the following checks:
736  *
737  *      For the new spec, verifies that devices in mirrors and raidz are the
738  *      same size.
739  *
740  *      If the current configuration already has inconsistent replication
741  *      levels, ignore any other potential problems in the new spec.
742  *
743  *      Otherwise, make sure that the current spec (if there is one) and the new
744  *      spec have consistent replication levels.
745  */
/*
 * Summary of one top-level vdev's replication as filled in by
 * get_replication().
 */
746 typedef struct replication_level {
747         char *zprl_type;	/* vdev type string as looked up from the vdev's nvlist */
748         uint64_t zprl_children;	/* 1 for a plain file/disk; presumably the child count for group vdevs -- TODO confirm */
749         uint64_t zprl_parity;	/* ZPOOL_CONFIG_NPARITY for raidz vdevs, 0 otherwise */
750 } replication_level_t;
751
752 #define ZPOOL_FUZZ      (16 * 1024 * 1024)	/* 16 MiB; its use is not visible in this part of the file */
753
754 static boolean_t
755 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
756     replication_level_t **raidz, replication_level_t **mirror)
757 {
758         if (strcmp(a->zprl_type, "raidz") == 0 &&
759             strcmp(b->zprl_type, "mirror") == 0) {
760                 *raidz = a;
761                 *mirror = b;
762                 return (B_TRUE);
763         }
764         return (B_FALSE);
765 }
766
767 /*
768  * Given a list of toplevel vdevs, return the current replication level.  If
769  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
770  * an error message will be displayed for each self-inconsistent vdev.
771  */
772 static replication_level_t *
773 get_replication(nvlist_t *nvroot, boolean_t fatal)
774 {
775         nvlist_t **top;
776         uint_t t, toplevels;
777         nvlist_t **child;
778         uint_t c, children;
779         nvlist_t *nv;
780         char *type;
781         replication_level_t lastrep = {0};
782         replication_level_t rep;
783         replication_level_t *ret;
784         replication_level_t *raidz, *mirror;
785         boolean_t dontreport;
786
787         ret = safe_malloc(sizeof (replication_level_t));
788
789         verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
790             &top, &toplevels) == 0);
791
792         for (t = 0; t < toplevels; t++) {
793                 uint64_t is_log = B_FALSE;
794
795                 nv = top[t];
796
797                 /*
798                  * For separate logs we ignore the top level vdev replication
799                  * constraints.
800                  */
801                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
802                 if (is_log)
803                         continue;
804
805                 /* Ignore holes introduced by removing aux devices */
806                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
807                 if (strcmp(type, VDEV_TYPE_HOLE) == 0)
808                         continue;
809
810                 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
811                     &child, &children) != 0) {
812                         /*
813                          * This is a 'file' or 'disk' vdev.
814                          */
815                         rep.zprl_type = type;
816                         rep.zprl_children = 1;
817                         rep.zprl_parity = 0;
818                 } else {
819                         uint64_t vdev_size;
820
821                         /*
822                          * This is a mirror or RAID-Z vdev.  Go through and make
823                          * sure the contents are all the same (files vs. disks),
824                          * keeping track of the number of elements in the
825                          * process.
826                          *
827                          * We also check that the size of each vdev (if it can
828                          * be determined) is the same.
829                          */
830                         rep.zprl_type = type;
831                         rep.zprl_children = 0;
832
833                         if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
834                                 verify(nvlist_lookup_uint64(nv,
835                                     ZPOOL_CONFIG_NPARITY,
836                                     &rep.zprl_parity) == 0);
837                                 assert(rep.zprl_parity != 0);
838                         } else {
839                                 rep.zprl_parity = 0;
840                         }
841
842                         /*
843                          * The 'dontreport' variable indicates that we've
844                          * already reported an error for this spec, so don't
845                          * bother doing it again.
846                          */
847                         type = NULL;
848                         dontreport = 0;
849                         vdev_size = -1ULL;
850                         for (c = 0; c < children; c++) {
851                                 nvlist_t *cnv = child[c];
852                                 char *path;
853                                 struct stat64 statbuf;
854                                 uint64_t size = -1ULL;
855                                 char *childtype;
856                                 int fd, err;
857
858                                 rep.zprl_children++;
859
860                                 verify(nvlist_lookup_string(cnv,
861                                     ZPOOL_CONFIG_TYPE, &childtype) == 0);
862
863                                 /*
864                                  * If this is a replacing or spare vdev, then
865                                  * get the real first child of the vdev: do this
866                                  * in a loop because replacing and spare vdevs
867                                  * can be nested.
868                                  */
869                                 while (strcmp(childtype,
870                                     VDEV_TYPE_REPLACING) == 0 ||
871                                     strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
872                                         nvlist_t **rchild;
873                                         uint_t rchildren;
874
875                                         verify(nvlist_lookup_nvlist_array(cnv,
876                                             ZPOOL_CONFIG_CHILDREN, &rchild,
877                                             &rchildren) == 0);
878                                         assert(rchildren == 2);
879                                         cnv = rchild[0];
880
881                                         verify(nvlist_lookup_string(cnv,
882                                             ZPOOL_CONFIG_TYPE,
883                                             &childtype) == 0);
884                                 }
885
886                                 verify(nvlist_lookup_string(cnv,
887                                     ZPOOL_CONFIG_PATH, &path) == 0);
888
889                                 /*
890                                  * If we have a raidz/mirror that combines disks
891                                  * with files, report it as an error.
892                                  */
893                                 if (!dontreport && type != NULL &&
894                                     strcmp(type, childtype) != 0) {
895                                         if (ret != NULL)
896                                                 free(ret);
897                                         ret = NULL;
898                                         if (fatal)
899                                                 vdev_error(gettext(
900                                                     "mismatched replication "
901                                                     "level: %s contains both "
902                                                     "files and devices\n"),
903                                                     rep.zprl_type);
904                                         else
905                                                 return (NULL);
906                                         dontreport = B_TRUE;
907                                 }
908
909                                 /*
910                                  * According to stat(2), the value of 'st_size'
911                                  * is undefined for block devices and character
912                                  * devices.  But there is no effective way to
913                                  * determine the real size in userland.
914                                  *
915                                  * Instead, we'll take advantage of an
916                                  * implementation detail of spec_size().  If the
917                                  * device is currently open, then we (should)
918                                  * return a valid size.
919                                  *
920                                  * If we still don't get a valid size (indicated
921                                  * by a size of 0 or MAXOFFSET_T), then ignore
922                                  * this device altogether.
923                                  */
924                                 if ((fd = open(path, O_RDONLY)) >= 0) {
925                                         err = fstat64_blk(fd, &statbuf);
926                                         (void) close(fd);
927                                 } else {
928                                         err = stat64(path, &statbuf);
929                                 }
930
931                                 if (err != 0 ||
932                                     statbuf.st_size == 0 ||
933                                     statbuf.st_size == MAXOFFSET_T)
934                                         continue;
935
936                                 size = statbuf.st_size;
937
938                                 /*
939                                  * Also make sure that devices and
940                                  * slices have a consistent size.  If
941                                  * they differ by a significant amount
942                                  * (~16MB) then report an error.
943                                  */
944                                 if (!dontreport &&
945                                     (vdev_size != -1ULL &&
946                                     (labs(size - vdev_size) >
947                                     ZPOOL_FUZZ))) {
948                                         if (ret != NULL)
949                                                 free(ret);
950                                         ret = NULL;
951                                         if (fatal)
952                                                 vdev_error(gettext(
953                                                     "%s contains devices of "
954                                                     "different sizes\n"),
955                                                     rep.zprl_type);
956                                         else
957                                                 return (NULL);
958                                         dontreport = B_TRUE;
959                                 }
960
961                                 type = childtype;
962                                 vdev_size = size;
963                         }
964                 }
965
966                 /*
967                  * At this point, we have the replication of the last toplevel
968                  * vdev in 'rep'.  Compare it to 'lastrep' to see if its
969                  * different.
970                  */
971                 if (lastrep.zprl_type != NULL) {
972                         if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
973                             is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
974                                 /*
975                                  * Accepted raidz and mirror when they can
976                                  * handle the same number of disk failures.
977                                  */
978                                 if (raidz->zprl_parity !=
979                                     mirror->zprl_children - 1) {
980                                         if (ret != NULL)
981                                                 free(ret);
982                                         ret = NULL;
983                                         if (fatal)
984                                                 vdev_error(gettext(
985                                                     "mismatched replication "
986                                                     "level: "
987                                                     "%s and %s vdevs with "
988                                                     "different redundancy, "
989                                                     "%llu vs. %llu (%llu-way) "
990                                                     "are present\n"),
991                                                     raidz->zprl_type,
992                                                     mirror->zprl_type,
993                                                     raidz->zprl_parity,
994                                                     mirror->zprl_children - 1,
995                                                     mirror->zprl_children);
996                                         else
997                                                 return (NULL);
998                                 }
999                         } else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
1000                             0) {
1001                                 if (ret != NULL)
1002                                         free(ret);
1003                                 ret = NULL;
1004                                 if (fatal)
1005                                         vdev_error(gettext(
1006                                             "mismatched replication level: "
1007                                             "both %s and %s vdevs are "
1008                                             "present\n"),
1009                                             lastrep.zprl_type, rep.zprl_type);
1010                                 else
1011                                         return (NULL);
1012                         } else if (lastrep.zprl_parity != rep.zprl_parity) {
1013                                 if (ret)
1014                                         free(ret);
1015                                 ret = NULL;
1016                                 if (fatal)
1017                                         vdev_error(gettext(
1018                                             "mismatched replication level: "
1019                                             "both %llu and %llu device parity "
1020                                             "%s vdevs are present\n"),
1021                                             lastrep.zprl_parity,
1022                                             rep.zprl_parity,
1023                                             rep.zprl_type);
1024                                 else
1025                                         return (NULL);
1026                         } else if (lastrep.zprl_children != rep.zprl_children) {
1027                                 if (ret)
1028                                         free(ret);
1029                                 ret = NULL;
1030                                 if (fatal)
1031                                         vdev_error(gettext(
1032                                             "mismatched replication level: "
1033                                             "both %llu-way and %llu-way %s "
1034                                             "vdevs are present\n"),
1035                                             lastrep.zprl_children,
1036                                             rep.zprl_children,
1037                                             rep.zprl_type);
1038                                 else
1039                                         return (NULL);
1040                         }
1041                 }
1042                 lastrep = rep;
1043         }
1044
1045         if (ret != NULL)
1046                 *ret = rep;
1047
1048         return (ret);
1049 }
1050
1051 /*
1052  * Check the replication level of the vdev spec against the current pool.  Calls
1053  * get_replication() to make sure the new spec is self-consistent.  If the pool
1054  * has a consistent replication level, then we ignore any errors.  Otherwise,
1055  * report any difference between the two.
1056  */
1057 static int
1058 check_replication(nvlist_t *config, nvlist_t *newroot)
1059 {
1060         nvlist_t **child;
1061         uint_t  children;
1062         replication_level_t *current = NULL, *new;
1063         replication_level_t *raidz, *mirror;
1064         int ret;
1065
1066         /*
1067          * If we have a current pool configuration, check to see if it's
1068          * self-consistent.  If not, simply return success.
1069          */
1070         if (config != NULL) {
1071                 nvlist_t *nvroot;
1072
1073                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1074                     &nvroot) == 0);
1075                 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
1076                         return (0);
1077         }
1078         /*
1079          * for spares there may be no children, and therefore no
1080          * replication level to check
1081          */
1082         if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
1083             &child, &children) != 0) || (children == 0)) {
1084                 free(current);
1085                 return (0);
1086         }
1087
1088         /*
1089          * If all we have is logs then there's no replication level to check.
1090          */
1091         if (num_logs(newroot) == children) {
1092                 free(current);
1093                 return (0);
1094         }
1095
1096         /*
1097          * Get the replication level of the new vdev spec, reporting any
1098          * inconsistencies found.
1099          */
1100         if ((new = get_replication(newroot, B_TRUE)) == NULL) {
1101                 free(current);
1102                 return (-1);
1103         }
1104
1105         /*
1106          * Check to see if the new vdev spec matches the replication level of
1107          * the current pool.
1108          */
1109         ret = 0;
1110         if (current != NULL) {
1111                 if (is_raidz_mirror(current, new, &raidz, &mirror) ||
1112                     is_raidz_mirror(new, current, &raidz, &mirror)) {
1113                         if (raidz->zprl_parity != mirror->zprl_children - 1) {
1114                                 vdev_error(gettext(
1115                                     "mismatched replication level: pool and "
1116                                     "new vdev with different redundancy, %s "
1117                                     "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
1118                                     raidz->zprl_type,
1119                                     mirror->zprl_type,
1120                                     raidz->zprl_parity,
1121                                     mirror->zprl_children - 1,
1122                                     mirror->zprl_children);
1123                                 ret = -1;
1124                         }
1125                 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
1126                         vdev_error(gettext(
1127                             "mismatched replication level: pool uses %s "
1128                             "and new vdev is %s\n"),
1129                             current->zprl_type, new->zprl_type);
1130                         ret = -1;
1131                 } else if (current->zprl_parity != new->zprl_parity) {
1132                         vdev_error(gettext(
1133                             "mismatched replication level: pool uses %llu "
1134                             "device parity and new vdev uses %llu\n"),
1135                             current->zprl_parity, new->zprl_parity);
1136                         ret = -1;
1137                 } else if (current->zprl_children != new->zprl_children) {
1138                         vdev_error(gettext(
1139                             "mismatched replication level: pool uses %llu-way "
1140                             "%s and new vdev uses %llu-way %s\n"),
1141                             current->zprl_children, current->zprl_type,
1142                             new->zprl_children, new->zprl_type);
1143                         ret = -1;
1144                 }
1145         }
1146
1147         free(new);
1148         if (current != NULL)
1149                 free(current);
1150
1151         return (ret);
1152 }
1153
/*
 * Overwrite the first 4096 bytes of the file or device at 'path' with zeros
 * and flush the data with fdatasync().
 *
 * Returns 0 on success.  Returns -1, after printing a message to stderr, if
 * the path cannot be opened for exclusive writing, if the write fails, or if
 * fewer than 4096 bytes were written.
 */
static int
zero_label(char *path)
{
	enum { ZERO_LABEL_SIZE = 4096 };
	const int size = ZERO_LABEL_SIZE;
	char buf[ZERO_LABEL_SIZE];
	ssize_t written;
	int fd, saved_errno;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		/* Capture errno before gettext()/fprintf() can disturb it. */
		saved_errno = errno;
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(saved_errno));
		return (-1);
	}

	memset(buf, 0, sizeof (buf));
	written = write(fd, buf, sizeof (buf));
	/*
	 * Remember why write() failed: fdatasync() and close() below may
	 * overwrite errno before the error is reported.
	 */
	saved_errno = errno;
	(void) fdatasync(fd);
	(void) close(fd);

	if (written == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), size, path, strerror(saved_errno));
		return (-1);
	}

	if (written != size) {
		/* A short write is treated as a failure, not retried. */
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), (int)written, size, path);
		return (-1);
	}

	return (0);
}
1186
1187 /*
1188  * Go through and find any whole disks in the vdev specification, labelling them
1189  * as appropriate.  When constructing the vdev spec, we were unable to open this
1190  * device in order to provide a devid.  Now that we have labelled the disk and
1191  * know that slice 0 is valid, we can construct the devid now.
1192  *
1193  * If the disk was already labeled with an EFI label, we will have gotten the
1194  * devid already (because we were able to open the whole disk).  Otherwise, we
1195  * need to get the devid after we label the disk.
1196  */
1197 static int
1198 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
1199 {
1200         nvlist_t **child;
1201         uint_t c, children;
1202         char *type, *path;
1203         char devpath[MAXPATHLEN];
1204         char udevpath[MAXPATHLEN];
1205         uint64_t wholedisk;
1206         struct stat64 statbuf;
1207         int is_exclusive = 0;
1208         int fd;
1209         int ret;
1210
1211         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1212
1213         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1214             &child, &children) != 0) {
1215
1216                 if (strcmp(type, VDEV_TYPE_DISK) != 0)
1217                         return (0);
1218
1219                 /*
1220                  * We have a disk device.  If this is a whole disk write
1221                  * out the efi partition table, otherwise write zero's to
1222                  * the first 4k of the partition.  This is to ensure that
1223                  * libblkid will not misidentify the partition due to a
1224                  * magic value left by the previous filesystem.
1225                  */
1226                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1227                 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1228                     &wholedisk));
1229
1230                 if (!wholedisk) {
1231                         /*
1232                          * Update device id string for mpath nodes (Linux only)
1233                          */
1234                         if (is_mpath_whole_disk(path))
1235                                 update_vdev_config_dev_strs(nv);
1236
1237                         if (!is_spare(NULL, path))
1238                                 (void) zero_label(path);
1239                         return (0);
1240                 }
1241
1242                 if (realpath(path, devpath) == NULL) {
1243                         ret = errno;
1244                         (void) fprintf(stderr,
1245                             gettext("cannot resolve path '%s'\n"), path);
1246                         return (ret);
1247                 }
1248
1249                 /*
1250                  * Remove any previously existing symlink from a udev path to
1251                  * the device before labeling the disk.  This ensures that
1252                  * only newly created links are used.  Otherwise there is a
1253                  * window between when udev deletes and recreates the link
1254                  * during which access attempts will fail with ENOENT.
1255                  */
1256                 strlcpy(udevpath, path, MAXPATHLEN);
1257                 (void) zfs_append_partition(udevpath, MAXPATHLEN);
1258
1259                 fd = open(devpath, O_RDWR|O_EXCL);
1260                 if (fd == -1) {
1261                         if (errno == EBUSY)
1262                                 is_exclusive = 1;
1263                 } else {
1264                         (void) close(fd);
1265                 }
1266
1267                 /*
1268                  * If the partition exists, contains a valid spare label,
1269                  * and is opened exclusively there is no need to partition
1270                  * it.  Hot spares have already been partitioned and are
1271                  * held open exclusively by the kernel as a safety measure.
1272                  *
1273                  * If the provided path is for a /dev/disk/ device its
1274                  * symbolic link will be removed, partition table created,
1275                  * and then block until udev creates the new link.
1276                  */
1277                 if (!is_exclusive || !is_spare(NULL, udevpath)) {
1278                         char *devnode = strrchr(devpath, '/') + 1;
1279
1280                         ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
1281                         if (ret == 0) {
1282                                 ret = lstat64(udevpath, &statbuf);
1283                                 if (ret == 0 && S_ISLNK(statbuf.st_mode))
1284                                         (void) unlink(udevpath);
1285                         }
1286
1287                         /*
1288                          * When labeling a pool the raw device node name
1289                          * is provided as it appears under /dev/.
1290                          */
1291                         if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
1292                                 return (-1);
1293
1294                         /*
1295                          * Wait for udev to signal the device is available
1296                          * by the provided path.
1297                          */
1298                         ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
1299                         if (ret) {
1300                                 (void) fprintf(stderr,
1301                                     gettext("missing link: %s was "
1302                                     "partitioned but %s is missing\n"),
1303                                     devnode, udevpath);
1304                                 return (ret);
1305                         }
1306
1307                         ret = zero_label(udevpath);
1308                         if (ret)
1309                                 return (ret);
1310                 }
1311
1312                 /*
1313                  * Update the path to refer to the partition.  The presence of
1314                  * the 'whole_disk' field indicates to the CLI that we should
1315                  * chop off the partition number when displaying the device in
1316                  * future output.
1317                  */
1318                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
1319
1320                 /*
1321                  * Update device id strings for whole disks (Linux only)
1322                  */
1323                 update_vdev_config_dev_strs(nv);
1324
1325                 return (0);
1326         }
1327
1328         for (c = 0; c < children; c++)
1329                 if ((ret = make_disks(zhp, child[c])) != 0)
1330                         return (ret);
1331
1332         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1333             &child, &children) == 0)
1334                 for (c = 0; c < children; c++)
1335                         if ((ret = make_disks(zhp, child[c])) != 0)
1336                                 return (ret);
1337
1338         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1339             &child, &children) == 0)
1340                 for (c = 0; c < children; c++)
1341                         if ((ret = make_disks(zhp, child[c])) != 0)
1342                                 return (ret);
1343
1344         return (0);
1345 }
1346
1347 /*
1348  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1349  * the majority of this task.
1350  */
1351 static boolean_t
1352 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1353     boolean_t replacing, boolean_t isspare)
1354 {
1355         nvlist_t **child;
1356         uint_t c, children;
1357         char *type, *path;
1358         int ret = 0;
1359         char buf[MAXPATHLEN];
1360         uint64_t wholedisk = B_FALSE;
1361         boolean_t anyinuse = B_FALSE;
1362
1363         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1364
1365         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1366             &child, &children) != 0) {
1367
1368                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1369                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1370                         verify(!nvlist_lookup_uint64(nv,
1371                             ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1372
1373                 /*
1374                  * As a generic check, we look to see if this is a replace of a
1375                  * hot spare within the same pool.  If so, we allow it
1376                  * regardless of what libblkid or zpool_in_use() says.
1377                  */
1378                 if (replacing) {
1379                         (void) strlcpy(buf, path, sizeof (buf));
1380                         if (wholedisk) {
1381                                 ret = zfs_append_partition(buf,  sizeof (buf));
1382                                 if (ret == -1)
1383                                         return (-1);
1384                         }
1385
1386                         if (is_spare(config, buf))
1387                                 return (B_FALSE);
1388                 }
1389
1390                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1391                         ret = check_device(path, force, isspare, wholedisk);
1392
1393                 else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1394                         ret = check_file(path, force, isspare);
1395
1396                 return (ret != 0);
1397         }
1398
1399         for (c = 0; c < children; c++)
1400                 if (is_device_in_use(config, child[c], force, replacing,
1401                     B_FALSE))
1402                         anyinuse = B_TRUE;
1403
1404         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1405             &child, &children) == 0)
1406                 for (c = 0; c < children; c++)
1407                         if (is_device_in_use(config, child[c], force, replacing,
1408                             B_TRUE))
1409                                 anyinuse = B_TRUE;
1410
1411         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1412             &child, &children) == 0)
1413                 for (c = 0; c < children; c++)
1414                         if (is_device_in_use(config, child[c], force, replacing,
1415                             B_FALSE))
1416                                 anyinuse = B_TRUE;
1417
1418         return (anyinuse);
1419 }
1420
1421 static const char *
1422 is_grouping(const char *type, int *mindev, int *maxdev)
1423 {
1424         if (strncmp(type, "raidz", 5) == 0) {
1425                 const char *p = type + 5;
1426                 char *end;
1427                 long nparity;
1428
1429                 if (*p == '\0') {
1430                         nparity = 1;
1431                 } else if (*p == '0') {
1432                         return (NULL); /* no zero prefixes allowed */
1433                 } else {
1434                         errno = 0;
1435                         nparity = strtol(p, &end, 10);
1436                         if (errno != 0 || nparity < 1 || nparity >= 255 ||
1437                             *end != '\0')
1438                                 return (NULL);
1439                 }
1440
1441                 if (mindev != NULL)
1442                         *mindev = nparity + 1;
1443                 if (maxdev != NULL)
1444                         *maxdev = 255;
1445                 return (VDEV_TYPE_RAIDZ);
1446         }
1447
1448         if (maxdev != NULL)
1449                 *maxdev = INT_MAX;
1450
1451         if (strcmp(type, "mirror") == 0) {
1452                 if (mindev != NULL)
1453                         *mindev = 2;
1454                 return (VDEV_TYPE_MIRROR);
1455         }
1456
1457         if (strcmp(type, "spare") == 0) {
1458                 if (mindev != NULL)
1459                         *mindev = 1;
1460                 return (VDEV_TYPE_SPARE);
1461         }
1462
1463         if (strcmp(type, "log") == 0) {
1464                 if (mindev != NULL)
1465                         *mindev = 1;
1466                 return (VDEV_TYPE_LOG);
1467         }
1468
1469         if (strcmp(type, "cache") == 0) {
1470                 if (mindev != NULL)
1471                         *mindev = 1;
1472                 return (VDEV_TYPE_L2CACHE);
1473         }
1474
1475         return (NULL);
1476 }
1477
1478 /*
1479  * Construct a syntactically valid vdev specification,
1480  * and ensure that all devices and files exist and can be opened.
1481  * Note: we don't bother freeing anything in the error paths
1482  * because the program is just going to exit anyway.
1483  */
1484 nvlist_t *
1485 construct_spec(nvlist_t *props, int argc, char **argv)
1486 {
1487         nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1488         int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1489         const char *type;
1490         uint64_t is_log;
1491         boolean_t seen_logs;
1492
1493         top = NULL;
1494         toplevels = 0;
1495         spares = NULL;
1496         l2cache = NULL;
1497         nspares = 0;
1498         nlogs = 0;
1499         nl2cache = 0;
1500         is_log = B_FALSE;
1501         seen_logs = B_FALSE;
1502         nvroot = NULL;
1503
1504         while (argc > 0) {
1505                 nv = NULL;
1506
1507                 /*
1508                  * If it's a mirror or raidz, the subsequent arguments are
1509                  * its leaves -- until we encounter the next mirror or raidz.
1510                  */
1511                 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1512                         nvlist_t **child = NULL;
1513                         int c, children = 0;
1514
1515                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1516                                 if (spares != NULL) {
1517                                         (void) fprintf(stderr,
1518                                             gettext("invalid vdev "
1519                                             "specification: 'spare' can be "
1520                                             "specified only once\n"));
1521                                         goto spec_out;
1522                                 }
1523                                 is_log = B_FALSE;
1524                         }
1525
1526                         if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1527                                 if (seen_logs) {
1528                                         (void) fprintf(stderr,
1529                                             gettext("invalid vdev "
1530                                             "specification: 'log' can be "
1531                                             "specified only once\n"));
1532                                         goto spec_out;
1533                                 }
1534                                 seen_logs = B_TRUE;
1535                                 is_log = B_TRUE;
1536                                 argc--;
1537                                 argv++;
1538                                 /*
1539                                  * A log is not a real grouping device.
1540                                  * We just set is_log and continue.
1541                                  */
1542                                 continue;
1543                         }
1544
1545                         if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1546                                 if (l2cache != NULL) {
1547                                         (void) fprintf(stderr,
1548                                             gettext("invalid vdev "
1549                                             "specification: 'cache' can be "
1550                                             "specified only once\n"));
1551                                         goto spec_out;
1552                                 }
1553                                 is_log = B_FALSE;
1554                         }
1555
1556                         if (is_log) {
1557                                 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1558                                         (void) fprintf(stderr,
1559                                             gettext("invalid vdev "
1560                                             "specification: unsupported 'log' "
1561                                             "device: %s\n"), type);
1562                                         goto spec_out;
1563                                 }
1564                                 nlogs++;
1565                         }
1566
1567                         for (c = 1; c < argc; c++) {
1568                                 if (is_grouping(argv[c], NULL, NULL) != NULL)
1569                                         break;
1570                                 children++;
1571                                 child = realloc(child,
1572                                     children * sizeof (nvlist_t *));
1573                                 if (child == NULL)
1574                                         zpool_no_memory();
1575                                 if ((nv = make_leaf_vdev(props, argv[c],
1576                                     B_FALSE)) == NULL) {
1577                                         for (c = 0; c < children - 1; c++)
1578                                                 nvlist_free(child[c]);
1579                                         free(child);
1580                                         goto spec_out;
1581                                 }
1582
1583                                 child[children - 1] = nv;
1584                         }
1585
1586                         if (children < mindev) {
1587                                 (void) fprintf(stderr, gettext("invalid vdev "
1588                                     "specification: %s requires at least %d "
1589                                     "devices\n"), argv[0], mindev);
1590                                 for (c = 0; c < children; c++)
1591                                         nvlist_free(child[c]);
1592                                 free(child);
1593                                 goto spec_out;
1594                         }
1595
1596                         if (children > maxdev) {
1597                                 (void) fprintf(stderr, gettext("invalid vdev "
1598                                     "specification: %s supports no more than "
1599                                     "%d devices\n"), argv[0], maxdev);
1600                                 for (c = 0; c < children; c++)
1601                                         nvlist_free(child[c]);
1602                                 free(child);
1603                                 goto spec_out;
1604                         }
1605
1606                         argc -= c;
1607                         argv += c;
1608
1609                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1610                                 spares = child;
1611                                 nspares = children;
1612                                 continue;
1613                         } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1614                                 l2cache = child;
1615                                 nl2cache = children;
1616                                 continue;
1617                         } else {
1618                                 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1619                                     0) == 0);
1620                                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1621                                     type) == 0);
1622                                 verify(nvlist_add_uint64(nv,
1623                                     ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1624                                 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1625                                         verify(nvlist_add_uint64(nv,
1626                                             ZPOOL_CONFIG_NPARITY,
1627                                             mindev - 1) == 0);
1628                                 }
1629                                 verify(nvlist_add_nvlist_array(nv,
1630                                     ZPOOL_CONFIG_CHILDREN, child,
1631                                     children) == 0);
1632
1633                                 for (c = 0; c < children; c++)
1634                                         nvlist_free(child[c]);
1635                                 free(child);
1636                         }
1637                 } else {
1638                         /*
1639                          * We have a device.  Pass off to make_leaf_vdev() to
1640                          * construct the appropriate nvlist describing the vdev.
1641                          */
1642                         if ((nv = make_leaf_vdev(props, argv[0],
1643                             is_log)) == NULL)
1644                                 goto spec_out;
1645
1646                         if (is_log)
1647                                 nlogs++;
1648                         argc--;
1649                         argv++;
1650                 }
1651
1652                 toplevels++;
1653                 top = realloc(top, toplevels * sizeof (nvlist_t *));
1654                 if (top == NULL)
1655                         zpool_no_memory();
1656                 top[toplevels - 1] = nv;
1657         }
1658
1659         if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1660                 (void) fprintf(stderr, gettext("invalid vdev "
1661                     "specification: at least one toplevel vdev must be "
1662                     "specified\n"));
1663                 goto spec_out;
1664         }
1665
1666         if (seen_logs && nlogs == 0) {
1667                 (void) fprintf(stderr, gettext("invalid vdev specification: "
1668                     "log requires at least 1 device\n"));
1669                 goto spec_out;
1670         }
1671
1672         /*
1673          * Finally, create nvroot and add all top-level vdevs to it.
1674          */
1675         verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1676         verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1677             VDEV_TYPE_ROOT) == 0);
1678         verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1679             top, toplevels) == 0);
1680         if (nspares != 0)
1681                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1682                     spares, nspares) == 0);
1683         if (nl2cache != 0)
1684                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1685                     l2cache, nl2cache) == 0);
1686
1687 spec_out:
1688         for (t = 0; t < toplevels; t++)
1689                 nvlist_free(top[t]);
1690         for (t = 0; t < nspares; t++)
1691                 nvlist_free(spares[t]);
1692         for (t = 0; t < nl2cache; t++)
1693                 nvlist_free(l2cache[t]);
1694
1695         free(spares);
1696         free(l2cache);
1697         free(top);
1698
1699         return (nvroot);
1700 }
1701
1702 nvlist_t *
1703 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1704     splitflags_t flags, int argc, char **argv)
1705 {
1706         nvlist_t *newroot = NULL, **child;
1707         uint_t c, children;
1708
1709         if (argc > 0) {
1710                 if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1711                         (void) fprintf(stderr, gettext("Unable to build a "
1712                             "pool from the specified devices\n"));
1713                         return (NULL);
1714                 }
1715
1716                 if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1717                         nvlist_free(newroot);
1718                         return (NULL);
1719                 }
1720
1721                 /* avoid any tricks in the spec */
1722                 verify(nvlist_lookup_nvlist_array(newroot,
1723                     ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1724                 for (c = 0; c < children; c++) {
1725                         char *path;
1726                         const char *type;
1727                         int min, max;
1728
1729                         verify(nvlist_lookup_string(child[c],
1730                             ZPOOL_CONFIG_PATH, &path) == 0);
1731                         if ((type = is_grouping(path, &min, &max)) != NULL) {
1732                                 (void) fprintf(stderr, gettext("Cannot use "
1733                                     "'%s' as a device for splitting\n"), type);
1734                                 nvlist_free(newroot);
1735                                 return (NULL);
1736                         }
1737                 }
1738         }
1739
1740         if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1741                 nvlist_free(newroot);
1742                 return (NULL);
1743         }
1744
1745         return (newroot);
1746 }
1747
/*
 * Get and validate the contents of the given vdev specification.  This ensures
 * that the nvlist returned is well-formed, that all the devices exist, and that
 * they are not currently in use by any other known consumer.  The 'zhp'
 * handle supplies the current configuration of the pool when adding devices
 * to an existing pool, and is used to perform additional checks, such as
 * changing the replication level of the pool.  It can be 'NULL' to indicate
 * that this is a new pool.  The 'force' flag controls whether devices should
 * be forcefully added, even if they appear in use.
 */
1758 nvlist_t *
1759 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1760     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1761 {
1762         nvlist_t *newroot;
1763         nvlist_t *poolconfig = NULL;
1764         is_force = force;
1765
1766         /*
1767          * Construct the vdev specification.  If this is successful, we know
1768          * that we have a valid specification, and that all devices can be
1769          * opened.
1770          */
1771         if ((newroot = construct_spec(props, argc, argv)) == NULL)
1772                 return (NULL);
1773
1774         if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
1775                 nvlist_free(newroot);
1776                 return (NULL);
1777         }
1778
1779         /*
1780          * Validate each device to make sure that its not shared with another
1781          * subsystem.  We do this even if 'force' is set, because there are some
1782          * uses (such as a dedicated dump device) that even '-f' cannot
1783          * override.
1784          */
1785         if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1786                 nvlist_free(newroot);
1787                 return (NULL);
1788         }
1789
1790         /*
1791          * Check the replication level of the given vdevs and report any errors
1792          * found.  We include the existing pool spec, if any, as we need to
1793          * catch changes against the existing replication level.
1794          */
1795         if (check_rep && check_replication(poolconfig, newroot) != 0) {
1796                 nvlist_free(newroot);
1797                 return (NULL);
1798         }
1799
1800         /*
1801          * Run through the vdev specification and label any whole disks found.
1802          */
1803         if (!dryrun && make_disks(zhp, newroot) != 0) {
1804                 nvlist_free(newroot);
1805                 return (NULL);
1806         }
1807
1808         return (newroot);
1809 }