]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - cmd/zpool/zpool_vdev.c
Improve `zpool labelclear`
[FreeBSD/FreeBSD.git] / cmd / zpool / zpool_vdev.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2016, 2017 Intel Corporation.
26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27  */
28
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  *      Device vdevs
34  *              disk=(path=..., devid=...)
35  *              file=(path=...)
36  *
37  *      Group vdevs
38  *              raidz[1|2]=(...)
39  *              mirror=(...)
40  *
41  *      Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'make_root_vdev'.  The
52  * function performs several passes:
53  *
54  *      1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
 *      2. Check for devices in use.  Uses libblkid to make sure that none of
 *         the devices are already in use.  Some conflicts can be overridden
 *         using the 'force' flag, others cannot.
 *      3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
62  *      4. Call libzfs to label any whole disks with an EFI label.
63  */
64
65 #include <assert.h>
66 #include <ctype.h>
67 #include <devid.h>
68 #include <errno.h>
69 #include <fcntl.h>
70 #include <libintl.h>
71 #include <libnvpair.h>
72 #include <libzutil.h>
73 #include <limits.h>
74 #include <sys/spa.h>
75 #include <scsi/scsi.h>
76 #include <scsi/sg.h>
77 #include <stdio.h>
78 #include <string.h>
79 #include <unistd.h>
80 #include <sys/efi_partition.h>
81 #include <sys/stat.h>
82 #include <sys/vtoc.h>
83 #include <sys/mntent.h>
84 #include <uuid/uuid.h>
85 #include <blkid/blkid.h>
86 #include "zpool_util.h"
87 #include <sys/zfs_context.h>
88
/*
 * For any given vdev specification, we can have multiple errors.  The
 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if it's the first error we've seen.
 */
/* Set to B_TRUE by vdev_error() once the header has been printed. */
boolean_t error_seen;
/* Read by vdev_error() to choose which header text is printed. */
boolean_t is_force;
96
/*
 * One row of the sector-size override table.
 *
 * 'id' holds exactly 24 bytes that check_sector_size_database() compares
 * with memcmp() against bytes 8..31 of a device's INQUIRY reply; it is not
 * NUL-terminated when all 24 bytes are used.  'sector_size' is the value
 * reported back to the caller when the entry matches.
 */
typedef struct vdev_disk_db_entry
{
        char id[24];
        int sector_size;
} vdev_disk_db_entry_t;
102
/*
 * Database of block devices that lie about physical sector sizes.  The
 * identification string must be precisely 24 characters to avoid false
 * negatives: entries are compared with a fixed-length memcmp(), so shorter
 * names are padded with trailing spaces.
 */
static vdev_disk_db_entry_t vdev_disk_database[] = {
        {"ATA     ADATA SSD S396 3", 8192},
        {"ATA     APPLE SSD SM128E", 8192},
        {"ATA     APPLE SSD SM256E", 8192},
        {"ATA     APPLE SSD SM512E", 8192},
        {"ATA     APPLE SSD SM768E", 8192},
        {"ATA     C400-MTFDDAC064M", 8192},
        {"ATA     C400-MTFDDAC128M", 8192},
        {"ATA     C400-MTFDDAC256M", 8192},
        {"ATA     C400-MTFDDAC512M", 8192},
        {"ATA     Corsair Force 3 ", 8192},
        {"ATA     Corsair Force GS", 8192},
        {"ATA     INTEL SSDSA2CT04", 8192},
        {"ATA     INTEL SSDSA2BZ10", 8192},
        {"ATA     INTEL SSDSA2BZ20", 8192},
        {"ATA     INTEL SSDSA2BZ30", 8192},
        {"ATA     INTEL SSDSA2CW04", 8192},
        {"ATA     INTEL SSDSA2CW08", 8192},
        {"ATA     INTEL SSDSA2CW12", 8192},
        {"ATA     INTEL SSDSA2CW16", 8192},
        {"ATA     INTEL SSDSA2CW30", 8192},
        {"ATA     INTEL SSDSA2CW60", 8192},
        {"ATA     INTEL SSDSC2CT06", 8192},
        {"ATA     INTEL SSDSC2CT12", 8192},
        {"ATA     INTEL SSDSC2CT18", 8192},
        {"ATA     INTEL SSDSC2CT24", 8192},
        {"ATA     INTEL SSDSC2CW06", 8192},
        {"ATA     INTEL SSDSC2CW12", 8192},
        {"ATA     INTEL SSDSC2CW18", 8192},
        {"ATA     INTEL SSDSC2CW24", 8192},
        {"ATA     INTEL SSDSC2CW48", 8192},
        {"ATA     KINGSTON SH100S3", 8192},
        {"ATA     KINGSTON SH103S3", 8192},
        {"ATA     M4-CT064M4SSD2  ", 8192},
        {"ATA     M4-CT128M4SSD2  ", 8192},
        {"ATA     M4-CT256M4SSD2  ", 8192},
        {"ATA     M4-CT512M4SSD2  ", 8192},
        {"ATA     OCZ-AGILITY2    ", 8192},
        {"ATA     OCZ-AGILITY3    ", 8192},
        {"ATA     OCZ-VERTEX2 3.5 ", 8192},
        {"ATA     OCZ-VERTEX3     ", 8192},
        {"ATA     OCZ-VERTEX3 LT  ", 8192},
        {"ATA     OCZ-VERTEX3 MI  ", 8192},
        {"ATA     OCZ-VERTEX4     ", 8192},
        {"ATA     SAMSUNG MZ7WD120", 8192},
        {"ATA     SAMSUNG MZ7WD240", 8192},
        {"ATA     SAMSUNG MZ7WD480", 8192},
        {"ATA     SAMSUNG MZ7WD960", 8192},
        {"ATA     SAMSUNG SSD 830 ", 8192},
        {"ATA     Samsung SSD 840 ", 8192},
        {"ATA     SanDisk SSD U100", 8192},
        {"ATA     TOSHIBA THNSNH06", 8192},
        {"ATA     TOSHIBA THNSNH12", 8192},
        {"ATA     TOSHIBA THNSNH25", 8192},
        {"ATA     TOSHIBA THNSNH51", 8192},
        {"ATA     APPLE SSD TS064C", 4096},
        {"ATA     APPLE SSD TS128C", 4096},
        {"ATA     APPLE SSD TS256C", 4096},
        {"ATA     APPLE SSD TS512C", 4096},
        {"ATA     INTEL SSDSA2M040", 4096},
        {"ATA     INTEL SSDSA2M080", 4096},
        {"ATA     INTEL SSDSA2M160", 4096},
        {"ATA     INTEL SSDSC2MH12", 4096},
        {"ATA     INTEL SSDSC2MH25", 4096},
        {"ATA     OCZ CORE_SSD    ", 4096},
        {"ATA     OCZ-VERTEX      ", 4096},
        {"ATA     SAMSUNG MCCOE32G", 4096},
        {"ATA     SAMSUNG MCCOE64G", 4096},
        {"ATA     SAMSUNG SSD PM80", 4096},
        /* Flash drives optimized for 4KB IOs on larger pages */
        {"ATA     INTEL SSDSC2BA10", 4096},
        {"ATA     INTEL SSDSC2BA20", 4096},
        {"ATA     INTEL SSDSC2BA40", 4096},
        {"ATA     INTEL SSDSC2BA80", 4096},
        {"ATA     INTEL SSDSC2BB08", 4096},
        {"ATA     INTEL SSDSC2BB12", 4096},
        {"ATA     INTEL SSDSC2BB16", 4096},
        {"ATA     INTEL SSDSC2BB24", 4096},
        {"ATA     INTEL SSDSC2BB30", 4096},
        {"ATA     INTEL SSDSC2BB40", 4096},
        {"ATA     INTEL SSDSC2BB48", 4096},
        {"ATA     INTEL SSDSC2BB60", 4096},
        {"ATA     INTEL SSDSC2BB80", 4096},
        {"ATA     INTEL SSDSC2BW24", 4096},
        {"ATA     INTEL SSDSC2BW48", 4096},
        {"ATA     INTEL SSDSC2BP24", 4096},
        {"ATA     INTEL SSDSC2BP48", 4096},
        {"NA      SmrtStorSDLKAE9W", 4096},
        {"NVMe    Amazon EC2 NVMe ", 4096},
        /* Imported from Open Solaris */
        {"ATA     MARVELL SD88SA02", 4096},
        /* Advanced format Hard drives */
        {"ATA     Hitachi HDS5C303", 4096},
        {"ATA     SAMSUNG HD204UI ", 4096},
        {"ATA     ST2000DL004 HD20", 4096},
        {"ATA     WDC WD10EARS-00M", 4096},
        {"ATA     WDC WD10EARS-00S", 4096},
        {"ATA     WDC WD10EARS-00Z", 4096},
        {"ATA     WDC WD15EARS-00M", 4096},
        {"ATA     WDC WD15EARS-00S", 4096},
        {"ATA     WDC WD15EARS-00Z", 4096},
        {"ATA     WDC WD20EARS-00M", 4096},
        {"ATA     WDC WD20EARS-00S", 4096},
        {"ATA     WDC WD20EARS-00Z", 4096},
        {"ATA     WDC WD1600BEVT-0", 4096},
        {"ATA     WDC WD2500BEVT-0", 4096},
        {"ATA     WDC WD3200BEVT-0", 4096},
        {"ATA     WDC WD5000BEVT-0", 4096},
        /* Virtual disks: Assume zvols with default volblocksize */
        /* (these entries are compiled out by the '#if 0' below) */
#if 0
        {"ATA     QEMU HARDDISK   ", 8192},
        {"IET     VIRTUAL-DISK    ", 8192},
        {"OI      COMSTAR         ", 8192},
        {"SUN     COMSTAR         ", 8192},
        {"NETAPP  LUN             ", 8192},
#endif
};

/* Number of entries in vdev_disk_database, computed at compile time. */
static const int vdev_disk_database_size =
        sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
228
229 #define INQ_REPLY_LEN   96
230 #define INQ_CMD_LEN     6
231
232 static boolean_t
233 check_sector_size_database(char *path, int *sector_size)
234 {
235         unsigned char inq_buff[INQ_REPLY_LEN];
236         unsigned char sense_buffer[32];
237         unsigned char inq_cmd_blk[INQ_CMD_LEN] =
238             {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
239         sg_io_hdr_t io_hdr;
240         int error;
241         int fd;
242         int i;
243
244         /* Prepare INQUIRY command */
245         memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
246         io_hdr.interface_id = 'S';
247         io_hdr.cmd_len = sizeof (inq_cmd_blk);
248         io_hdr.mx_sb_len = sizeof (sense_buffer);
249         io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
250         io_hdr.dxfer_len = INQ_REPLY_LEN;
251         io_hdr.dxferp = inq_buff;
252         io_hdr.cmdp = inq_cmd_blk;
253         io_hdr.sbp = sense_buffer;
254         io_hdr.timeout = 10;            /* 10 milliseconds is ample time */
255
256         if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
257                 return (B_FALSE);
258
259         error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
260
261         (void) close(fd);
262
263         if (error < 0)
264                 return (B_FALSE);
265
266         if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
267                 return (B_FALSE);
268
269         for (i = 0; i < vdev_disk_database_size; i++) {
270                 if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
271                         continue;
272
273                 *sector_size = vdev_disk_database[i].sector_size;
274                 return (B_TRUE);
275         }
276
277         return (B_FALSE);
278 }
279
280 /*PRINTFLIKE1*/
281 static void
282 vdev_error(const char *fmt, ...)
283 {
284         va_list ap;
285
286         if (!error_seen) {
287                 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
288                 if (!is_force)
289                         (void) fprintf(stderr, gettext("use '-f' to override "
290                             "the following errors:\n"));
291                 else
292                         (void) fprintf(stderr, gettext("the following errors "
293                             "must be manually repaired:\n"));
294                 error_seen = B_TRUE;
295         }
296
297         va_start(ap, fmt);
298         (void) vfprintf(stderr, fmt, ap);
299         va_end(ap);
300 }
301
302 /*
303  * Check that a file is valid.  All we can do in this case is check that it's
304  * not in use by another pool, and not in use by swap.
305  */
306 static int
307 check_file(const char *file, boolean_t force, boolean_t isspare)
308 {
309         char  *name;
310         int fd;
311         int ret = 0;
312         pool_state_t state;
313         boolean_t inuse;
314
315         if ((fd = open(file, O_RDONLY)) < 0)
316                 return (0);
317
318         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
319                 const char *desc;
320
321                 switch (state) {
322                 case POOL_STATE_ACTIVE:
323                         desc = gettext("active");
324                         break;
325
326                 case POOL_STATE_EXPORTED:
327                         desc = gettext("exported");
328                         break;
329
330                 case POOL_STATE_POTENTIALLY_ACTIVE:
331                         desc = gettext("potentially active");
332                         break;
333
334                 default:
335                         desc = gettext("unknown");
336                         break;
337                 }
338
339                 /*
340                  * Allow hot spares to be shared between pools.
341                  */
342                 if (state == POOL_STATE_SPARE && isspare) {
343                         free(name);
344                         (void) close(fd);
345                         return (0);
346                 }
347
348                 if (state == POOL_STATE_ACTIVE ||
349                     state == POOL_STATE_SPARE || !force) {
350                         switch (state) {
351                         case POOL_STATE_SPARE:
352                                 vdev_error(gettext("%s is reserved as a hot "
353                                     "spare for pool %s\n"), file, name);
354                                 break;
355                         default:
356                                 vdev_error(gettext("%s is part of %s pool "
357                                     "'%s'\n"), file, desc, name);
358                                 break;
359                         }
360                         ret = -1;
361                 }
362
363                 free(name);
364         }
365
366         (void) close(fd);
367         return (ret);
368 }
369
370 static int
371 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
372 {
373         int err;
374         char *value;
375
376         /* No valid type detected device is safe to use */
377         value = blkid_get_tag_value(cache, "TYPE", path);
378         if (value == NULL)
379                 return (0);
380
381         /*
382          * If libblkid detects a ZFS device, we check the device
383          * using check_file() to see if it's safe.  The one safe
384          * case is a spare device shared between multiple pools.
385          */
386         if (strcmp(value, "zfs_member") == 0) {
387                 err = check_file(path, force, isspare);
388         } else {
389                 if (force) {
390                         err = 0;
391                 } else {
392                         err = -1;
393                         vdev_error(gettext("%s contains a filesystem of "
394                             "type '%s'\n"), path, value);
395                 }
396         }
397
398         free(value);
399
400         return (err);
401 }
402
403 /*
404  * Validate that a disk including all partitions are safe to use.
405  *
406  * For EFI labeled disks this can done relatively easily with the libefi
407  * library.  The partition numbers are extracted from the label and used
408  * to generate the expected /dev/ paths.  Each partition can then be
409  * checked for conflicts.
410  *
411  * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
412  * but due to the lack of a readily available libraries this scanning is
413  * not implemented.  Instead only the device path as given is checked.
414  */
415 static int
416 check_disk(const char *path, blkid_cache cache, int force,
417     boolean_t isspare, boolean_t iswholedisk)
418 {
419         struct dk_gpt *vtoc;
420         char slice_path[MAXPATHLEN];
421         int err = 0;
422         int fd, i;
423         int flags = O_RDONLY|O_DIRECT;
424
425         if (!iswholedisk)
426                 return (check_slice(path, cache, force, isspare));
427
428         /* only spares can be shared, other devices require exclusive access */
429         if (!isspare)
430                 flags |= O_EXCL;
431
432         if ((fd = open(path, flags)) < 0) {
433                 char *value = blkid_get_tag_value(cache, "TYPE", path);
434                 (void) fprintf(stderr, gettext("%s is in use and contains "
435                     "a %s filesystem.\n"), path, value ? value : "unknown");
436                 return (-1);
437         }
438
439         /*
440          * Expected to fail for non-EFI labled disks.  Just check the device
441          * as given and do not attempt to detect and scan partitions.
442          */
443         err = efi_alloc_and_read(fd, &vtoc);
444         if (err) {
445                 (void) close(fd);
446                 return (check_slice(path, cache, force, isspare));
447         }
448
449         /*
450          * The primary efi partition label is damaged however the secondary
451          * label at the end of the device is intact.  Rather than use this
452          * label we should play it safe and treat this as a non efi device.
453          */
454         if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
455                 efi_free(vtoc);
456                 (void) close(fd);
457
458                 if (force) {
459                         /* Partitions will now be created using the backup */
460                         return (0);
461                 } else {
462                         vdev_error(gettext("%s contains a corrupt primary "
463                             "EFI label.\n"), path);
464                         return (-1);
465                 }
466         }
467
468         for (i = 0; i < vtoc->efi_nparts; i++) {
469
470                 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
471                     uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
472                         continue;
473
474                 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
475                         (void) snprintf(slice_path, sizeof (slice_path),
476                             "%s%s%d", path, "-part", i+1);
477                 else
478                         (void) snprintf(slice_path, sizeof (slice_path),
479                             "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
480                             "p" : "", i+1);
481
482                 err = check_slice(slice_path, cache, force, isspare);
483                 if (err)
484                         break;
485         }
486
487         efi_free(vtoc);
488         (void) close(fd);
489
490         return (err);
491 }
492
493 static int
494 check_device(const char *path, boolean_t force,
495     boolean_t isspare, boolean_t iswholedisk)
496 {
497         blkid_cache cache;
498         int error;
499
500         error = blkid_get_cache(&cache, NULL);
501         if (error != 0) {
502                 (void) fprintf(stderr, gettext("unable to access the blkid "
503                     "cache.\n"));
504                 return (-1);
505         }
506
507         error = check_disk(path, cache, force, isspare, iswholedisk);
508         blkid_put_cache(cache);
509
510         return (error);
511 }
512
513 /*
514  * This may be a shorthand device path or it could be total gibberish.
515  * Check to see if it is a known device available in zfs_vdev_paths.
516  * As part of this check, see if we've been given an entire disk
517  * (minus the slice number).
518  */
519 static int
520 is_shorthand_path(const char *arg, char *path, size_t path_size,
521     struct stat64 *statbuf, boolean_t *wholedisk)
522 {
523         int error;
524
525         error = zfs_resolve_shortname(arg, path, path_size);
526         if (error == 0) {
527                 *wholedisk = zfs_dev_is_whole_disk(path);
528                 if (*wholedisk || (stat64(path, statbuf) == 0))
529                         return (0);
530         }
531
532         strlcpy(path, arg, path_size);
533         memset(statbuf, 0, sizeof (*statbuf));
534         *wholedisk = B_FALSE;
535
536         return (error);
537 }
538
539 /*
540  * Determine if the given path is a hot spare within the given configuration.
541  * If no configuration is given we rely solely on the label.
542  */
543 static boolean_t
544 is_spare(nvlist_t *config, const char *path)
545 {
546         int fd;
547         pool_state_t state;
548         char *name = NULL;
549         nvlist_t *label;
550         uint64_t guid, spareguid;
551         nvlist_t *nvroot;
552         nvlist_t **spares;
553         uint_t i, nspares;
554         boolean_t inuse;
555
556         if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
557                 return (B_FALSE);
558
559         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
560             !inuse ||
561             state != POOL_STATE_SPARE ||
562             zpool_read_label(fd, &label, NULL) != 0) {
563                 free(name);
564                 (void) close(fd);
565                 return (B_FALSE);
566         }
567         free(name);
568         (void) close(fd);
569
570         if (config == NULL) {
571                 nvlist_free(label);
572                 return (B_TRUE);
573         }
574
575         verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
576         nvlist_free(label);
577
578         verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
579             &nvroot) == 0);
580         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
581             &spares, &nspares) == 0) {
582                 for (i = 0; i < nspares; i++) {
583                         verify(nvlist_lookup_uint64(spares[i],
584                             ZPOOL_CONFIG_GUID, &spareguid) == 0);
585                         if (spareguid == guid)
586                                 return (B_TRUE);
587                 }
588         }
589
590         return (B_FALSE);
591 }
592
593 /*
594  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
595  * device, fill in the device id to make a complete nvlist.  Valid forms for a
596  * leaf vdev are:
597  *
598  *      /dev/xxx        Complete disk path
599  *      /xxx            Full path to file
600  *      xxx             Shorthand for <zfs_vdev_paths>/xxx
601  */
602 static nvlist_t *
603 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
604 {
605         char path[MAXPATHLEN];
606         struct stat64 statbuf;
607         nvlist_t *vdev = NULL;
608         char *type = NULL;
609         boolean_t wholedisk = B_FALSE;
610         uint64_t ashift = 0;
611         int err;
612
613         /*
614          * Determine what type of vdev this is, and put the full path into
615          * 'path'.  We detect whether this is a device of file afterwards by
616          * checking the st_mode of the file.
617          */
618         if (arg[0] == '/') {
619                 /*
620                  * Complete device or file path.  Exact type is determined by
621                  * examining the file descriptor afterwards.  Symbolic links
622                  * are resolved to their real paths to determine whole disk
623                  * and S_ISBLK/S_ISREG type checks.  However, we are careful
624                  * to store the given path as ZPOOL_CONFIG_PATH to ensure we
625                  * can leverage udev's persistent device labels.
626                  */
627                 if (realpath(arg, path) == NULL) {
628                         (void) fprintf(stderr,
629                             gettext("cannot resolve path '%s'\n"), arg);
630                         return (NULL);
631                 }
632
633                 wholedisk = zfs_dev_is_whole_disk(path);
634                 if (!wholedisk && (stat64(path, &statbuf) != 0)) {
635                         (void) fprintf(stderr,
636                             gettext("cannot open '%s': %s\n"),
637                             path, strerror(errno));
638                         return (NULL);
639                 }
640
641                 /* After whole disk check restore original passed path */
642                 strlcpy(path, arg, sizeof (path));
643         } else {
644                 err = is_shorthand_path(arg, path, sizeof (path),
645                     &statbuf, &wholedisk);
646                 if (err != 0) {
647                         /*
648                          * If we got ENOENT, then the user gave us
649                          * gibberish, so try to direct them with a
650                          * reasonable error message.  Otherwise,
651                          * regurgitate strerror() since it's the best we
652                          * can do.
653                          */
654                         if (err == ENOENT) {
655                                 (void) fprintf(stderr,
656                                     gettext("cannot open '%s': no such "
657                                     "device in %s\n"), arg, DISK_ROOT);
658                                 (void) fprintf(stderr,
659                                     gettext("must be a full path or "
660                                     "shorthand device name\n"));
661                                 return (NULL);
662                         } else {
663                                 (void) fprintf(stderr,
664                                     gettext("cannot open '%s': %s\n"),
665                                     path, strerror(errno));
666                                 return (NULL);
667                         }
668                 }
669         }
670
671         /*
672          * Determine whether this is a device or a file.
673          */
674         if (wholedisk || S_ISBLK(statbuf.st_mode)) {
675                 type = VDEV_TYPE_DISK;
676         } else if (S_ISREG(statbuf.st_mode)) {
677                 type = VDEV_TYPE_FILE;
678         } else {
679                 (void) fprintf(stderr, gettext("cannot use '%s': must be a "
680                     "block device or regular file\n"), path);
681                 return (NULL);
682         }
683
684         /*
685          * Finally, we have the complete device or file, and we know that it is
686          * acceptable to use.  Construct the nvlist to describe this vdev.  All
687          * vdevs have a 'path' element, and devices also have a 'devid' element.
688          */
689         verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
690         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
691         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
692         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
693         if (is_log)
694                 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
695                     VDEV_ALLOC_BIAS_LOG) == 0);
696         if (strcmp(type, VDEV_TYPE_DISK) == 0)
697                 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
698                     (uint64_t)wholedisk) == 0);
699
700         /*
701          * Override defaults if custom properties are provided.
702          */
703         if (props != NULL) {
704                 char *value = NULL;
705
706                 if (nvlist_lookup_string(props,
707                     zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
708                         if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
709                                 (void) fprintf(stderr,
710                                     gettext("ashift must be a number.\n"));
711                                 return (NULL);
712                         }
713                         if (ashift != 0 &&
714                             (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
715                                 (void) fprintf(stderr,
716                                     gettext("invalid 'ashift=%" PRIu64 "' "
717                                     "property: only values between %" PRId32 " "
718                                     "and %" PRId32 " are allowed.\n"),
719                                     ashift, ASHIFT_MIN, ASHIFT_MAX);
720                                 return (NULL);
721                         }
722                 }
723         }
724
725         /*
726          * If the device is known to incorrectly report its physical sector
727          * size explicitly provide the known correct value.
728          */
729         if (ashift == 0) {
730                 int sector_size;
731
732                 if (check_sector_size_database(path, &sector_size) == B_TRUE)
733                         ashift = highbit64(sector_size) - 1;
734         }
735
736         if (ashift > 0)
737                 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
738
739         return (vdev);
740 }
741
742 /*
743  * Go through and verify the replication level of the pool is consistent.
744  * Performs the following checks:
745  *
746  *      For the new spec, verifies that devices in mirrors and raidz are the
747  *      same size.
748  *
749  *      If the current configuration already has inconsistent replication
750  *      levels, ignore any other potential problems in the new spec.
751  *
752  *      Otherwise, make sure that the current spec (if there is one) and the new
753  *      spec have consistent replication levels.
754  *
755  *      If there is no current spec (create), make sure new spec has at least
756  *      one general purpose vdev.
757  */
/*
 * Summary of how one top-level vdev is replicated, as filled in by
 * get_replication().
 */
typedef struct replication_level {
        char *zprl_type;        /* vdev type string (ZPOOL_CONFIG_TYPE) */
        uint64_t zprl_children; /* child count; 1 for a plain file or disk */
        uint64_t zprl_parity;   /* ZPOOL_CONFIG_NPARITY for raidz, else 0 */
} replication_level_t;

/*
 * 16 MiB.  NOTE(review): presumably the tolerated size difference between
 * devices of one group vdev -- its use lies outside this part of the file.
 */
#define ZPOOL_FUZZ      (16 * 1024 * 1024)
765
766 static boolean_t
767 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
768     replication_level_t **raidz, replication_level_t **mirror)
769 {
770         if (strcmp(a->zprl_type, "raidz") == 0 &&
771             strcmp(b->zprl_type, "mirror") == 0) {
772                 *raidz = a;
773                 *mirror = b;
774                 return (B_TRUE);
775         }
776         return (B_FALSE);
777 }
778
779 /*
780  * Given a list of toplevel vdevs, return the current replication level.  If
781  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
782  * an error message will be displayed for each self-inconsistent vdev.
783  */
784 static replication_level_t *
785 get_replication(nvlist_t *nvroot, boolean_t fatal)
786 {
787         nvlist_t **top;
788         uint_t t, toplevels;
789         nvlist_t **child;
790         uint_t c, children;
791         nvlist_t *nv;
792         char *type;
793         replication_level_t lastrep = {0};
794         replication_level_t rep;
795         replication_level_t *ret;
796         replication_level_t *raidz, *mirror;
797         boolean_t dontreport;
798
799         ret = safe_malloc(sizeof (replication_level_t));
800
801         verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
802             &top, &toplevels) == 0);
803
804         for (t = 0; t < toplevels; t++) {
805                 uint64_t is_log = B_FALSE;
806
807                 nv = top[t];
808
809                 /*
810                  * For separate logs we ignore the top level vdev replication
811                  * constraints.
812                  */
813                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
814                 if (is_log)
815                         continue;
816
817                 /* Ignore holes introduced by removing aux devices */
818                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
819                 if (strcmp(type, VDEV_TYPE_HOLE) == 0)
820                         continue;
821
822                 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
823                     &child, &children) != 0) {
824                         /*
825                          * This is a 'file' or 'disk' vdev.
826                          */
827                         rep.zprl_type = type;
828                         rep.zprl_children = 1;
829                         rep.zprl_parity = 0;
830                 } else {
831                         uint64_t vdev_size;
832
833                         /*
834                          * This is a mirror or RAID-Z vdev.  Go through and make
835                          * sure the contents are all the same (files vs. disks),
836                          * keeping track of the number of elements in the
837                          * process.
838                          *
839                          * We also check that the size of each vdev (if it can
840                          * be determined) is the same.
841                          */
842                         rep.zprl_type = type;
843                         rep.zprl_children = 0;
844
845                         if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
846                                 verify(nvlist_lookup_uint64(nv,
847                                     ZPOOL_CONFIG_NPARITY,
848                                     &rep.zprl_parity) == 0);
849                                 assert(rep.zprl_parity != 0);
850                         } else {
851                                 rep.zprl_parity = 0;
852                         }
853
854                         /*
855                          * The 'dontreport' variable indicates that we've
856                          * already reported an error for this spec, so don't
857                          * bother doing it again.
858                          */
859                         type = NULL;
860                         dontreport = 0;
861                         vdev_size = -1ULL;
862                         for (c = 0; c < children; c++) {
863                                 nvlist_t *cnv = child[c];
864                                 char *path;
865                                 struct stat64 statbuf;
866                                 uint64_t size = -1ULL;
867                                 char *childtype;
868                                 int fd, err;
869
870                                 rep.zprl_children++;
871
872                                 verify(nvlist_lookup_string(cnv,
873                                     ZPOOL_CONFIG_TYPE, &childtype) == 0);
874
875                                 /*
876                                  * If this is a replacing or spare vdev, then
877                                  * get the real first child of the vdev: do this
878                                  * in a loop because replacing and spare vdevs
879                                  * can be nested.
880                                  */
881                                 while (strcmp(childtype,
882                                     VDEV_TYPE_REPLACING) == 0 ||
883                                     strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
884                                         nvlist_t **rchild;
885                                         uint_t rchildren;
886
887                                         verify(nvlist_lookup_nvlist_array(cnv,
888                                             ZPOOL_CONFIG_CHILDREN, &rchild,
889                                             &rchildren) == 0);
890                                         assert(rchildren == 2);
891                                         cnv = rchild[0];
892
893                                         verify(nvlist_lookup_string(cnv,
894                                             ZPOOL_CONFIG_TYPE,
895                                             &childtype) == 0);
896                                 }
897
898                                 verify(nvlist_lookup_string(cnv,
899                                     ZPOOL_CONFIG_PATH, &path) == 0);
900
901                                 /*
902                                  * If we have a raidz/mirror that combines disks
903                                  * with files, report it as an error.
904                                  */
905                                 if (!dontreport && type != NULL &&
906                                     strcmp(type, childtype) != 0) {
907                                         if (ret != NULL)
908                                                 free(ret);
909                                         ret = NULL;
910                                         if (fatal)
911                                                 vdev_error(gettext(
912                                                     "mismatched replication "
913                                                     "level: %s contains both "
914                                                     "files and devices\n"),
915                                                     rep.zprl_type);
916                                         else
917                                                 return (NULL);
918                                         dontreport = B_TRUE;
919                                 }
920
921                                 /*
922                                  * According to stat(2), the value of 'st_size'
923                                  * is undefined for block devices and character
924                                  * devices.  But there is no effective way to
925                                  * determine the real size in userland.
926                                  *
927                                  * Instead, we'll take advantage of an
928                                  * implementation detail of spec_size().  If the
929                                  * device is currently open, then we (should)
930                                  * return a valid size.
931                                  *
932                                  * If we still don't get a valid size (indicated
933                                  * by a size of 0 or MAXOFFSET_T), then ignore
934                                  * this device altogether.
935                                  */
936                                 if ((fd = open(path, O_RDONLY)) >= 0) {
937                                         err = fstat64_blk(fd, &statbuf);
938                                         (void) close(fd);
939                                 } else {
940                                         err = stat64(path, &statbuf);
941                                 }
942
943                                 if (err != 0 ||
944                                     statbuf.st_size == 0 ||
945                                     statbuf.st_size == MAXOFFSET_T)
946                                         continue;
947
948                                 size = statbuf.st_size;
949
950                                 /*
951                                  * Also make sure that devices and
952                                  * slices have a consistent size.  If
953                                  * they differ by a significant amount
954                                  * (~16MB) then report an error.
955                                  */
956                                 if (!dontreport &&
957                                     (vdev_size != -1ULL &&
958                                     (labs(size - vdev_size) >
959                                     ZPOOL_FUZZ))) {
960                                         if (ret != NULL)
961                                                 free(ret);
962                                         ret = NULL;
963                                         if (fatal)
964                                                 vdev_error(gettext(
965                                                     "%s contains devices of "
966                                                     "different sizes\n"),
967                                                     rep.zprl_type);
968                                         else
969                                                 return (NULL);
970                                         dontreport = B_TRUE;
971                                 }
972
973                                 type = childtype;
974                                 vdev_size = size;
975                         }
976                 }
977
978                 /*
979                  * At this point, we have the replication of the last toplevel
980                  * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
981                  * different.
982                  */
983                 if (lastrep.zprl_type != NULL) {
984                         if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
985                             is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
986                                 /*
987                                  * Accepted raidz and mirror when they can
988                                  * handle the same number of disk failures.
989                                  */
990                                 if (raidz->zprl_parity !=
991                                     mirror->zprl_children - 1) {
992                                         if (ret != NULL)
993                                                 free(ret);
994                                         ret = NULL;
995                                         if (fatal)
996                                                 vdev_error(gettext(
997                                                     "mismatched replication "
998                                                     "level: "
999                                                     "%s and %s vdevs with "
1000                                                     "different redundancy, "
1001                                                     "%llu vs. %llu (%llu-way) "
1002                                                     "are present\n"),
1003                                                     raidz->zprl_type,
1004                                                     mirror->zprl_type,
1005                                                     raidz->zprl_parity,
1006                                                     mirror->zprl_children - 1,
1007                                                     mirror->zprl_children);
1008                                         else
1009                                                 return (NULL);
1010                                 }
1011                         } else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
1012                             0) {
1013                                 if (ret != NULL)
1014                                         free(ret);
1015                                 ret = NULL;
1016                                 if (fatal)
1017                                         vdev_error(gettext(
1018                                             "mismatched replication level: "
1019                                             "both %s and %s vdevs are "
1020                                             "present\n"),
1021                                             lastrep.zprl_type, rep.zprl_type);
1022                                 else
1023                                         return (NULL);
1024                         } else if (lastrep.zprl_parity != rep.zprl_parity) {
1025                                 if (ret)
1026                                         free(ret);
1027                                 ret = NULL;
1028                                 if (fatal)
1029                                         vdev_error(gettext(
1030                                             "mismatched replication level: "
1031                                             "both %llu and %llu device parity "
1032                                             "%s vdevs are present\n"),
1033                                             lastrep.zprl_parity,
1034                                             rep.zprl_parity,
1035                                             rep.zprl_type);
1036                                 else
1037                                         return (NULL);
1038                         } else if (lastrep.zprl_children != rep.zprl_children) {
1039                                 if (ret)
1040                                         free(ret);
1041                                 ret = NULL;
1042                                 if (fatal)
1043                                         vdev_error(gettext(
1044                                             "mismatched replication level: "
1045                                             "both %llu-way and %llu-way %s "
1046                                             "vdevs are present\n"),
1047                                             lastrep.zprl_children,
1048                                             rep.zprl_children,
1049                                             rep.zprl_type);
1050                                 else
1051                                         return (NULL);
1052                         }
1053                 }
1054                 lastrep = rep;
1055         }
1056
1057         if (ret != NULL)
1058                 *ret = rep;
1059
1060         return (ret);
1061 }
1062
1063 /*
1064  * Check the replication level of the vdev spec against the current pool.  Calls
1065  * get_replication() to make sure the new spec is self-consistent.  If the pool
1066  * has a consistent replication level, then we ignore any errors.  Otherwise,
1067  * report any difference between the two.
1068  */
1069 static int
1070 check_replication(nvlist_t *config, nvlist_t *newroot)
1071 {
1072         nvlist_t **child;
1073         uint_t  children;
1074         replication_level_t *current = NULL, *new;
1075         replication_level_t *raidz, *mirror;
1076         int ret;
1077
1078         /*
1079          * If we have a current pool configuration, check to see if it's
1080          * self-consistent.  If not, simply return success.
1081          */
1082         if (config != NULL) {
1083                 nvlist_t *nvroot;
1084
1085                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1086                     &nvroot) == 0);
1087                 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
1088                         return (0);
1089         }
1090         /*
1091          * for spares there may be no children, and therefore no
1092          * replication level to check
1093          */
1094         if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
1095             &child, &children) != 0) || (children == 0)) {
1096                 free(current);
1097                 return (0);
1098         }
1099
1100         /*
1101          * If all we have is logs then there's no replication level to check.
1102          */
1103         if (num_logs(newroot) == children) {
1104                 free(current);
1105                 return (0);
1106         }
1107
1108         /*
1109          * Get the replication level of the new vdev spec, reporting any
1110          * inconsistencies found.
1111          */
1112         if ((new = get_replication(newroot, B_TRUE)) == NULL) {
1113                 free(current);
1114                 return (-1);
1115         }
1116
1117         /*
1118          * Check to see if the new vdev spec matches the replication level of
1119          * the current pool.
1120          */
1121         ret = 0;
1122         if (current != NULL) {
1123                 if (is_raidz_mirror(current, new, &raidz, &mirror) ||
1124                     is_raidz_mirror(new, current, &raidz, &mirror)) {
1125                         if (raidz->zprl_parity != mirror->zprl_children - 1) {
1126                                 vdev_error(gettext(
1127                                     "mismatched replication level: pool and "
1128                                     "new vdev with different redundancy, %s "
1129                                     "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
1130                                     raidz->zprl_type,
1131                                     mirror->zprl_type,
1132                                     raidz->zprl_parity,
1133                                     mirror->zprl_children - 1,
1134                                     mirror->zprl_children);
1135                                 ret = -1;
1136                         }
1137                 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
1138                         vdev_error(gettext(
1139                             "mismatched replication level: pool uses %s "
1140                             "and new vdev is %s\n"),
1141                             current->zprl_type, new->zprl_type);
1142                         ret = -1;
1143                 } else if (current->zprl_parity != new->zprl_parity) {
1144                         vdev_error(gettext(
1145                             "mismatched replication level: pool uses %llu "
1146                             "device parity and new vdev uses %llu\n"),
1147                             current->zprl_parity, new->zprl_parity);
1148                         ret = -1;
1149                 } else if (current->zprl_children != new->zprl_children) {
1150                         vdev_error(gettext(
1151                             "mismatched replication level: pool uses %llu-way "
1152                             "%s and new vdev uses %llu-way %s\n"),
1153                             current->zprl_children, current->zprl_type,
1154                             new->zprl_children, new->zprl_type);
1155                         ret = -1;
1156                 }
1157         }
1158
1159         free(new);
1160         if (current != NULL)
1161                 free(current);
1162
1163         return (ret);
1164 }
1165
/*
 * Overwrite the first 4096 bytes of 'path' with zeroes and flush them with
 * fdatasync().
 *
 * Returns 0 on success.  Returns -1, after printing a message to stderr, if
 * the path cannot be opened, the write fails, or fewer than 4096 bytes were
 * written.
 */
static int
zero_label(char *path)
{
	/* An enum constant keeps 'buf' a true fixed-size array, not a VLA. */
	enum { ZERO_SIZE = 4096 };
	char buf[ZERO_SIZE];
	ssize_t written;
	int write_errno;
	int fd;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	memset(buf, 0, sizeof (buf));
	written = write(fd, buf, sizeof (buf));
	/*
	 * Capture errno now: fdatasync() and close() below may overwrite it
	 * before the failure is reported.
	 */
	write_errno = errno;
	(void) fdatasync(fd);
	(void) close(fd);

	if (written == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), ZERO_SIZE, path, strerror(write_errno));
		return (-1);
	}

	if (written != ZERO_SIZE) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), (int)written, ZERO_SIZE, path);
		return (-1);
	}

	return (0);
}
1198
1199 /*
1200  * Go through and find any whole disks in the vdev specification, labelling them
1201  * as appropriate.  When constructing the vdev spec, we were unable to open this
1202  * device in order to provide a devid.  Now that we have labelled the disk and
1203  * know that slice 0 is valid, we can construct the devid now.
1204  *
1205  * If the disk was already labeled with an EFI label, we will have gotten the
1206  * devid already (because we were able to open the whole disk).  Otherwise, we
1207  * need to get the devid after we label the disk.
1208  */
1209 static int
1210 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
1211 {
1212         nvlist_t **child;
1213         uint_t c, children;
1214         char *type, *path;
1215         char devpath[MAXPATHLEN];
1216         char udevpath[MAXPATHLEN];
1217         uint64_t wholedisk;
1218         struct stat64 statbuf;
1219         int is_exclusive = 0;
1220         int fd;
1221         int ret;
1222
1223         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1224
1225         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1226             &child, &children) != 0) {
1227
1228                 if (strcmp(type, VDEV_TYPE_DISK) != 0)
1229                         return (0);
1230
1231                 /*
1232                  * We have a disk device.  If this is a whole disk write
1233                  * out the efi partition table, otherwise write zero's to
1234                  * the first 4k of the partition.  This is to ensure that
1235                  * libblkid will not misidentify the partition due to a
1236                  * magic value left by the previous filesystem.
1237                  */
1238                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1239                 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1240                     &wholedisk));
1241
1242                 if (!wholedisk) {
1243                         /*
1244                          * Update device id string for mpath nodes (Linux only)
1245                          */
1246                         if (is_mpath_whole_disk(path))
1247                                 update_vdev_config_dev_strs(nv);
1248
1249                         if (!is_spare(NULL, path))
1250                                 (void) zero_label(path);
1251                         return (0);
1252                 }
1253
1254                 if (realpath(path, devpath) == NULL) {
1255                         ret = errno;
1256                         (void) fprintf(stderr,
1257                             gettext("cannot resolve path '%s'\n"), path);
1258                         return (ret);
1259                 }
1260
1261                 /*
1262                  * Remove any previously existing symlink from a udev path to
1263                  * the device before labeling the disk.  This ensures that
1264                  * only newly created links are used.  Otherwise there is a
1265                  * window between when udev deletes and recreates the link
1266                  * during which access attempts will fail with ENOENT.
1267                  */
1268                 strlcpy(udevpath, path, MAXPATHLEN);
1269                 (void) zfs_append_partition(udevpath, MAXPATHLEN);
1270
1271                 fd = open(devpath, O_RDWR|O_EXCL);
1272                 if (fd == -1) {
1273                         if (errno == EBUSY)
1274                                 is_exclusive = 1;
1275                 } else {
1276                         (void) close(fd);
1277                 }
1278
1279                 /*
1280                  * If the partition exists, contains a valid spare label,
1281                  * and is opened exclusively there is no need to partition
1282                  * it.  Hot spares have already been partitioned and are
1283                  * held open exclusively by the kernel as a safety measure.
1284                  *
1285                  * If the provided path is for a /dev/disk/ device its
1286                  * symbolic link will be removed, partition table created,
1287                  * and then block until udev creates the new link.
1288                  */
1289                 if (!is_exclusive && !is_spare(NULL, udevpath)) {
1290                         char *devnode = strrchr(devpath, '/') + 1;
1291
1292                         ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
1293                         if (ret == 0) {
1294                                 ret = lstat64(udevpath, &statbuf);
1295                                 if (ret == 0 && S_ISLNK(statbuf.st_mode))
1296                                         (void) unlink(udevpath);
1297                         }
1298
1299                         /*
1300                          * When labeling a pool the raw device node name
1301                          * is provided as it appears under /dev/.
1302                          */
1303                         if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
1304                                 return (-1);
1305
1306                         /*
1307                          * Wait for udev to signal the device is available
1308                          * by the provided path.
1309                          */
1310                         ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
1311                         if (ret) {
1312                                 (void) fprintf(stderr,
1313                                     gettext("missing link: %s was "
1314                                     "partitioned but %s is missing\n"),
1315                                     devnode, udevpath);
1316                                 return (ret);
1317                         }
1318
1319                         ret = zero_label(udevpath);
1320                         if (ret)
1321                                 return (ret);
1322                 }
1323
1324                 /*
1325                  * Update the path to refer to the partition.  The presence of
1326                  * the 'whole_disk' field indicates to the CLI that we should
1327                  * chop off the partition number when displaying the device in
1328                  * future output.
1329                  */
1330                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
1331
1332                 /*
1333                  * Update device id strings for whole disks (Linux only)
1334                  */
1335                 update_vdev_config_dev_strs(nv);
1336
1337                 return (0);
1338         }
1339
1340         for (c = 0; c < children; c++)
1341                 if ((ret = make_disks(zhp, child[c])) != 0)
1342                         return (ret);
1343
1344         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1345             &child, &children) == 0)
1346                 for (c = 0; c < children; c++)
1347                         if ((ret = make_disks(zhp, child[c])) != 0)
1348                                 return (ret);
1349
1350         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1351             &child, &children) == 0)
1352                 for (c = 0; c < children; c++)
1353                         if ((ret = make_disks(zhp, child[c])) != 0)
1354                                 return (ret);
1355
1356         return (0);
1357 }
1358
1359 /*
1360  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1361  * the majority of this task.
1362  */
1363 static boolean_t
1364 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1365     boolean_t replacing, boolean_t isspare)
1366 {
1367         nvlist_t **child;
1368         uint_t c, children;
1369         char *type, *path;
1370         int ret = 0;
1371         char buf[MAXPATHLEN];
1372         uint64_t wholedisk = B_FALSE;
1373         boolean_t anyinuse = B_FALSE;
1374
1375         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1376
1377         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1378             &child, &children) != 0) {
1379
1380                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1381                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1382                         verify(!nvlist_lookup_uint64(nv,
1383                             ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1384
1385                 /*
1386                  * As a generic check, we look to see if this is a replace of a
1387                  * hot spare within the same pool.  If so, we allow it
1388                  * regardless of what libblkid or zpool_in_use() says.
1389                  */
1390                 if (replacing) {
1391                         (void) strlcpy(buf, path, sizeof (buf));
1392                         if (wholedisk) {
1393                                 ret = zfs_append_partition(buf,  sizeof (buf));
1394                                 if (ret == -1)
1395                                         return (-1);
1396                         }
1397
1398                         if (is_spare(config, buf))
1399                                 return (B_FALSE);
1400                 }
1401
1402                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1403                         ret = check_device(path, force, isspare, wholedisk);
1404
1405                 else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1406                         ret = check_file(path, force, isspare);
1407
1408                 return (ret != 0);
1409         }
1410
1411         for (c = 0; c < children; c++)
1412                 if (is_device_in_use(config, child[c], force, replacing,
1413                     B_FALSE))
1414                         anyinuse = B_TRUE;
1415
1416         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1417             &child, &children) == 0)
1418                 for (c = 0; c < children; c++)
1419                         if (is_device_in_use(config, child[c], force, replacing,
1420                             B_TRUE))
1421                                 anyinuse = B_TRUE;
1422
1423         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1424             &child, &children) == 0)
1425                 for (c = 0; c < children; c++)
1426                         if (is_device_in_use(config, child[c], force, replacing,
1427                             B_FALSE))
1428                                 anyinuse = B_TRUE;
1429
1430         return (anyinuse);
1431 }
1432
1433 static const char *
1434 is_grouping(const char *type, int *mindev, int *maxdev)
1435 {
1436         if (strncmp(type, "raidz", 5) == 0) {
1437                 const char *p = type + 5;
1438                 char *end;
1439                 long nparity;
1440
1441                 if (*p == '\0') {
1442                         nparity = 1;
1443                 } else if (*p == '0') {
1444                         return (NULL); /* no zero prefixes allowed */
1445                 } else {
1446                         errno = 0;
1447                         nparity = strtol(p, &end, 10);
1448                         if (errno != 0 || nparity < 1 || nparity >= 255 ||
1449                             *end != '\0')
1450                                 return (NULL);
1451                 }
1452
1453                 if (mindev != NULL)
1454                         *mindev = nparity + 1;
1455                 if (maxdev != NULL)
1456                         *maxdev = 255;
1457                 return (VDEV_TYPE_RAIDZ);
1458         }
1459
1460         if (maxdev != NULL)
1461                 *maxdev = INT_MAX;
1462
1463         if (strcmp(type, "mirror") == 0) {
1464                 if (mindev != NULL)
1465                         *mindev = 2;
1466                 return (VDEV_TYPE_MIRROR);
1467         }
1468
1469         if (strcmp(type, "spare") == 0) {
1470                 if (mindev != NULL)
1471                         *mindev = 1;
1472                 return (VDEV_TYPE_SPARE);
1473         }
1474
1475         if (strcmp(type, "log") == 0) {
1476                 if (mindev != NULL)
1477                         *mindev = 1;
1478                 return (VDEV_TYPE_LOG);
1479         }
1480
1481         if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
1482             strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1483                 if (mindev != NULL)
1484                         *mindev = 1;
1485                 return (type);
1486         }
1487
1488         if (strcmp(type, "cache") == 0) {
1489                 if (mindev != NULL)
1490                         *mindev = 1;
1491                 return (VDEV_TYPE_L2CACHE);
1492         }
1493
1494         return (NULL);
1495 }
1496
1497 /*
1498  * Construct a syntactically valid vdev specification,
1499  * and ensure that all devices and files exist and can be opened.
1500  * Note: we don't bother freeing anything in the error paths
1501  * because the program is just going to exit anyway.
1502  */
1503 nvlist_t *
1504 construct_spec(nvlist_t *props, int argc, char **argv)
1505 {
1506         nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1507         int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1508         const char *type;
1509         uint64_t is_log, is_special, is_dedup;
1510         boolean_t seen_logs;
1511
1512         top = NULL;
1513         toplevels = 0;
1514         spares = NULL;
1515         l2cache = NULL;
1516         nspares = 0;
1517         nlogs = 0;
1518         nl2cache = 0;
1519         is_log = is_special = is_dedup = B_FALSE;
1520         seen_logs = B_FALSE;
1521         nvroot = NULL;
1522
1523         while (argc > 0) {
1524                 nv = NULL;
1525
1526                 /*
1527                  * If it's a mirror or raidz, the subsequent arguments are
1528                  * its leaves -- until we encounter the next mirror or raidz.
1529                  */
1530                 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1531                         nvlist_t **child = NULL;
1532                         int c, children = 0;
1533
1534                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1535                                 if (spares != NULL) {
1536                                         (void) fprintf(stderr,
1537                                             gettext("invalid vdev "
1538                                             "specification: 'spare' can be "
1539                                             "specified only once\n"));
1540                                         goto spec_out;
1541                                 }
1542                                 is_log = is_special = is_dedup = B_FALSE;
1543                         }
1544
1545                         if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1546                                 if (seen_logs) {
1547                                         (void) fprintf(stderr,
1548                                             gettext("invalid vdev "
1549                                             "specification: 'log' can be "
1550                                             "specified only once\n"));
1551                                         goto spec_out;
1552                                 }
1553                                 seen_logs = B_TRUE;
1554                                 is_log = B_TRUE;
1555                                 is_special = B_FALSE;
1556                                 is_dedup = B_FALSE;
1557                                 argc--;
1558                                 argv++;
1559                                 /*
1560                                  * A log is not a real grouping device.
1561                                  * We just set is_log and continue.
1562                                  */
1563                                 continue;
1564                         }
1565
1566                         if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
1567                                 is_special = B_TRUE;
1568                                 is_log = B_FALSE;
1569                                 is_dedup = B_FALSE;
1570                                 argc--;
1571                                 argv++;
1572                                 continue;
1573                         }
1574
1575                         if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1576                                 is_dedup = B_TRUE;
1577                                 is_log = B_FALSE;
1578                                 is_special = B_FALSE;
1579                                 argc--;
1580                                 argv++;
1581                                 continue;
1582                         }
1583
1584                         if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1585                                 if (l2cache != NULL) {
1586                                         (void) fprintf(stderr,
1587                                             gettext("invalid vdev "
1588                                             "specification: 'cache' can be "
1589                                             "specified only once\n"));
1590                                         goto spec_out;
1591                                 }
1592                                 is_log = is_special = is_dedup = B_FALSE;
1593                         }
1594
1595                         if (is_log || is_special || is_dedup) {
1596                                 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1597                                         (void) fprintf(stderr,
1598                                             gettext("invalid vdev "
1599                                             "specification: unsupported '%s' "
1600                                             "device: %s\n"), is_log ? "log" :
1601                                             "special", type);
1602                                         goto spec_out;
1603                                 }
1604                                 nlogs++;
1605                         }
1606
1607                         for (c = 1; c < argc; c++) {
1608                                 if (is_grouping(argv[c], NULL, NULL) != NULL)
1609                                         break;
1610                                 children++;
1611                                 child = realloc(child,
1612                                     children * sizeof (nvlist_t *));
1613                                 if (child == NULL)
1614                                         zpool_no_memory();
1615                                 if ((nv = make_leaf_vdev(props, argv[c],
1616                                     B_FALSE)) == NULL) {
1617                                         for (c = 0; c < children - 1; c++)
1618                                                 nvlist_free(child[c]);
1619                                         free(child);
1620                                         goto spec_out;
1621                                 }
1622
1623                                 child[children - 1] = nv;
1624                         }
1625
1626                         if (children < mindev) {
1627                                 (void) fprintf(stderr, gettext("invalid vdev "
1628                                     "specification: %s requires at least %d "
1629                                     "devices\n"), argv[0], mindev);
1630                                 for (c = 0; c < children; c++)
1631                                         nvlist_free(child[c]);
1632                                 free(child);
1633                                 goto spec_out;
1634                         }
1635
1636                         if (children > maxdev) {
1637                                 (void) fprintf(stderr, gettext("invalid vdev "
1638                                     "specification: %s supports no more than "
1639                                     "%d devices\n"), argv[0], maxdev);
1640                                 for (c = 0; c < children; c++)
1641                                         nvlist_free(child[c]);
1642                                 free(child);
1643                                 goto spec_out;
1644                         }
1645
1646                         argc -= c;
1647                         argv += c;
1648
1649                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1650                                 spares = child;
1651                                 nspares = children;
1652                                 continue;
1653                         } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1654                                 l2cache = child;
1655                                 nl2cache = children;
1656                                 continue;
1657                         } else {
1658                                 /* create a top-level vdev with children */
1659                                 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1660                                     0) == 0);
1661                                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1662                                     type) == 0);
1663                                 verify(nvlist_add_uint64(nv,
1664                                     ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1665                                 if (is_log)
1666                                         verify(nvlist_add_string(nv,
1667                                             ZPOOL_CONFIG_ALLOCATION_BIAS,
1668                                             VDEV_ALLOC_BIAS_LOG) == 0);
1669                                 if (is_special) {
1670                                         verify(nvlist_add_string(nv,
1671                                             ZPOOL_CONFIG_ALLOCATION_BIAS,
1672                                             VDEV_ALLOC_BIAS_SPECIAL) == 0);
1673                                 }
1674                                 if (is_dedup) {
1675                                         verify(nvlist_add_string(nv,
1676                                             ZPOOL_CONFIG_ALLOCATION_BIAS,
1677                                             VDEV_ALLOC_BIAS_DEDUP) == 0);
1678                                 }
1679                                 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1680                                         verify(nvlist_add_uint64(nv,
1681                                             ZPOOL_CONFIG_NPARITY,
1682                                             mindev - 1) == 0);
1683                                 }
1684                                 verify(nvlist_add_nvlist_array(nv,
1685                                     ZPOOL_CONFIG_CHILDREN, child,
1686                                     children) == 0);
1687
1688                                 for (c = 0; c < children; c++)
1689                                         nvlist_free(child[c]);
1690                                 free(child);
1691                         }
1692                 } else {
1693                         /*
1694                          * We have a device.  Pass off to make_leaf_vdev() to
1695                          * construct the appropriate nvlist describing the vdev.
1696                          */
1697                         if ((nv = make_leaf_vdev(props, argv[0],
1698                             is_log)) == NULL)
1699                                 goto spec_out;
1700
1701                         if (is_log)
1702                                 nlogs++;
1703                         if (is_special) {
1704                                 verify(nvlist_add_string(nv,
1705                                     ZPOOL_CONFIG_ALLOCATION_BIAS,
1706                                     VDEV_ALLOC_BIAS_SPECIAL) == 0);
1707                         }
1708                         if (is_dedup) {
1709                                 verify(nvlist_add_string(nv,
1710                                     ZPOOL_CONFIG_ALLOCATION_BIAS,
1711                                     VDEV_ALLOC_BIAS_DEDUP) == 0);
1712                         }
1713                         argc--;
1714                         argv++;
1715                 }
1716
1717                 toplevels++;
1718                 top = realloc(top, toplevels * sizeof (nvlist_t *));
1719                 if (top == NULL)
1720                         zpool_no_memory();
1721                 top[toplevels - 1] = nv;
1722         }
1723
1724         if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1725                 (void) fprintf(stderr, gettext("invalid vdev "
1726                     "specification: at least one toplevel vdev must be "
1727                     "specified\n"));
1728                 goto spec_out;
1729         }
1730
1731         if (seen_logs && nlogs == 0) {
1732                 (void) fprintf(stderr, gettext("invalid vdev specification: "
1733                     "log requires at least 1 device\n"));
1734                 goto spec_out;
1735         }
1736
1737         /*
1738          * Finally, create nvroot and add all top-level vdevs to it.
1739          */
1740         verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1741         verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1742             VDEV_TYPE_ROOT) == 0);
1743         verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1744             top, toplevels) == 0);
1745         if (nspares != 0)
1746                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1747                     spares, nspares) == 0);
1748         if (nl2cache != 0)
1749                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1750                     l2cache, nl2cache) == 0);
1751
1752 spec_out:
1753         for (t = 0; t < toplevels; t++)
1754                 nvlist_free(top[t]);
1755         for (t = 0; t < nspares; t++)
1756                 nvlist_free(spares[t]);
1757         for (t = 0; t < nl2cache; t++)
1758                 nvlist_free(l2cache[t]);
1759
1760         free(spares);
1761         free(l2cache);
1762         free(top);
1763
1764         return (nvroot);
1765 }
1766
1767 nvlist_t *
1768 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1769     splitflags_t flags, int argc, char **argv)
1770 {
1771         nvlist_t *newroot = NULL, **child;
1772         uint_t c, children;
1773
1774         if (argc > 0) {
1775                 if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1776                         (void) fprintf(stderr, gettext("Unable to build a "
1777                             "pool from the specified devices\n"));
1778                         return (NULL);
1779                 }
1780
1781                 if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1782                         nvlist_free(newroot);
1783                         return (NULL);
1784                 }
1785
1786                 /* avoid any tricks in the spec */
1787                 verify(nvlist_lookup_nvlist_array(newroot,
1788                     ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1789                 for (c = 0; c < children; c++) {
1790                         char *path;
1791                         const char *type;
1792                         int min, max;
1793
1794                         verify(nvlist_lookup_string(child[c],
1795                             ZPOOL_CONFIG_PATH, &path) == 0);
1796                         if ((type = is_grouping(path, &min, &max)) != NULL) {
1797                                 (void) fprintf(stderr, gettext("Cannot use "
1798                                     "'%s' as a device for splitting\n"), type);
1799                                 nvlist_free(newroot);
1800                                 return (NULL);
1801                         }
1802                 }
1803         }
1804
1805         if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1806                 nvlist_free(newroot);
1807                 return (NULL);
1808         }
1809
1810         return (newroot);
1811 }
1812
1813 static int
1814 num_normal_vdevs(nvlist_t *nvroot)
1815 {
1816         nvlist_t **top;
1817         uint_t t, toplevels, normal = 0;
1818
1819         verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1820             &top, &toplevels) == 0);
1821
1822         for (t = 0; t < toplevels; t++) {
1823                 uint64_t log = B_FALSE;
1824
1825                 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
1826                 if (log)
1827                         continue;
1828                 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
1829                         continue;
1830
1831                 normal++;
1832         }
1833
1834         return (normal);
1835 }
1836
1837 /*
1838  * Get and validate the contents of the given vdev specification.  This ensures
1839  * that the nvlist returned is well-formed, that all the devices exist, and that
1840  * they are not currently in use by any other known consumer.  The 'poolconfig'
1841  * parameter is the current configuration of the pool when adding devices
1842  * existing pool, and is used to perform additional checks, such as changing the
1843  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1844  * new pool.  The 'force' flag controls whether devices should be forcefully
1845  * added, even if they appear in use.
1846  */
1847 nvlist_t *
1848 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1849     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1850 {
1851         nvlist_t *newroot;
1852         nvlist_t *poolconfig = NULL;
1853         is_force = force;
1854
1855         /*
1856          * Construct the vdev specification.  If this is successful, we know
1857          * that we have a valid specification, and that all devices can be
1858          * opened.
1859          */
1860         if ((newroot = construct_spec(props, argc, argv)) == NULL)
1861                 return (NULL);
1862
1863         if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
1864                 nvlist_free(newroot);
1865                 return (NULL);
1866         }
1867
1868         /*
1869          * Validate each device to make sure that its not shared with another
1870          * subsystem.  We do this even if 'force' is set, because there are some
1871          * uses (such as a dedicated dump device) that even '-f' cannot
1872          * override.
1873          */
1874         if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1875                 nvlist_free(newroot);
1876                 return (NULL);
1877         }
1878
1879         /*
1880          * Check the replication level of the given vdevs and report any errors
1881          * found.  We include the existing pool spec, if any, as we need to
1882          * catch changes against the existing replication level.
1883          */
1884         if (check_rep && check_replication(poolconfig, newroot) != 0) {
1885                 nvlist_free(newroot);
1886                 return (NULL);
1887         }
1888
1889         /*
1890          * On pool create the new vdev spec must have one normal vdev.
1891          */
1892         if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
1893                 vdev_error(gettext("at least one general top-level vdev must "
1894                     "be specified\n"));
1895                 nvlist_free(newroot);
1896                 return (NULL);
1897         }
1898
1899         /*
1900          * Run through the vdev specification and label any whole disks found.
1901          */
1902         if (!dryrun && make_disks(zhp, newroot) != 0) {
1903                 nvlist_free(newroot);
1904                 return (NULL);
1905         }
1906
1907         return (newroot);
1908 }