4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2016, 2017 Intel Corporation.
26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
30 * Functions to convert between a list of vdevs and an nvlist representing the
31 * configuration. Each entry in the list can be one of:
34 * disk=(path=..., devid=...)
43 * While the underlying implementation supports it, group vdevs cannot contain
44 * other group vdevs. All userland verification of devices is contained within
45 * this file. If successful, the nvlist returned can be passed directly to the
46 * kernel; we've done as much verification as possible in userland.
48 * Hot spares are a special case, and passed down as an array of disk vdevs, at
49 * the same level as the root of the vdev tree.
51 * The only function exported by this file is 'make_root_vdev'. The
52 * function performs several passes:
54 * 1. Construct the vdev specification. Performs syntax validation and
55 * makes sure each device is valid.
56 * 2. Check for devices in use. Uses libblkid to make sure that no
57 * devices are also in use. Some can be overridden using the 'force'
58 * flag, others cannot.
59 * 3. Check for replication errors if the 'force' flag is not specified.
60 * Validates that the replication level is consistent across the
62 * 4. Call libzfs to label any whole disks with an EFI label.
70 #include <libnvpair.h>
77 #include "zpool_util.h"
78 #include <sys/zfs_context.h>
80 #include <scsi/scsi.h>
85 #include <sys/efi_partition.h>
88 #include <sys/mntent.h>
89 #include <uuid/uuid.h>
90 #include <blkid/blkid.h>
92 typedef struct vdev_disk_db_entry
96 } vdev_disk_db_entry_t;
99 * Database of block devices that lie about physical sector sizes. The
100 * identification string must be precisely 24 characters to avoid false
103 static vdev_disk_db_entry_t vdev_disk_database[] = {
104 {"ATA ADATA SSD S396 3", 8192},
105 {"ATA APPLE SSD SM128E", 8192},
106 {"ATA APPLE SSD SM256E", 8192},
107 {"ATA APPLE SSD SM512E", 8192},
108 {"ATA APPLE SSD SM768E", 8192},
109 {"ATA C400-MTFDDAC064M", 8192},
110 {"ATA C400-MTFDDAC128M", 8192},
111 {"ATA C400-MTFDDAC256M", 8192},
112 {"ATA C400-MTFDDAC512M", 8192},
113 {"ATA Corsair Force 3 ", 8192},
114 {"ATA Corsair Force GS", 8192},
115 {"ATA INTEL SSDSA2CT04", 8192},
116 {"ATA INTEL SSDSA2BZ10", 8192},
117 {"ATA INTEL SSDSA2BZ20", 8192},
118 {"ATA INTEL SSDSA2BZ30", 8192},
119 {"ATA INTEL SSDSA2CW04", 8192},
120 {"ATA INTEL SSDSA2CW08", 8192},
121 {"ATA INTEL SSDSA2CW12", 8192},
122 {"ATA INTEL SSDSA2CW16", 8192},
123 {"ATA INTEL SSDSA2CW30", 8192},
124 {"ATA INTEL SSDSA2CW60", 8192},
125 {"ATA INTEL SSDSC2CT06", 8192},
126 {"ATA INTEL SSDSC2CT12", 8192},
127 {"ATA INTEL SSDSC2CT18", 8192},
128 {"ATA INTEL SSDSC2CT24", 8192},
129 {"ATA INTEL SSDSC2CW06", 8192},
130 {"ATA INTEL SSDSC2CW12", 8192},
131 {"ATA INTEL SSDSC2CW18", 8192},
132 {"ATA INTEL SSDSC2CW24", 8192},
133 {"ATA INTEL SSDSC2CW48", 8192},
134 {"ATA KINGSTON SH100S3", 8192},
135 {"ATA KINGSTON SH103S3", 8192},
136 {"ATA M4-CT064M4SSD2 ", 8192},
137 {"ATA M4-CT128M4SSD2 ", 8192},
138 {"ATA M4-CT256M4SSD2 ", 8192},
139 {"ATA M4-CT512M4SSD2 ", 8192},
140 {"ATA OCZ-AGILITY2 ", 8192},
141 {"ATA OCZ-AGILITY3 ", 8192},
142 {"ATA OCZ-VERTEX2 3.5 ", 8192},
143 {"ATA OCZ-VERTEX3 ", 8192},
144 {"ATA OCZ-VERTEX3 LT ", 8192},
145 {"ATA OCZ-VERTEX3 MI ", 8192},
146 {"ATA OCZ-VERTEX4 ", 8192},
147 {"ATA SAMSUNG MZ7WD120", 8192},
148 {"ATA SAMSUNG MZ7WD240", 8192},
149 {"ATA SAMSUNG MZ7WD480", 8192},
150 {"ATA SAMSUNG MZ7WD960", 8192},
151 {"ATA SAMSUNG SSD 830 ", 8192},
152 {"ATA Samsung SSD 840 ", 8192},
153 {"ATA SanDisk SSD U100", 8192},
154 {"ATA TOSHIBA THNSNH06", 8192},
155 {"ATA TOSHIBA THNSNH12", 8192},
156 {"ATA TOSHIBA THNSNH25", 8192},
157 {"ATA TOSHIBA THNSNH51", 8192},
158 {"ATA APPLE SSD TS064C", 4096},
159 {"ATA APPLE SSD TS128C", 4096},
160 {"ATA APPLE SSD TS256C", 4096},
161 {"ATA APPLE SSD TS512C", 4096},
162 {"ATA INTEL SSDSA2M040", 4096},
163 {"ATA INTEL SSDSA2M080", 4096},
164 {"ATA INTEL SSDSA2M160", 4096},
165 {"ATA INTEL SSDSC2MH12", 4096},
166 {"ATA INTEL SSDSC2MH25", 4096},
167 {"ATA OCZ CORE_SSD ", 4096},
168 {"ATA OCZ-VERTEX ", 4096},
169 {"ATA SAMSUNG MCCOE32G", 4096},
170 {"ATA SAMSUNG MCCOE64G", 4096},
171 {"ATA SAMSUNG SSD PM80", 4096},
172 /* Flash drives optimized for 4KB IOs on larger pages */
173 {"ATA INTEL SSDSC2BA10", 4096},
174 {"ATA INTEL SSDSC2BA20", 4096},
175 {"ATA INTEL SSDSC2BA40", 4096},
176 {"ATA INTEL SSDSC2BA80", 4096},
177 {"ATA INTEL SSDSC2BB08", 4096},
178 {"ATA INTEL SSDSC2BB12", 4096},
179 {"ATA INTEL SSDSC2BB16", 4096},
180 {"ATA INTEL SSDSC2BB24", 4096},
181 {"ATA INTEL SSDSC2BB30", 4096},
182 {"ATA INTEL SSDSC2BB40", 4096},
183 {"ATA INTEL SSDSC2BB48", 4096},
184 {"ATA INTEL SSDSC2BB60", 4096},
185 {"ATA INTEL SSDSC2BB80", 4096},
186 {"ATA INTEL SSDSC2BW24", 4096},
187 {"ATA INTEL SSDSC2BW48", 4096},
188 {"ATA INTEL SSDSC2BP24", 4096},
189 {"ATA INTEL SSDSC2BP48", 4096},
190 {"NA SmrtStorSDLKAE9W", 4096},
191 {"NVMe Amazon EC2 NVMe ", 4096},
192 /* Imported from Open Solaris */
193 {"ATA MARVELL SD88SA02", 4096},
194 /* Advanced format Hard drives */
195 {"ATA Hitachi HDS5C303", 4096},
196 {"ATA SAMSUNG HD204UI ", 4096},
197 {"ATA ST2000DL004 HD20", 4096},
198 {"ATA WDC WD10EARS-00M", 4096},
199 {"ATA WDC WD10EARS-00S", 4096},
200 {"ATA WDC WD10EARS-00Z", 4096},
201 {"ATA WDC WD15EARS-00M", 4096},
202 {"ATA WDC WD15EARS-00S", 4096},
203 {"ATA WDC WD15EARS-00Z", 4096},
204 {"ATA WDC WD20EARS-00M", 4096},
205 {"ATA WDC WD20EARS-00S", 4096},
206 {"ATA WDC WD20EARS-00Z", 4096},
207 {"ATA WDC WD1600BEVT-0", 4096},
208 {"ATA WDC WD2500BEVT-0", 4096},
209 {"ATA WDC WD3200BEVT-0", 4096},
210 {"ATA WDC WD5000BEVT-0", 4096},
214 #define INQ_REPLY_LEN 96
215 #define INQ_CMD_LEN 6
217 static const int vdev_disk_database_size =
218 sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
221 check_sector_size_database(char *path, int *sector_size)
223 unsigned char inq_buff[INQ_REPLY_LEN];
224 unsigned char sense_buffer[32];
225 unsigned char inq_cmd_blk[INQ_CMD_LEN] =
226 {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
232 /* Prepare INQUIRY command */
233 memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
234 io_hdr.interface_id = 'S';
235 io_hdr.cmd_len = sizeof (inq_cmd_blk);
236 io_hdr.mx_sb_len = sizeof (sense_buffer);
237 io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
238 io_hdr.dxfer_len = INQ_REPLY_LEN;
239 io_hdr.dxferp = inq_buff;
240 io_hdr.cmdp = inq_cmd_blk;
241 io_hdr.sbp = sense_buffer;
242 io_hdr.timeout = 10; /* 10 milliseconds is ample time */
244 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
247 error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
254 if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
257 for (i = 0; i < vdev_disk_database_size; i++) {
258 if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
261 *sector_size = vdev_disk_database[i].sector_size;
269 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
274 /* No valid type detected, device is safe to use */
275 value = blkid_get_tag_value(cache, "TYPE", path);
280 * If libblkid detects a ZFS device, we check the device
281 * using check_file() to see if it's safe. The one safe
282 * case is a spare device shared between multiple pools.
284 if (strcmp(value, "zfs_member") == 0) {
285 err = check_file(path, force, isspare);
291 vdev_error(gettext("%s contains a filesystem of "
292 "type '%s'\n"), path, value);
302 * Validate that a disk, including all of its partitions, is safe to use.
304 * For EFI labeled disks this can be done relatively easily with the libefi
305 * library. The partition numbers are extracted from the label and used
306 * to generate the expected /dev/ paths. Each partition can then be
307 * checked for conflicts.
309 * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
310 * but due to the lack of readily available libraries this scanning is
311 * not implemented. Instead only the device path as given is checked.
314 check_disk(const char *path, blkid_cache cache, int force,
315 boolean_t isspare, boolean_t iswholedisk)
318 char slice_path[MAXPATHLEN];
321 int flags = O_RDONLY|O_DIRECT;
324 return (check_slice(path, cache, force, isspare));
326 /* only spares can be shared, other devices require exclusive access */
330 if ((fd = open(path, flags)) < 0) {
331 char *value = blkid_get_tag_value(cache, "TYPE", path);
332 (void) fprintf(stderr, gettext("%s is in use and contains "
333 "a %s filesystem.\n"), path, value ? value : "unknown");
339 * Expected to fail for non-EFI labeled disks. Just check the device
340 * as given and do not attempt to detect and scan partitions.
342 err = efi_alloc_and_read(fd, &vtoc);
345 return (check_slice(path, cache, force, isspare));
349 * The primary efi partition label is damaged however the secondary
350 * label at the end of the device is intact. Rather than use this
351 * label we should play it safe and treat this as a non efi device.
353 if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
358 /* Partitions will now be created using the backup */
361 vdev_error(gettext("%s contains a corrupt primary "
362 "EFI label.\n"), path);
367 for (i = 0; i < vtoc->efi_nparts; i++) {
369 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
370 uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
373 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
374 (void) snprintf(slice_path, sizeof (slice_path),
375 "%s%s%d", path, "-part", i+1);
377 (void) snprintf(slice_path, sizeof (slice_path),
378 "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
381 err = check_slice(slice_path, cache, force, isspare);
393 check_device(const char *path, boolean_t force,
394 boolean_t isspare, boolean_t iswholedisk)
399 error = blkid_get_cache(&cache, NULL);
401 (void) fprintf(stderr, gettext("unable to access the blkid "
406 error = check_disk(path, cache, force, isspare, iswholedisk);
407 blkid_put_cache(cache);