/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

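/*
 * Register the class with GEOM at module load time; GEOM will then
 * invoke the attrchanged method above for attribute changes on any
 * consumer created below.
 */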
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	vdev_t *vd;
	char *physpath;
	int error, physpath_len;

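	/*
	 * The GEOM::physpath attribute is queried with an attribute bio,
	 * which requires an open consumer, so take a temporary read count.
	 */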
	if (g_access(cp, 1, 0, 0) != 0)
		return;

	vd = cp->private;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
						vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);

}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	char *old_physpath;
	int error;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") == 0) {
		vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE);
		return;
	}
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
		    pp->name, pp->sectorsize);
		return (NULL);
	} else if (pp->mediasize < SPA_MINDEVSIZE) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
		    pp->name, pp->mediasize);
		return (NULL);
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	if (vd != NULL) {
		vd->vdev_tsd = cp;
		vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE);
	}

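	/*
	 * Enable direct dispatch for this consumer: requests and
	 * completions bypass the g_down/g_up queueing threads.
	 */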
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;
	vdev_t *vd;

	g_topology_assert();

	ZFS_LOG(1, "Detaching consumer. Provider %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	vd = cp->private;
	cp->private = NULL;

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer to %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

	vdev_geom_detach(cp, B_TRUE);
}

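/*
 * Extract the pool and vdev guids from a label config.  Values missing
 * from the nvlist leave the caller's initialized defaults untouched.
 */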
static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
 * operation is described by parallel entries from each array.  There may be
 * more bios actually issued than entries in the array.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

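	/*
	 * Cap each bio at the largest multiple of the sector size that
	 * still fits in MAXPHYS; larger commands are split up below.
	 */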
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio*);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}

static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *p, *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, len;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

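	/* Round sizeof(vdev_phys_t) up to a multiple of the sector size. */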
	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	*config = NULL;
	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (*config == NULL ? ENOENT : 0);
}

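/*
 * Grow the config array, if necessary, so that it can hold an entry at
 * index "id", preserving any existing entries.
 */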
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

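/*
 * File a label config under its top-level vdev id, keeping only the
 * config with the newest txg for each id.  Configs that do not match
 * the requested pool name and guid are freed and ignored.
 */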
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid, known_guid;
	uint64_t id, txg, known_txg;
	char *pname;
	int i;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
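	/* Taste every provider in the system, except those of our own class. */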
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
	nvlist_t *config;

	g_topology_assert_not();

	*pguid = 0;
	*vguid = 0;
	if (vdev_geom_read_config(cp, &config) == 0) {
		nvlist_get_guids(config, pguid, vguid);
		nvlist_free(config);
	}
}

static boolean_t
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	uint64_t pool_guid;
	uint64_t vdev_guid;
	struct g_consumer *zcp;
	boolean_t pool_ok;
	boolean_t vdev_ok;

	zcp = vdev_geom_attach(pp, NULL);
	if (zcp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (B_FALSE);
	}
	g_topology_unlock();
	vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
	g_topology_lock();
	vdev_geom_detach(zcp, B_TRUE);

	/*
	 * Check that the label's vdev guid matches the desired guid.  If the
	 * label has a pool guid, check that it matches too. (Inactive spares
	 * and L2ARCs do not have any pool guid in the label.)
	 */
	if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
	    vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
		return (B_TRUE);
	} else {
		ZFS_LOG(1, "guid mismatch for provider %s: "
		    "%ju:%ju != %ju:%ju.", vd->vdev_path,
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid,
		    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
		return (B_FALSE);
	}
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (!vdev_attach_ok(vd, pp))
					continue;
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
end:
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

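/*
 * Look up the provider named by vd->vdev_path (with the "/dev/" prefix
 * stripped) and attach to it, optionally verifying the label's guids.
 */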
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp))
			cp = vdev_geom_attach(pp, vd);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	size_t bufsize;
	int error;

	/* Set the TLS to indicate downstack that we should not access zvols. */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

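		/*
		 * Upgrade the consumer to read-write.  The open may
		 * transiently fail (e.g. if another consumer still holds
		 * the provider), so retry for up to ~2.5 seconds, dropping
		 * the topology lock while we sleep between attempts.
		 */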
		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
			    vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL)
		vdev_geom_attrchanged(cp, "GEOM::physpath");

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
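	/*
	 * Advertise a larger physical block size if the provider reports
	 * a power-of-2 stripe size that starts at offset zero and does
	 * not exceed SPA_MAXASHIFT.
	 */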
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
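	/* Treat a short transfer without an error code as an I/O error. */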
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;
	int error;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,			/* vdev_op_state_change */
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};