/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
        .name = "ZFS::VDEV",
        .version = G_VERSION,
        .attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
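
/*
 * Usage note (editor's addition): both knobs above are CTLFLAG_RWTUN, so
 * they can be set as loader tunables or adjusted at runtime, e.g.:
 *
 *	# sysctl vfs.zfs.vdev.bio_flush_disable=1
 *	# sysctl vfs.zfs.vdev.bio_delete_disable=1
 */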

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread-local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t * that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
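
/*
 * Sketch (editor's addition, assumptions noted): a GEOM class whose
 * providers must not be opened while ZFS is tasting -- zvols, for
 * example -- can consult this key from its open routine.  Assuming the
 * key was created with tsd_create() at module load, the check would
 * look roughly like:
 *
 *	if (tsd_get(zfs_geom_probe_vdev_key) != NULL)
 *		return (EOPNOTSUPP);
 */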

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
        int error;
        uint16_t rate;

        error = g_getattr("GEOM::rotation_rate", cp, &rate);
        if (error == 0)
                vd->vdev_rotation_rate = rate;
        else
                vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
        vdev_t *vd;
        spa_t *spa;
        char *physpath;
        int error, physpath_len;

        vd = cp->private;
        if (vd == NULL)
                return;

        if (strcmp(attr, "GEOM::rotation_rate") == 0) {
                vdev_geom_set_rotation_rate(vd, cp);
                return;
        }

        if (strcmp(attr, "GEOM::physpath") != 0)
                return;

        if (g_access(cp, 1, 0, 0) != 0)
                return;

        /*
         * Record/Update physical path information for this device.
         */
        spa = vd->vdev_spa;
        physpath_len = MAXPATHLEN;
        physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
        error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
        g_access(cp, -1, 0, 0);
        if (error == 0) {
                char *old_physpath;

                /* g_topology lock ensures that vdev has not been closed */
                g_topology_assert();
                old_physpath = vd->vdev_physpath;
                vd->vdev_physpath = spa_strdup(physpath);
                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

                if (old_physpath != NULL)
                        spa_strfree(old_physpath);
        }
        g_free(physpath);
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
        vdev_t *vd;

        g_topology_assert();

        vd = cp->private;
        if (vd == NULL) {
                /* Vdev close in progress.  Ignore the event. */
                return;
        }

        /*
         * Orphan callbacks occur from the GEOM event thread.
         * Concurrent with this call, new I/O requests may be
         * working their way through GEOM about to find out
         * (only once executed by the g_down thread) that we've
         * been orphaned from our disk provider.  These I/Os
         * must be retired before we can detach our consumer.
         * This is most easily achieved by acquiring the
         * SPA ZIO configuration lock as a writer, but doing
         * so with the GEOM topology lock held would cause
         * a lock order reversal.  Instead, rely on the SPA's
         * async removal support to invoke a close on this
         * vdev once it is safe to do so.
         */
        vd->vdev_remove_wanted = B_TRUE;
        spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
        struct g_geom *gp;
        struct g_consumer *cp;
        int error;

        g_topology_assert();

        ZFS_LOG(1, "Attaching to %s.", pp->name);

        if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
                ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
                    pp->name, pp->sectorsize);
                return (NULL);
        } else if (pp->mediasize < SPA_MINDEVSIZE) {
                ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
                    pp->name, pp->mediasize);
                return (NULL);
        }

        /* Do we have geom already? No? Create one. */
        LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
                if (gp->flags & G_GEOM_WITHER)
                        continue;
                if (strcmp(gp->name, "zfs::vdev") != 0)
                        continue;
                break;
        }
        if (gp == NULL) {
                gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
                gp->orphan = vdev_geom_orphan;
                gp->attrchanged = vdev_geom_attrchanged;
                cp = g_new_consumer(gp);
                error = g_attach(cp, pp);
                if (error != 0) {
                        ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
                            __LINE__, error);
                        vdev_geom_detach(cp, B_FALSE);
                        return (NULL);
                }
                error = g_access(cp, 1, 0, 1);
                if (error != 0) {
                        ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
                            __LINE__, error);
                        vdev_geom_detach(cp, B_FALSE);
                        return (NULL);
                }
                ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
        } else {
                /* Check if we are already connected to this provider. */
                LIST_FOREACH(cp, &gp->consumer, consumer) {
                        if (cp->provider == pp) {
                                ZFS_LOG(1, "Found consumer for %s.", pp->name);
                                break;
                        }
                }
                if (cp == NULL) {
                        cp = g_new_consumer(gp);
                        error = g_attach(cp, pp);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
                                    __func__, __LINE__, error);
                                vdev_geom_detach(cp, B_FALSE);
                                return (NULL);
                        }
                        error = g_access(cp, 1, 0, 1);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
                                    __func__, __LINE__, error);
                                vdev_geom_detach(cp, B_FALSE);
                                return (NULL);
                        }
                        ZFS_LOG(1, "Created consumer for %s.", pp->name);
                } else {
                        error = g_access(cp, 1, 0, 1);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
                                    __func__, __LINE__, error);
                                return (NULL);
                        }
                        ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
                }
        }

        /*
         * BUG: cp may already belong to a vdev.  This could happen if:
         * 1) That vdev is a shared spare, or
         * 2) We are trying to reopen a missing vdev and we are scanning by
         *    guid.  In that case, we'll ultimately fail to open this consumer,
         *    but not until after setting the private field.
         * The solution is to:
         * 1) Don't set the private field until after the open succeeds, and
         * 2) Set it to a linked list of vdevs, not just a single vdev
         */
        cp->private = vd;
        if (vd != NULL)
                vd->vdev_tsd = cp;

        cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
        return (cp);
}
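
/*
 * Editor's sketch of the fix proposed in the BUG comment above (not
 * implemented in this revision): publish the vdev association only after
 * the open has fully succeeded, e.g. in the caller:
 *
 *	cp = vdev_geom_attach(pp, NULL);
 *	error = ...open and validate the consumer...;
 *	if (error == 0) {
 *		cp->private = vd;
 *		vd->vdev_tsd = cp;
 *	}
 */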

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
        struct g_geom *gp;
        vdev_t *vd;

        g_topology_assert();

        ZFS_LOG(1, "Detaching consumer. Provider %s.",
            cp->provider && cp->provider->name ? cp->provider->name : "NULL");

        vd = cp->private;
        cp->private = NULL;

        gp = cp->geom;
        if (open_for_read)
                g_access(cp, -1, 0, -1);
        /* Destroy consumer on last close. */
        if (cp->acr == 0 && cp->ace == 0) {
                if (cp->acw > 0)
                        g_access(cp, 0, -cp->acw, 0);
                if (cp->provider != NULL) {
                        ZFS_LOG(1, "Destroying consumer to %s.",
                            cp->provider->name ? cp->provider->name : "NULL");
                        g_detach(cp);
                }
                g_destroy_consumer(cp);
        }
        /* Destroy geom if there are no consumers left. */
        if (LIST_EMPTY(&gp->consumer)) {
                ZFS_LOG(1, "Destroyed geom %s.", gp->name);
                g_wither_geom(gp, ENXIO);
        }
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
        struct g_consumer *cp;

        g_topology_assert();

        cp = vd->vdev_tsd;
        vd->vdev_tsd = NULL;
        vd->vdev_delayed_close = B_FALSE;
        if (cp == NULL)
                return;

        ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

        vdev_geom_detach(cp, B_TRUE);
}

static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

        (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
        (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each
 * IO operation is described by parallel entries from each array.  There may
 * be more bios actually issued than entries in the arrays, because large
 * operations are split to honor the provider's maximum transfer size.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
        struct bio **bios;
        u_char *p;
        off_t off, maxio, s, end;
        int i, n_bios, j;
        size_t bios_size;

        maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
        n_bios = 0;

        /* How many bios are required for all commands? */
        for (i = 0; i < ncmds; i++)
                n_bios += (sizes[i] + maxio - 1) / maxio;

        /* Allocate memory for the bios */
        bios_size = n_bios * sizeof(struct bio *);
        bios = kmem_zalloc(bios_size, KM_SLEEP);

        /* Prepare and issue all of the bios */
        for (i = j = 0; i < ncmds; i++) {
                off = offsets[i];
                p = datas[i];
                s = sizes[i];
                end = off + s;
                ASSERT((off % cp->provider->sectorsize) == 0);
                ASSERT((s % cp->provider->sectorsize) == 0);

                for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
                        bios[j] = g_alloc_bio();
                        bios[j]->bio_cmd = cmds[i];
                        bios[j]->bio_done = NULL;
                        bios[j]->bio_offset = off;
                        bios[j]->bio_length = MIN(s, maxio);
                        bios[j]->bio_data = p;
                        g_io_request(bios[j], cp);
                }
        }
        ASSERT(j == n_bios);

        /* Wait for all of the bios to complete, and clean them up */
        for (i = j = 0; i < ncmds; i++) {
                off = offsets[i];
                s = sizes[i];
                end = off + s;

                for (; off < end; off += maxio, s -= maxio, j++) {
                        errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
                        g_destroy_bio(bios[j]);
                }
        }
        kmem_free(bios, bios_size);
}
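
/*
 * Minimal usage sketch (editor's addition): synchronously read one sector
 * from the start of the provider.  All arrays have length 1, and "buf" is
 * assumed to be a caller-supplied buffer of at least one sector:
 *
 *	int cmd = BIO_READ, error = 0;
 *	off_t off = 0, size = cp->provider->sectorsize;
 *	void *data = buf;
 *	vdev_geom_io(cp, &cmd, &data, &off, &size, &error, 1);
 *	(on return, error holds the biowait() result for the request)
 */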

static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
        struct g_provider *pp;
        vdev_phys_t *vdev_lists[VDEV_LABELS];
        char *buf;
        size_t buflen;
        uint64_t psize, state, txg;
        off_t offsets[VDEV_LABELS];
        off_t size;
        off_t sizes[VDEV_LABELS];
        int cmds[VDEV_LABELS];
        int errors[VDEV_LABELS];
        int l;

        g_topology_assert_not();

        pp = cp->provider;
        ZFS_LOG(1, "Reading config from %s...", pp->name);

        psize = pp->mediasize;
        psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

        size = sizeof(*vdev_lists[0]) + pp->sectorsize -
            ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

        buflen = sizeof(vdev_lists[0]->vp_nvlist);

        *config = NULL;
        /* Create all of the IO requests */
        for (l = 0; l < VDEV_LABELS; l++) {
                cmds[l] = BIO_READ;
                vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
                offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
                sizes[l] = size;
                errors[l] = 0;
                ASSERT(offsets[l] % pp->sectorsize == 0);
        }

        /* Issue the IO requests */
        vdev_geom_io(cp, cmds, (void **)vdev_lists, offsets, sizes, errors,
            VDEV_LABELS);

        /* Parse the labels */
        for (l = 0; l < VDEV_LABELS; l++) {
                if (errors[l] != 0)
                        continue;

                buf = vdev_lists[l]->vp_nvlist;

                if (nvlist_unpack(buf, buflen, config, 0) != 0)
                        continue;

                if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
                    &state) != 0 || state > POOL_STATE_L2CACHE) {
                        nvlist_free(*config);
                        *config = NULL;
                        continue;
                }

                if (state != POOL_STATE_SPARE &&
                    state != POOL_STATE_L2CACHE &&
                    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
                    &txg) != 0 || txg == 0)) {
                        nvlist_free(*config);
                        *config = NULL;
                        continue;
                }

                break;
        }

        /* Free the label storage */
        for (l = 0; l < VDEV_LABELS; l++)
                kmem_free(vdev_lists[l], size);

        return (*config == NULL ? ENOENT : 0);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
        nvlist_t **new_configs;
        uint64_t i;

        if (id < *count)
                return;
        new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
            KM_SLEEP);
        for (i = 0; i < *count; i++)
                new_configs[i] = (*configs)[i];
        if (*configs != NULL)
                kmem_free(*configs, *count * sizeof(void *));
        *configs = new_configs;
        *count = id + 1;
}

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
        nvlist_t *vdev_tree;
        uint64_t pool_guid;
        uint64_t vdev_guid;
        uint64_t id, txg, known_txg;
        char *pname;

        if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
            strcmp(pname, name) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
                goto ignore;

        if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
                goto ignore;

        VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

        if (*known_pool_guid != 0) {
                if (pool_guid != *known_pool_guid)
                        goto ignore;
        } else
                *known_pool_guid = pool_guid;

        resize_configs(configs, count, id);

        if ((*configs)[id] != NULL) {
                VERIFY(nvlist_lookup_uint64((*configs)[id],
                    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
                if (txg <= known_txg)
                        goto ignore;
                nvlist_free((*configs)[id]);
        }

        (*configs)[id] = cfg;
        return;

ignore:
        nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
        struct g_class *mp;
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *zcp;
        nvlist_t *vdev_cfg;
        uint64_t pool_guid;
        int error;

        DROP_GIANT();
        g_topology_lock();

        *configs = NULL;
        *count = 0;
        pool_guid = 0;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp == &zfs_vdev_class)
                        continue;
                LIST_FOREACH(gp, &mp->geom, geom) {
                        if (gp->flags & G_GEOM_WITHER)
                                continue;
                        LIST_FOREACH(pp, &gp->provider, provider) {
                                if (pp->flags & G_PF_WITHER)
                                        continue;
                                zcp = vdev_geom_attach(pp, NULL);
                                if (zcp == NULL)
                                        continue;
                                g_topology_unlock();
                                error = vdev_geom_read_config(zcp, &vdev_cfg);
                                g_topology_lock();
                                vdev_geom_detach(zcp, B_TRUE);
                                if (error)
                                        continue;
                                ZFS_LOG(1, "successfully read vdev config");

                                process_vdev_config(configs, count,
                                    vdev_cfg, name, &pool_guid);
                        }
                }
        }
        g_topology_unlock();
        PICKUP_GIANT();

        return (*count > 0 ? 0 : ENOENT);
}
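
/*
 * Caller sketch (editor's addition): gather the newest label config for
 * each top-level vdev of a hypothetical pool named "tank", then release
 * them.  The configs array is indexed by top-level vdev id and may contain
 * NULL entries for vdevs whose labels were not found:
 *
 *	nvlist_t **configs;
 *	uint64_t i, count;
 *
 *	if (vdev_geom_read_pool_label("tank", &configs, &count) == 0) {
 *		for (i = 0; i < count; i++) {
 *			if (configs[i] != NULL)
 *				nvlist_free(configs[i]);
 *		}
 *		kmem_free(configs, count * sizeof(nvlist_t *));
 *	}
 */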

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
        nvlist_t *config;

        g_topology_assert_not();

        *pguid = 0;
        *vguid = 0;
        if (vdev_geom_read_config(cp, &config) == 0) {
                nvlist_get_guids(config, pguid, vguid);
                nvlist_free(config);
        }
}

static boolean_t
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
        uint64_t pool_guid;
        uint64_t vdev_guid;
        struct g_consumer *zcp;

        zcp = vdev_geom_attach(pp, NULL);
        if (zcp == NULL) {
                ZFS_LOG(1, "Unable to attach tasting instance to %s.",
                    pp->name);
                return (B_FALSE);
        }
        g_topology_unlock();
        vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
        g_topology_lock();
        vdev_geom_detach(zcp, B_TRUE);

        /*
         * Check that the label's vdev guid matches the desired guid.  If the
         * label has a pool guid, check that it matches too. (Inactive spares
         * and L2ARCs do not have any pool guid in the label.)
         */
        if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
            vdev_guid == vd->vdev_guid) {
                ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
                return (B_TRUE);
        } else {
                ZFS_LOG(1, "guid mismatch for provider %s: "
                    "%ju:%ju != %ju:%ju.", vd->vdev_path,
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid,
                    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
                return (B_FALSE);
        }
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
        struct g_class *mp;
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        cp = NULL;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp == &zfs_vdev_class)
                        continue;
                LIST_FOREACH(gp, &mp->geom, geom) {
                        if (gp->flags & G_GEOM_WITHER)
                                continue;
                        LIST_FOREACH(pp, &gp->provider, provider) {
                                if (!vdev_attach_ok(vd, pp))
                                        continue;
                                cp = vdev_geom_attach(pp, vd);
                                if (cp == NULL) {
                                        printf("ZFS WARNING: Unable to "
                                            "attach to %s.\n", pp->name);
                                        continue;
                                }
                                break;
                        }
                        if (cp != NULL)
                                break;
                }
                if (cp != NULL)
                        break;
        }
        return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
        struct g_consumer *cp;
        char *buf;
        size_t len;

        g_topology_assert();

        ZFS_LOG(1, "Searching by guids [%ju:%ju].",
                (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
        cp = vdev_geom_attach_by_guids(vd);
        if (cp != NULL) {
                len = strlen(cp->provider->name) + strlen("/dev/") + 1;
                buf = kmem_alloc(len, KM_SLEEP);

                snprintf(buf, len, "/dev/%s", cp->provider->name);
                spa_strfree(vd->vdev_path);
                vd->vdev_path = buf;

                ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid, vd->vdev_path);
        } else {
                ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid);
        }

        return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        cp = NULL;
        pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
        if (pp != NULL) {
                ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
                if (!check_guid || vdev_attach_ok(vd, pp))
                        cp = vdev_geom_attach(pp, vd);
        }

        return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
        struct g_provider *pp;
        struct g_consumer *cp;
        int error;

        /* Set the TLS to indicate down the stack that we should not access zvols. */
        VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

        /*
         * We must have a pathname, and it must be absolute.
         */
        if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
                vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
                return (EINVAL);
        }

        vd->vdev_tsd = NULL;

        DROP_GIANT();
        g_topology_lock();
        error = 0;

        if (vd->vdev_spa->spa_splitting_newspa ||
            (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
             vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) {
                /*
                 * We are dealing with a vdev that hasn't been previously
                 * opened (since boot), and we are not loading an
                 * existing pool configuration.  This looks like a
                 * vdev add operation to a new or existing pool.
                 * Assume the user knows what he/she is doing and find
                 * GEOM provider by its name, ignoring GUID mismatches.
                 *
                 * XXPOLICY: It would be safer to only allow a device
                 *           that is unlabeled or labeled but missing
                 *           GUID information to be opened in this fashion,
                 *           unless we are doing a split, in which case we
                 *           should allow any guid.
                 */
                cp = vdev_geom_open_by_path(vd, 0);
        } else {
                /*
                 * Try using the recorded path for this device, but only
                 * accept it if its label data contains the expected GUIDs.
                 */
                cp = vdev_geom_open_by_path(vd, 1);
                if (cp == NULL) {
                        /*
                         * The device at vd->vdev_path doesn't have the
                         * expected GUIDs. The disks might have merely
                         * moved around so try all other GEOM providers
                         * to find one with the right GUIDs.
                         */
                        cp = vdev_geom_open_by_guids(vd);
                }
        }

        /* Clear the TLS now that tasting is done */
        VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

        if (cp == NULL) {
                ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
                error = ENOENT;
        } else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
            !ISP2(cp->provider->sectorsize)) {
                ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
                    vd->vdev_path);

                vdev_geom_close_locked(vd);
                error = EINVAL;
                cp = NULL;
        } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
                int i;

                for (i = 0; i < 5; i++) {
                        error = g_access(cp, 0, 1, 0);
                        if (error == 0)
                                break;
                        g_topology_unlock();
                        tsleep(vd, 0, "vdev", hz / 2);
                        g_topology_lock();
                }
                if (error != 0) {
                        printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
                            vd->vdev_path, error);
                        vdev_geom_close_locked(vd);
                        cp = NULL;
                }
        }

        /* Fetch initial physical path information for this device. */
        if (cp != NULL)
                vdev_geom_attrchanged(cp, "GEOM::physpath");

        g_topology_unlock();
        PICKUP_GIANT();
        if (cp == NULL) {
                vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
                return (error);
        }
        pp = cp->provider;

        /*
         * Determine the actual size of the device.
         */
        *max_psize = *psize = pp->mediasize;

        /*
         * Determine the device's minimum transfer size and preferred
         * transfer size.
         */
        *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
        *physical_ashift = 0;
        if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
            pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
                *physical_ashift = highbit(pp->stripesize) - 1;
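
        /*
         * Worked example (editor's addition): a 512e "Advanced Format"
         * disk typically reports sectorsize 512, stripesize 4096, and
         * stripeoffset 0, yielding *logical_ashift = 9 (512-byte minimum
         * I/O) and *physical_ashift = 12 (4 KiB preferred I/O).
         */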

        /*
         * Clear the nowritecache settings, so that on a vdev_reopen()
         * we will try again.
         */
        vd->vdev_nowritecache = B_FALSE;

        /*
         * Determine the device's rotation rate.
         */
        vdev_geom_set_rotation_rate(vd, cp);

        return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

        DROP_GIANT();
        g_topology_lock();
        vdev_geom_close_locked(vd);
        g_topology_unlock();
        PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
        vdev_t *vd;
        zio_t *zio;

        zio = bp->bio_caller1;
        vd = zio->io_vd;
        zio->io_error = bp->bio_error;
        if (zio->io_error == 0 && bp->bio_resid != 0)
                zio->io_error = SET_ERROR(EIO);

        switch (zio->io_error) {
        case ENOTSUP:
                /*
                 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
                 * that future attempts will never succeed. In this case
                 * we set a persistent flag so that we don't bother with
                 * requests in the future.
                 */
                switch (bp->bio_cmd) {
                case BIO_FLUSH:
                        vd->vdev_nowritecache = B_TRUE;
                        break;
                case BIO_DELETE:
                        vd->vdev_notrim = B_TRUE;
                        break;
                }
                break;
        case ENXIO:
                if (!vd->vdev_remove_wanted) {
                        /*
                         * If provider's error is set we assume it is being
                         * removed.
                         */
                        if (bp->bio_to->error != 0) {
                                vd->vdev_remove_wanted = B_TRUE;
                                spa_async_request(zio->io_spa,
                                    SPA_ASYNC_REMOVE);
                        } else if (!vd->vdev_delayed_close) {
                                vd->vdev_delayed_close = B_TRUE;
                        }
                }
                break;
        }
        g_destroy_bio(bp);
        zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
        vdev_t *vd;
        struct g_consumer *cp;
        struct bio *bp;

        vd = zio->io_vd;

        switch (zio->io_type) {
        case ZIO_TYPE_IOCTL:
                /* XXPOLICY */
                if (!vdev_readable(vd)) {
                        zio->io_error = SET_ERROR(ENXIO);
                        zio_interrupt(zio);
                        return;
                } else {
                        switch (zio->io_cmd) {
                        case DKIOCFLUSHWRITECACHE:
                                if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
                                        break;
                                if (vd->vdev_nowritecache) {
                                        zio->io_error = SET_ERROR(ENOTSUP);
                                        break;
                                }
                                goto sendreq;
                        default:
                                zio->io_error = SET_ERROR(ENOTSUP);
                        }
                }

                zio_execute(zio);
                return;
        case ZIO_TYPE_FREE:
                if (vd->vdev_notrim) {
                        zio->io_error = SET_ERROR(ENOTSUP);
                } else if (!vdev_geom_bio_delete_disable) {
                        goto sendreq;
                }
                zio_execute(zio);
                return;
        }
sendreq:
        ASSERT(zio->io_type == ZIO_TYPE_READ ||
            zio->io_type == ZIO_TYPE_WRITE ||
            zio->io_type == ZIO_TYPE_FREE ||
            zio->io_type == ZIO_TYPE_IOCTL);

        cp = vd->vdev_tsd;
        if (cp == NULL) {
                zio->io_error = SET_ERROR(ENXIO);
                zio_interrupt(zio);
                return;
        }
        bp = g_alloc_bio();
        bp->bio_caller1 = zio;
        switch (zio->io_type) {
        case ZIO_TYPE_READ:
        case ZIO_TYPE_WRITE:
                zio->io_target_timestamp = zio_handle_io_delay(zio);
                bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
                bp->bio_data = zio->io_data;
                bp->bio_offset = zio->io_offset;
                bp->bio_length = zio->io_size;
                break;
        case ZIO_TYPE_FREE:
                bp->bio_cmd = BIO_DELETE;
                bp->bio_data = NULL;
                bp->bio_offset = zio->io_offset;
                bp->bio_length = zio->io_size;
                break;
        case ZIO_TYPE_IOCTL:
                bp->bio_cmd = BIO_FLUSH;
                bp->bio_flags |= BIO_ORDERED;
                bp->bio_data = NULL;
                bp->bio_offset = cp->provider->mediasize;
                bp->bio_length = 0;
                break;
        }
        bp->bio_done = vdev_geom_io_intr;

        g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
        vdev_geom_open,
        vdev_geom_close,
        vdev_default_asize,
        vdev_geom_io_start,
        vdev_geom_io_done,
        NULL,
        vdev_geom_hold,
        vdev_geom_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};