]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
MFC r298814 (by asomers): Fix a use-after-free when "zpool import" fails
[FreeBSD/stable/10.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / vdev_geom.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23  * All rights reserved.
24  *
25  * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26  */
27
28 #include <sys/zfs_context.h>
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/bio.h>
32 #include <sys/disk.h>
33 #include <sys/spa.h>
34 #include <sys/spa_impl.h>
35 #include <sys/vdev_impl.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/zio.h>
38 #include <geom/geom.h>
39 #include <geom/geom_int.h>
40
41 /*
42  * Virtual device vector for GEOM.
43  */
44
static g_attrchanged_t vdev_geom_attrchanged;
/* GEOM class used for all ZFS vdev consumers. */
struct g_class zfs_vdev_class = {
        .name = "ZFS::VDEV",
        .version = G_VERSION,
        .attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
75
76 static void
77 vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
78
79         int error;
80         uint16_t rate;
81
82         error = g_getattr("GEOM::rotation_rate", cp, &rate);
83         if (error == 0)
84                 vd->vdev_rotation_rate = rate;
85         else
86                 vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
87 }
88
/*
 * GEOM attribute-change callback for ZFS vdev consumers.  Handles
 * rotation-rate updates and records/updates the provider's physical
 * path into the vdev, requesting a pool config update when it changes.
 */
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	spa_t *spa;
	char *physpath;
	int error, physpath_len;

	/* cp->private is NULL while a vdev close is in progress. */
	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") != 0)
		return;

	/* Need read access on the consumer before issuing the getattr. */
	if (g_access(cp, 1, 0, 0) != 0)
		return;

	/*
	 * Record/Update physical path information for this device.
	 */
	spa = vd->vdev_spa;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);
		/* Propagate the new path into the on-disk pool config. */
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		if (old_physpath != NULL)
			spa_strfree(old_physpath);
	}
	g_free(physpath);
}
134
135 static void
136 vdev_geom_orphan(struct g_consumer *cp)
137 {
138         vdev_t *vd;
139
140         g_topology_assert();
141
142         vd = cp->private;
143         if (vd == NULL) {
144                 /* Vdev close in progress.  Ignore the event. */
145                 return;
146         }
147
148         /*
149          * Orphan callbacks occur from the GEOM event thread.
150          * Concurrent with this call, new I/O requests may be
151          * working their way through GEOM about to find out
152          * (only once executed by the g_down thread) that we've
153          * been orphaned from our disk provider.  These I/Os
154          * must be retired before we can detach our consumer.
155          * This is most easily achieved by acquiring the
156          * SPA ZIO configuration lock as a writer, but doing
157          * so with the GEOM topology lock held would cause
158          * a lock order reversal.  Instead, rely on the SPA's
159          * async removal support to invoke a close on this
160          * vdev once it is safe to do so.
161          */
162         vd->vdev_remove_wanted = B_TRUE;
163         spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
164 }
165
/*
 * Create (or reuse) the GEOM consumer connecting ZFS to provider "pp",
 * acquire read+exclusive access on it, and wire it to "vd" (which may be
 * NULL for tasting-only consumers).  Returns the consumer, or NULL if
 * the provider is unsuitable or access could not be obtained.  Caller
 * must hold the topology lock.
 */
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	/* Reject providers ZFS cannot use: bad sector size or too small. */
	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
		    pp->name, pp->sectorsize);
		return (NULL);
	} else if (pp->mediasize < SPA_MINDEVSIZE) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
		    pp->name, pp->mediasize);
		return (NULL);
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		/* First attach ever: create the shared "zfs::vdev" geom. */
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		/* Read + exclusive; write access is added later if needed. */
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			       __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			/* Existing consumer: just add another reference. */
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/* 
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	if (vd != NULL)
		vd->vdev_tsd = cp;

	/* Allow direct dispatch in both directions for this consumer. */
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}
268
269 static void
270 vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
271 {
272         struct g_geom *gp;
273         vdev_t *vd;
274
275         g_topology_assert();
276
277         ZFS_LOG(1, "Detaching consumer. Provider %s.",
278             cp->provider && cp->provider->name ? cp->provider->name : "NULL");
279
280         vd = cp->private;
281         cp->private = NULL;
282
283         gp = cp->geom;
284         if (open_for_read)
285                 g_access(cp, -1, 0, -1);
286         /* Destroy consumer on last close. */
287         if (cp->acr == 0 && cp->ace == 0) {
288                 if (cp->acw > 0)
289                         g_access(cp, 0, -cp->acw, 0);
290                 if (cp->provider != NULL) {
291                         ZFS_LOG(1, "Destroying consumer to %s.",
292                             cp->provider->name ? cp->provider->name : "NULL");
293                         g_detach(cp);
294                 }
295                 g_destroy_consumer(cp);
296         }
297         /* Destroy geom if there are no consumers left. */
298         if (LIST_EMPTY(&gp->consumer)) {
299                 ZFS_LOG(1, "Destroyed geom %s.", gp->name);
300                 g_wither_geom(gp, ENXIO);
301         }
302 }
303
304 static void
305 vdev_geom_close_locked(vdev_t *vd)
306 {
307         struct g_consumer *cp;
308
309         g_topology_assert();
310
311         cp = vd->vdev_tsd;
312         vd->vdev_tsd = NULL;
313         vd->vdev_delayed_close = B_FALSE;
314         if (cp == NULL)
315                 return;
316
317         ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
318
319         vdev_geom_detach(cp, B_TRUE);
320 }
321
322 static void
323 nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
324 {
325
326         (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
327         (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
328 }
329
/*
 * Synchronously transfer "size" bytes at "offset" for consumer "cp"
 * using command "cmd" (e.g. BIO_READ).  The transfer is split into
 * MAXPHYS-sized, sector-aligned chunks, each issued and waited on in
 * turn.  Returns 0 or the first bio error encountered.  Offset and
 * size must be sector multiples.
 */
static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off, maxio;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;	/* "offset" now marks the end of the transfer. */
	p = data;
	/* Largest per-bio chunk, rounded down to a sector multiple. */
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	error = 0;

	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
		/* The single bio is reused; clear it before each request. */
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, maxio);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}
364
/*
 * Read and unpack a vdev label config nvlist from "cp"'s provider.
 * Each of the VDEV_LABELS label locations is tried in order; the first
 * label that unpacks and carries a plausible pool state (and, for
 * non-spare/non-L2ARC labels, a nonzero txg) is returned.  Returns 0
 * with *config set on success, ENOENT otherwise.  Must be called
 * without the topology lock (does disk I/O).
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *p, *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t state, txg;
	int error, l, len;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	/* Usable media size, truncated to a whole number of labels. */
	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	/* Label buffer size, rounded up to a sector multiple. */
	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {

		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		/* Reject labels with a missing or bogus pool state. */
		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		/*
		 * Active vdevs must carry a valid txg; spares and L2ARC
		 * devices are exempt from this check.
		 */
		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, size);
	return (*config == NULL ? ENOENT : 0);
}
426
427 static void
428 resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
429 {
430         nvlist_t **new_configs;
431         uint64_t i;
432
433         if (id < *count)
434                 return;
435         new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
436             KM_SLEEP);
437         for (i = 0; i < *count; i++)
438                 new_configs[i] = (*configs)[i];
439         if (*configs != NULL)
440                 kmem_free(*configs, *count * sizeof(void *));
441         *configs = new_configs;
442         *count = id + 1;
443 }
444
445 static void
446 process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
447     const char *name, uint64_t* known_pool_guid)
448 {
449         nvlist_t *vdev_tree;
450         uint64_t pool_guid;
451         uint64_t vdev_guid, known_guid;
452         uint64_t id, txg, known_txg;
453         char *pname;
454         int i;
455
456         if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
457             strcmp(pname, name) != 0)
458                 goto ignore;
459
460         if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
461                 goto ignore;
462
463         if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
464                 goto ignore;
465
466         if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
467                 goto ignore;
468
469         if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
470                 goto ignore;
471
472         VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
473
474         if (*known_pool_guid != 0) {
475                 if (pool_guid != *known_pool_guid)
476                         goto ignore;
477         } else
478                 *known_pool_guid = pool_guid;
479
480         resize_configs(configs, count, id);
481
482         if ((*configs)[id] != NULL) {
483                 VERIFY(nvlist_lookup_uint64((*configs)[id],
484                     ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
485                 if (txg <= known_txg)
486                         goto ignore;
487                 nvlist_free((*configs)[id]);
488         }
489
490         (*configs)[id] = cfg;
491         return;
492
493 ignore:
494         nvlist_free(cfg);
495 }
496
/*
 * Taste every non-ZFS GEOM provider in the system looking for vdev
 * labels belonging to pool "name".  On success returns 0 with *configs
 * pointing to an array of *count label configs indexed by top-level
 * vdev id (entries may be NULL); returns ENOENT if nothing was found.
 */
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		/* Skip our own class; we only taste foreign providers. */
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL);
				if (zcp == NULL)
					continue;
				/*
				 * Reading the label does disk I/O, so drop
				 * the topology lock around it.
				 */
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				/* process_vdev_config() consumes vdev_cfg. */
				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}
545
546 static void
547 vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
548 {
549         nvlist_t *config;
550
551         g_topology_assert_not();
552
553         *pguid = 0;
554         *vguid = 0;
555         if (vdev_geom_read_config(cp, &config) == 0) {
556                 nvlist_get_guids(config, pguid, vguid);
557                 nvlist_free(config);
558         }
559 }
560
561 static boolean_t
562 vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
563 {
564         uint64_t pool_guid;
565         uint64_t vdev_guid;
566         struct g_consumer *zcp;
567         boolean_t pool_ok;
568         boolean_t vdev_ok;
569
570         zcp = vdev_geom_attach(pp, NULL);
571         if (zcp == NULL) {
572                 ZFS_LOG(1, "Unable to attach tasting instance to %s.",
573                     pp->name);
574                 return (B_FALSE);
575         }
576         g_topology_unlock();
577         vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
578         g_topology_lock();
579         vdev_geom_detach(zcp, B_TRUE);
580
581         /* 
582          * Check that the label's vdev guid matches the desired guid.  If the
583          * label has a pool guid, check that it matches too. (Inactive spares
584          * and L2ARCs do not have any pool guid in the label.)
585          */
586         if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
587             vdev_guid == vd->vdev_guid) {
588                 ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
589                 return (B_TRUE);
590         } else {
591                 ZFS_LOG(1, "guid mismatch for provider %s: "
592                     "%ju:%ju != %ju:%ju.", vd->vdev_path,
593                     (uintmax_t)spa_guid(vd->vdev_spa),
594                     (uintmax_t)vd->vdev_guid,
595                     (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
596                 return (B_FALSE);
597         }
598 }
599
600 static struct g_consumer *
601 vdev_geom_attach_by_guids(vdev_t *vd)
602 {
603         struct g_class *mp;
604         struct g_geom *gp;
605         struct g_provider *pp;
606         struct g_consumer *cp;
607
608         g_topology_assert();
609
610         cp = NULL;
611         LIST_FOREACH(mp, &g_classes, class) {
612                 if (mp == &zfs_vdev_class)
613                         continue;
614                 LIST_FOREACH(gp, &mp->geom, geom) {
615                         if (gp->flags & G_GEOM_WITHER)
616                                 continue;
617                         LIST_FOREACH(pp, &gp->provider, provider) {
618                                 if (!vdev_attach_ok(vd, pp))
619                                         continue;
620                                 cp = vdev_geom_attach(pp, vd);
621                                 if (cp == NULL) {
622                                         printf("ZFS WARNING: Unable to "
623                                             "attach to %s.\n", pp->name);
624                                         continue;
625                                 }
626                                 break;
627                         }
628                         if (cp != NULL)
629                                 break;
630                 }
631                 if (cp != NULL)
632                         break;
633         }
634 end:
635         return (cp);
636 }
637
/*
 * Locate and attach to the provider whose label matches this vdev's
 * pool and vdev guids, updating vd->vdev_path to the provider actually
 * found (the disk may have moved since the pool was last imported).
 * Returns the consumer or NULL.  Caller must hold the topology lock.
 */
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		/*
		 * len equals strlen(buf) + 1, so the eventual
		 * spa_strfree() of vdev_path frees the correct size.
		 */
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}
669
670 static struct g_consumer *
671 vdev_geom_open_by_path(vdev_t *vd, int check_guid)
672 {
673         struct g_provider *pp;
674         struct g_consumer *cp;
675
676         g_topology_assert();
677
678         cp = NULL;
679         pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
680         if (pp != NULL) {
681                 ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
682                 if (!check_guid || vdev_attach_ok(vd, pp))
683                         cp = vdev_geom_attach(pp, vd);
684         }
685
686         return (cp);
687 }
688
/*
 * vdev open entry point.  Locates and attaches to the GEOM provider
 * backing "vd" (by path, falling back to a guid search), acquires
 * write access when the pool is opened for writing, and reports the
 * device's size and ashift values to the caller.  Returns 0 or errno.
 */
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	size_t bufsize;		/* NOTE(review): appears unused here */
	int error;

	/* Set the TLS to indicate downstack that we should not access zvols*/
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 * NOTE(review): this early return leaves the probe TLS set above —
	 * confirm a caller or later open clears it.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	/*
	 * NOTE(review): && binds tighter than ||, so this groups as
	 * (prevstate UNKNOWN && load NONE) || load CREATE — confirm that
	 * grouping is intended rather than
	 * prevstate UNKNOWN && (load NONE || load CREATE).
	 */
	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		/*
		 * Write access may be held by a withering consumer; retry
		 * a few times, sleeping briefly between attempts.
		 */
		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
			    vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL)
		vdev_geom_attrchanged(cp, "GEOM::physpath");
	
	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}
824
/*
 * vdev close entry point: take the topology lock (dropping Giant) and
 * run the locked close routine.
 */
static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}
835
/*
 * Completion handler for asynchronous vdev BIOs.  Translates the bio
 * status into zio->io_error, latches "unsupported" results for flush
 * and trim so they are not retried, schedules async removal when the
 * provider reports an error, and finally retires the zio.
 */
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	/* A short transfer with no reported error is still a failure. */
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch(zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch(bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}
884
/*
 * Start a zio against the vdev's GEOM consumer.
 *
 * IOCTL (write-cache flush) and FREE (TRIM) requests may be completed
 * in-line when disabled by tunables or known to be unsupported by the
 * device; everything else is translated into a struct bio and handed
 * to GEOM via g_io_request(), with vdev_geom_io_intr() completing the
 * zio when the bio finishes.
 */
static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;
	int error;	/* NOTE(review): appears unused in this function */

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				/* Flushing disabled by tunable: no-op success. */
				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
					break;
				/* Device previously reported flush unsupported. */
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		/* Complete disabled/unsupported ioctls without touching GEOM. */
		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
	/* READ and WRITE fall out of the switch directly into sendreq. */
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	/* A NULL consumer means the device has been closed or detached. */
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;		/* recovered in vdev_geom_io_intr() */
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		/* Record any injected delay target before dispatch. */
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}
969
/*
 * No-op: completion work for GEOM-backed I/O happens in the bio
 * callback (vdev_geom_io_intr()), so nothing remains to do here.
 */
static void
vdev_geom_io_done(zio_t *zio)
{
}
974
/*
 * No-op hold hook; GEOM vdevs take no extra reference here — the open
 * GEOM consumer already keeps the underlying provider attached.
 */
static void
vdev_geom_hold(vdev_t *vd)
{
}
979
/*
 * No-op release hook, paired with the no-op vdev_geom_hold().
 */
static void
vdev_geom_rele(vdev_t *vd)
{
}
984
/*
 * Virtual device operations vector for GEOM-backed leaf vdevs.
 * Entries are positional and must follow the vdev_ops_t field order —
 * verify against the declaration in vdev_impl.h.
 */
vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,	/* no asize adjustment beyond the default */
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,			/* presumably the state-change hook; unused here */
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};