/*
 * sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
 * (FreeBSD stable/10, MFC r368207,368607)
 */
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23  * All rights reserved.
24  *
25  * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26  */
27
28 #include <sys/zfs_context.h>
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/bio.h>
32 #include <sys/disk.h>
33 #include <sys/spa.h>
34 #include <sys/spa_impl.h>
35 #include <sys/vdev_impl.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/zio.h>
38 #include <geom/geom.h>
39 #include <geom/geom_int.h>
40
41 /*
42  * Virtual device vector for GEOM.
43  */
44
static g_attrchanged_t vdev_geom_attrchanged;

/* GEOM class used by ZFS to taste and attach to vdev providers. */
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

/*
 * One list element per vdev sharing a g_consumer.  The SLIST head is
 * stored directly in g_consumer.private (see the static assert below).
 */
struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem) elems;
	vdev_t				*vd;	/* vdev backed by this consumer */
};

SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
/* The SLIST head must fit inside g_consumer's private pointer field. */
_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
    == sizeof(struct consumer_priv_t*),
    "consumer_priv_t* can't be stored in g_consumer.private");
61
/* Register the ZFS::VDEV class with GEOM. */
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
85
86 static void
87 vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
88
89         int error;
90         uint16_t rate;
91
92         error = g_getattr("GEOM::rotation_rate", cp, &rate);
93         if (error == 0)
94                 vd->vdev_rotation_rate = rate;
95         else
96                 vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
97 }
98
/*
 * Refresh vd->vdev_physpath from the provider's GEOM::physpath attribute
 * and, if the path changed, request an async config update so the new
 * path is persisted.  For a vdev whose physpath was previously unset, an
 * update is requested only when do_null_update is true.
 * Must be called with the g_topology lock held.
 */
static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
		       boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
						vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);

}
136
137 static void
138 vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
139 {
140         char *old_physpath;
141         struct consumer_priv_t *priv;
142         struct consumer_vdev_elem *elem;
143         int error;
144
145         priv = (struct consumer_priv_t*)&cp->private;
146         if (SLIST_EMPTY(priv))
147                 return;
148
149         SLIST_FOREACH(elem, priv, elems) {
150                 vdev_t *vd = elem->vd;
151                 if (strcmp(attr, "GEOM::rotation_rate") == 0) {
152                         vdev_geom_set_rotation_rate(vd, cp);
153                         return;
154                 }
155                 if (strcmp(attr, "GEOM::physpath") == 0) {
156                         vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
157                         return;
158                 }
159         }
160 }
161
/*
 * GEOM orphan callback: the underlying provider has gone away.  Flag
 * every vdev using this consumer for asynchronous removal; the SPA
 * async thread will close the vdev once it is safe to do so.
 */
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress.  Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}
196
/*
 * Attach a consumer to the given provider and open it for read with an
 * exclusive count (g_access r=1, w=0, e=1).  Reuses this class's single
 * "zfs::vdev" geom and any existing consumer already attached to the
 * same provider.  When sanity is true, providers with an unusable sector
 * size or too-small media are rejected.  If vd is non-NULL, the consumer
 * is recorded in vd->vdev_tsd.  Returns the consumer, or NULL on failure.
 * Must be called with the g_topology lock held.
 */
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (sanity) {
		/* Sector size must be a power of two no larger than a label pad. */
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
			ZFS_LOG(1, "Failing attach of %s. "
				   "Incompatible sectorsize %d\n",
			    pp->name, pp->sectorsize);
			return (NULL);
		} else if (pp->mediasize < SPA_MINDEVSIZE) {
			ZFS_LOG(1, "Failing attach of %s. "
				   "Incompatible mediasize %ju\n",
			    pp->name, pp->mediasize);
			return (NULL);
		}
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
			       __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			/* Existing consumer: take an additional r1e1 reference. */
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}
292
/*
 * Release the read+exclusive reference taken by vdev_geom_attach (when
 * open_for_read) and destroy the consumer once its last access
 * reference is gone; wither the geom when it loses its last consumer.
 * Must be called with the g_topology lock held.
 */
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}
323
/*
 * Close this vdev's consumer: unlink the vdev from the consumer's
 * private vdev list, clear vdev_tsd, and drop our access reference
 * (destroying the consumer if this was its last user).
 * Must be called with the g_topology lock held.
 */
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t*)&cp->private;
	vd->vdev_tsd = NULL;
	/* Safe variant: we free elements while iterating. */
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}
351
/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
 * operation is described by parallel entries from each array.  There may be
 * more bios actually issued than entries in the array: any command larger
 * than maxio is split into multiple maxio-sized bios.  Each offset and size
 * must be a multiple of the provider's sector size (enforced by ASSERTs).
 * Blocks until all bios complete; per-command status is left in errors[].
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

	/* Largest per-bio transfer: MAXPHYS rounded down to a sector multiple. */
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands ? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio*);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			/* A command fails if any of its split bios failed. */
			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}
413
/*
 * Read the vdev config from a device.  Return the number of valid labels
 * that were found.  The vdev config will be returned in *configp if and
 * only if at least one valid label was found (the config of the last
 * valid label wins).  Performs I/O, so the g_topology lock must NOT be
 * held.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
{
	struct g_provider *pp;
	nvlist_t *config;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	/* Size of each label read, rounded up to a sector boundary. */
	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	config = *configp = NULL;
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		/* Reject labels with a missing or unrecognized pool state. */
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(config);
			continue;
		}

		/* Labels other than spare/L2ARC must carry a nonzero txg. */
		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(config);
			continue;
		}

		if (*configp != NULL)
			nvlist_free(*configp);
		*configp = config;

		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}
501
502 static void
503 resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
504 {
505         nvlist_t **new_configs;
506         uint64_t i;
507
508         if (id < *count)
509                 return;
510         new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
511             KM_SLEEP);
512         for (i = 0; i < *count; i++)
513                 new_configs[i] = (*configs)[i];
514         if (*configs != NULL)
515                 kmem_free(*configs, *count * sizeof(void *));
516         *configs = new_configs;
517         *count = id + 1;
518 }
519
520 static void
521 process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
522     const char *name, uint64_t* known_pool_guid)
523 {
524         nvlist_t *vdev_tree;
525         uint64_t pool_guid;
526         uint64_t vdev_guid, known_guid;
527         uint64_t id, txg, known_txg;
528         char *pname;
529         int i;
530
531         if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
532             strcmp(pname, name) != 0)
533                 goto ignore;
534
535         if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
536                 goto ignore;
537
538         if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
539                 goto ignore;
540
541         if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
542                 goto ignore;
543
544         if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
545                 goto ignore;
546
547         VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
548
549         if (*known_pool_guid != 0) {
550                 if (pool_guid != *known_pool_guid)
551                         goto ignore;
552         } else
553                 *known_pool_guid = pool_guid;
554
555         resize_configs(configs, count, id);
556
557         if ((*configs)[id] != NULL) {
558                 VERIFY(nvlist_lookup_uint64((*configs)[id],
559                     ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
560                 if (txg <= known_txg)
561                         goto ignore;
562                 nvlist_free((*configs)[id]);
563         }
564
565         (*configs)[id] = cfg;
566         return;
567
568 ignore:
569         nvlist_free(cfg);
570 }
571
572 int
573 vdev_geom_read_pool_label(const char *name,
574     nvlist_t ***configs, uint64_t *count)
575 {
576         struct g_class *mp;
577         struct g_geom *gp;
578         struct g_provider *pp;
579         struct g_consumer *zcp;
580         nvlist_t *vdev_cfg;
581         uint64_t pool_guid;
582         int error, nlabels;
583
584         DROP_GIANT();
585         g_topology_lock();
586
587         *configs = NULL;
588         *count = 0;
589         pool_guid = 0;
590         LIST_FOREACH(mp, &g_classes, class) {
591                 if (mp == &zfs_vdev_class)
592                         continue;
593                 LIST_FOREACH(gp, &mp->geom, geom) {
594                         if (gp->flags & G_GEOM_WITHER)
595                                 continue;
596                         LIST_FOREACH(pp, &gp->provider, provider) {
597                                 if (pp->flags & G_PF_WITHER)
598                                         continue;
599                                 zcp = vdev_geom_attach(pp, NULL, B_TRUE);
600                                 if (zcp == NULL)
601                                         continue;
602                                 g_topology_unlock();
603                                 nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
604                                 g_topology_lock();
605                                 vdev_geom_detach(zcp, B_TRUE);
606                                 if (nlabels == 0)
607                                         continue;
608                                 ZFS_LOG(1, "successfully read vdev config");
609
610                                 process_vdev_config(configs, count,
611                                     vdev_cfg, name, &pool_guid);
612                         }
613                 }
614         }
615         g_topology_unlock();
616         PICKUP_GIANT();
617
618         return (*count > 0 ? 0 : ENOENT);
619 }
620
/*
 * Result of grading a provider's label against a vdev's guids.
 * Ordered so that a numerically larger value is a stronger match.
 */
enum match {
	NO_MATCH = 0,		/* No matching labels found */
	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1,		/* Should never be returned */
	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
	TWO_MATCH = 3,		/* 2 labels matching the vdev_guid */
	THREE_MATCH = 4,	/* 3 labels matching the vdev_guid */
	FULL_MATCH = 5		/* all labels match the vdev_guid */
};
630
/*
 * Taste the provider and grade how well its label matches the vdev.
 * The pool guid must match (except for inactive spares/L2ARCs, whose
 * labels carry none); the grade then reflects how many labels carry
 * the vdev's guid, with a bare top-level-guid match graded lowest.
 */
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL, B_TRUE);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0) {
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles possible race on vdev detach, when
	 * remaining vdev receives GUID of destroyed top level mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}
689
/*
 * Walk every provider in the system, grade each with vdev_attach_ok(),
 * and attach to the best match found (stopping early on FULL_MATCH).
 * Returns the attached consumer, or NULL if no provider matched.
 * Must be called with the g_topology lock held.
 */
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	enum match match, best_match;

	g_topology_assert();

	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		/* Skip our own class: no point tasting ZFS vdev consumers. */
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp) {
		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}
732
/*
 * Search all providers for one whose label matches this vdev's pool and
 * vdev guids, attach to it, and rewrite vd->vdev_path to the provider's
 * /dev name.  Returns the consumer, or NULL if the search failed.
 * Must be called with the g_topology lock held.
 */
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		/* Replace the stale path with the provider we actually found. */
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}
764
765 static struct g_consumer *
766 vdev_geom_open_by_path(vdev_t *vd, int check_guid)
767 {
768         struct g_provider *pp;
769         struct g_consumer *cp;
770
771         g_topology_assert();
772
773         cp = NULL;
774         pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
775         if (pp != NULL) {
776                 ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
777                 if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
778                         cp = vdev_geom_attach(pp, vd, B_FALSE);
779         }
780
781         return (cp);
782 }
783
784 static int
785 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
786     uint64_t *logical_ashift, uint64_t *physical_ashift)
787 {
788         struct g_provider *pp;
789         struct g_consumer *cp;
790         size_t bufsize;
791         int error;
792
793         /* Set the TLS to indicate downstack that we should not access zvols*/
794         VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
795
796         /*
797          * We must have a pathname, and it must be absolute.
798          */
799         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
800                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
801                 return (EINVAL);
802         }
803
804         /*
805          * Reopen the device if it's not currently open. Otherwise,
806          * just update the physical size of the device.
807          */
808         if ((cp = vd->vdev_tsd) != NULL) {
809                 ASSERT(vd->vdev_reopening);
810                 goto skip_open;
811         }
812
813         DROP_GIANT();
814         g_topology_lock();
815         error = 0;
816
817         if (vd->vdev_spa->spa_splitting_newspa ||
818             (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
819              vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
820              vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
821                 /*
822                  * We are dealing with a vdev that hasn't been previously
823                  * opened (since boot), and we are not loading an
824                  * existing pool configuration.  This looks like a
825                  * vdev add operation to a new or existing pool.
826                  * Assume the user knows what he/she is doing and find
827                  * GEOM provider by its name, ignoring GUID mismatches.
828                  *
829                  * XXPOLICY: It would be safer to only allow a device
830                  *           that is unlabeled or labeled but missing
831                  *           GUID information to be opened in this fashion,
832                  *           unless we are doing a split, in which case we
833                  *           should allow any guid.
834                  */
835                 cp = vdev_geom_open_by_path(vd, 0);
836         } else {
837                 /*
838                  * Try using the recorded path for this device, but only
839                  * accept it if its label data contains the expected GUIDs.
840                  */
841                 cp = vdev_geom_open_by_path(vd, 1);
842                 if (cp == NULL) {
843                         /*
844                          * The device at vd->vdev_path doesn't have the
845                          * expected GUIDs. The disks might have merely
846                          * moved around so try all other GEOM providers
847                          * to find one with the right GUIDs.
848                          */
849                         cp = vdev_geom_open_by_guids(vd);
850                 }
851         }
852
853         /* Clear the TLS now that tasting is done */
854         VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
855
856         if (cp == NULL) {
857                 ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
858                 error = ENOENT;
859         } else {
860                 struct consumer_priv_t *priv;
861                 struct consumer_vdev_elem *elem;
862                 int spamode;
863
864                 priv = (struct consumer_priv_t*)&cp->private;
865                 if (cp->private == NULL)
866                         SLIST_INIT(priv);
867                 elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
868                 elem->vd = vd;
869                 SLIST_INSERT_HEAD(priv, elem, elems);
870
871                 spamode = spa_mode(vd->vdev_spa);
872                 if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
873                     !ISP2(cp->provider->sectorsize)) {
874                         ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
875                             cp->provider->name);
876
877                         vdev_geom_close_locked(vd);
878                         error = EINVAL;
879                         cp = NULL;
880                 } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
881                         int i;
882
883                         for (i = 0; i < 5; i++) {
884                                 error = g_access(cp, 0, 1, 0);
885                                 if (error == 0)
886                                         break;
887                                 g_topology_unlock();
888                                 tsleep(vd, 0, "vdev", hz / 2);
889                                 g_topology_lock();
890                         }
891                         if (error != 0) {
892                                 printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
893                                     cp->provider->name, error);
894                                 vdev_geom_close_locked(vd);
895                                 cp = NULL;
896                         }
897                 }
898         }
899
900         /* Fetch initial physical path information for this device. */
901         if (cp != NULL) {
902                 vdev_geom_attrchanged(cp, "GEOM::physpath");
903         
904                 /* Set other GEOM characteristics */
905                 vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
906                 vdev_geom_set_rotation_rate(vd, cp);
907         }
908
909         g_topology_unlock();
910         PICKUP_GIANT();
911         if (cp == NULL) {
912                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
913                 return (error);
914         }
915 skip_open:
916         pp = cp->provider;
917
918         /*
919          * Determine the actual size of the device.
920          */
921         *max_psize = *psize = pp->mediasize;
922
923         /*
924          * Determine the device's minimum transfer size and preferred
925          * transfer size.
926          */
927         *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
928         *physical_ashift = 0;
929         if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
930             pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
931                 *physical_ashift = highbit(pp->stripesize) - 1;
932
933         /*
934          * Clear the nowritecache settings, so that on a vdev_reopen()
935          * we will try again.
936          */
937         vd->vdev_nowritecache = B_FALSE;
938
939         return (0);
940 }
941
942 static void
943 vdev_geom_close(vdev_t *vd)
944 {
945         struct g_consumer *cp;
946
947         cp = vd->vdev_tsd;
948
949         DROP_GIANT();
950         g_topology_lock();
951
952         if (!vd->vdev_reopening ||
953             (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
954             (cp->provider != NULL && cp->provider->error != 0))))
955                 vdev_geom_close_locked(vd);
956
957         g_topology_unlock();
958         PICKUP_GIANT();
959 }
960
961 static void
962 vdev_geom_io_intr(struct bio *bp)
963 {
964         vdev_t *vd;
965         zio_t *zio;
966
967         zio = bp->bio_caller1;
968         vd = zio->io_vd;
969         zio->io_error = bp->bio_error;
970         if (zio->io_error == 0 && bp->bio_resid != 0)
971                 zio->io_error = SET_ERROR(EIO);
972
973         switch(zio->io_error) {
974         case ENOTSUP:
975                 /*
976                  * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
977                  * that future attempts will never succeed. In this case
978                  * we set a persistent flag so that we don't bother with
979                  * requests in the future.
980                  */
981                 switch(bp->bio_cmd) {
982                 case BIO_FLUSH:
983                         vd->vdev_nowritecache = B_TRUE;
984                         break;
985                 case BIO_DELETE:
986                         vd->vdev_notrim = B_TRUE;
987                         break;
988                 }
989                 break;
990         case ENXIO:
991                 if (!vd->vdev_remove_wanted) {
992                         /*
993                          * If provider's error is set we assume it is being
994                          * removed.
995                          */
996                         if (bp->bio_to->error != 0) {
997                                 vd->vdev_remove_wanted = B_TRUE;
998                                 spa_async_request(zio->io_spa,
999                                     SPA_ASYNC_REMOVE);
1000                         } else if (!vd->vdev_delayed_close) {
1001                                 vd->vdev_delayed_close = B_TRUE;
1002                         }
1003                 }
1004                 break;
1005         }
1006         g_destroy_bio(bp);
1007         zio_delay_interrupt(zio);
1008 }
1009
1010 static void
1011 vdev_geom_io_start(zio_t *zio)
1012 {
1013         vdev_t *vd;
1014         struct g_consumer *cp;
1015         struct bio *bp;
1016         int error;
1017
1018         vd = zio->io_vd;
1019
1020         switch (zio->io_type) {
1021         case ZIO_TYPE_IOCTL:
1022                 /* XXPOLICY */
1023                 if (!vdev_readable(vd)) {
1024                         zio->io_error = SET_ERROR(ENXIO);
1025                         zio_interrupt(zio);
1026                         return;
1027                 } else {
1028                         switch (zio->io_cmd) {
1029                         case DKIOCFLUSHWRITECACHE:
1030                                 if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
1031                                         break;
1032                                 if (vd->vdev_nowritecache) {
1033                                         zio->io_error = SET_ERROR(ENOTSUP);
1034                                         break;
1035                                 }
1036                                 goto sendreq;
1037                         default:
1038                                 zio->io_error = SET_ERROR(ENOTSUP);
1039                         }
1040                 }
1041
1042                 zio_execute(zio);
1043                 return;
1044         case ZIO_TYPE_FREE:
1045                 if (vd->vdev_notrim) {
1046                         zio->io_error = SET_ERROR(ENOTSUP);
1047                 } else if (!vdev_geom_bio_delete_disable) {
1048                         goto sendreq;
1049                 }
1050                 zio_execute(zio);
1051                 return;
1052         }
1053 sendreq:
1054         ASSERT(zio->io_type == ZIO_TYPE_READ ||
1055             zio->io_type == ZIO_TYPE_WRITE ||
1056             zio->io_type == ZIO_TYPE_FREE ||
1057             zio->io_type == ZIO_TYPE_IOCTL);
1058
1059         cp = vd->vdev_tsd;
1060         if (cp == NULL) {
1061                 zio->io_error = SET_ERROR(ENXIO);
1062                 zio_interrupt(zio);
1063                 return;
1064         }
1065         bp = g_alloc_bio();
1066         bp->bio_caller1 = zio;
1067         switch (zio->io_type) {
1068         case ZIO_TYPE_READ:
1069         case ZIO_TYPE_WRITE:
1070                 zio->io_target_timestamp = zio_handle_io_delay(zio);
1071                 bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
1072                 bp->bio_data = zio->io_data;
1073                 bp->bio_offset = zio->io_offset;
1074                 bp->bio_length = zio->io_size;
1075                 break;
1076         case ZIO_TYPE_FREE:
1077                 bp->bio_cmd = BIO_DELETE;
1078                 bp->bio_data = NULL;
1079                 bp->bio_offset = zio->io_offset;
1080                 bp->bio_length = zio->io_size;
1081                 break;
1082         case ZIO_TYPE_IOCTL:
1083                 bp->bio_cmd = BIO_FLUSH;
1084                 bp->bio_flags |= BIO_ORDERED;
1085                 bp->bio_data = NULL;
1086                 bp->bio_offset = cp->provider->mediasize;
1087                 bp->bio_length = 0;
1088                 break;
1089         }
1090         bp->bio_done = vdev_geom_io_intr;
1091
1092         g_io_request(bp, cp);
1093 }
1094
/*
 * Intentionally empty: completion handling for GEOM-backed vdevs is
 * performed in the bio callback, vdev_geom_io_intr().
 */
static void
vdev_geom_io_done(zio_t *zio)
{
}
1099
/*
 * Intentionally empty: this vdev type requires no extra hold
 * bookkeeping beyond its open GEOM consumer.
 */
static void
vdev_geom_hold(vdev_t *vd)
{
}
1104
/*
 * Intentionally empty: counterpart of vdev_geom_hold(); nothing to
 * release here.
 */
static void
vdev_geom_rele(vdev_t *vd)
{
}
1109
/*
 * Operation vector for disk (GEOM-backed) vdevs.  Entries are
 * positional vdev_ops_t initializers: open, close, asize, io_start,
 * io_done, then the remaining hooks annotated below.
 */
vdev_ops_t vdev_geom_ops = {
        vdev_geom_open,
        vdev_geom_close,
        vdev_default_asize,
        vdev_geom_io_start,
        vdev_geom_io_done,
        NULL,                   /* presumably vdev_op_state_change; not needed here — confirm against vdev_impl.h */
        vdev_geom_hold,
        vdev_geom_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};