/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
        .name = "ZFS::VDEV",
        .version = G_VERSION,
        .attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;

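/*
 * Update the vdev's rotation rate from the provider's GEOM attribute,
 * falling back to "unknown" if the provider does not report one.
 */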
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
        int error;
        uint16_t rate;

        error = g_getattr("GEOM::rotation_rate", cp, &rate);
        if (error == 0)
                vd->vdev_rotation_rate = rate;
        else
                vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

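/*
 * Query the provider for its physical path and store a copy in the vdev.
 * If the path changed (or, when do_null_update is set, was previously
 * unset), request an async config update.
 */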
static void
vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update)
{
        boolean_t needs_update = B_FALSE;
        vdev_t *vd;
        char *physpath;
        int error, physpath_len;

        if (g_access(cp, 1, 0, 0) != 0)
                return;

        vd = cp->private;
        physpath_len = MAXPATHLEN;
        physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
        error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
        g_access(cp, -1, 0, 0);
        if (error == 0) {
                char *old_physpath;

                /* g_topology lock ensures that vdev has not been closed */
                g_topology_assert();
                old_physpath = vd->vdev_physpath;
                vd->vdev_physpath = spa_strdup(physpath);

                if (old_physpath != NULL) {
                        needs_update = (strcmp(old_physpath,
                                                vd->vdev_physpath) != 0);
                        spa_strfree(old_physpath);
                } else
                        needs_update = do_null_update;
        }
        g_free(physpath);

        /*
         * If the physical path changed, update the config.
         * Only request an update for previously unset physpaths if
         * requested by the caller.
         */
        if (needs_update)
                spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

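/*
 * GEOM attribute-change callback: react to rotation rate and physical
 * path changes on an attached provider.
 */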
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
        vdev_t *vd;

        vd = cp->private;
        if (vd == NULL)
                return;

        if (strcmp(attr, "GEOM::rotation_rate") == 0) {
                vdev_geom_set_rotation_rate(vd, cp);
                return;
        }

        if (strcmp(attr, "GEOM::physpath") == 0) {
                vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE);
                return;
        }
}

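/*
 * GEOM orphan callback: the underlying provider has gone away.
 */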
static void
vdev_geom_orphan(struct g_consumer *cp)
{
        vdev_t *vd;

        g_topology_assert();

        vd = cp->private;
        if (vd == NULL) {
                /* Vdev close in progress.  Ignore the event. */
                return;
        }

        /*
         * Orphan callbacks occur from the GEOM event thread.
         * Concurrent with this call, new I/O requests may be
         * working their way through GEOM about to find out
         * (only once executed by the g_down thread) that we've
         * been orphaned from our disk provider.  These I/Os
         * must be retired before we can detach our consumer.
         * This is most easily achieved by acquiring the
         * SPA ZIO configuration lock as a writer, but doing
         * so with the GEOM topology lock held would cause
         * a lock order reversal.  Instead, rely on the SPA's
         * async removal support to invoke a close on this
         * vdev once it is safe to do so.
         */
        vd->vdev_remove_wanted = B_TRUE;
        spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

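/*
 * Attach a consumer to the given provider, creating the "zfs::vdev"
 * geom on first use.  Returns the consumer with read and exclusive
 * access counts held, or NULL on failure.
 */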
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
        struct g_geom *gp;
        struct g_consumer *cp;
        int error;

        g_topology_assert();

        ZFS_LOG(1, "Attaching to %s.", pp->name);

        if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
                ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
                    pp->name, pp->sectorsize);
                return (NULL);
        } else if (pp->mediasize < SPA_MINDEVSIZE) {
                ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
                    pp->name, pp->mediasize);
                return (NULL);
        }

        /* Do we have geom already? No? Create one. */
        LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
                if (gp->flags & G_GEOM_WITHER)
                        continue;
                if (strcmp(gp->name, "zfs::vdev") != 0)
                        continue;
                break;
        }
        if (gp == NULL) {
                gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
                gp->orphan = vdev_geom_orphan;
                gp->attrchanged = vdev_geom_attrchanged;
                cp = g_new_consumer(gp);
                error = g_attach(cp, pp);
                if (error != 0) {
                        ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
                            __LINE__, error);
                        vdev_geom_detach(cp, B_FALSE);
                        return (NULL);
                }
                error = g_access(cp, 1, 0, 1);
                if (error != 0) {
                        ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
                            __LINE__, error);
                        vdev_geom_detach(cp, B_FALSE);
                        return (NULL);
                }
                ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
        } else {
                /* Check if we are already connected to this provider. */
                LIST_FOREACH(cp, &gp->consumer, consumer) {
                        if (cp->provider == pp) {
                                ZFS_LOG(1, "Found consumer for %s.", pp->name);
                                break;
                        }
                }
                if (cp == NULL) {
                        cp = g_new_consumer(gp);
                        error = g_attach(cp, pp);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
                                    __func__, __LINE__, error);
                                vdev_geom_detach(cp, B_FALSE);
                                return (NULL);
                        }
                        error = g_access(cp, 1, 0, 1);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
                                    __func__, __LINE__, error);
                                vdev_geom_detach(cp, B_FALSE);
                                return (NULL);
                        }
                        ZFS_LOG(1, "Created consumer for %s.", pp->name);
                } else {
                        error = g_access(cp, 1, 0, 1);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
                                    __func__, __LINE__, error);
                                return (NULL);
                        }
                        ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
                }
        }

        /*
         * BUG: cp may already belong to a vdev.  This could happen if:
         * 1) That vdev is a shared spare, or
         * 2) We are trying to reopen a missing vdev and we are scanning by
         *    guid.  In that case, we'll ultimately fail to open this consumer,
         *    but not until after setting the private field.
         * The solution is to:
         * 1) Don't set the private field until after the open succeeds, and
         * 2) Set it to a linked list of vdevs, not just a single vdev
         */
        cp->private = vd;
        if (vd != NULL) {
                vd->vdev_tsd = cp;
                vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE);
        }

        cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
        return (cp);
}

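/*
 * Release a consumer, destroying it (and its geom, if no consumers
 * remain) on last close.  open_for_read indicates whether this caller
 * holds the read/exclusive counts taken by vdev_geom_attach().
 */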
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
        struct g_geom *gp;

        g_topology_assert();

        ZFS_LOG(1, "Detaching consumer. Provider %s.",
            cp->provider && cp->provider->name ? cp->provider->name : "NULL");

        cp->private = NULL;

        gp = cp->geom;
        if (open_for_read)
                g_access(cp, -1, 0, -1);
        /* Destroy consumer on last close. */
        if (cp->acr == 0 && cp->ace == 0) {
                if (cp->acw > 0)
                        g_access(cp, 0, -cp->acw, 0);
                if (cp->provider != NULL) {
                        ZFS_LOG(1, "Destroying consumer to %s.",
                            cp->provider->name ? cp->provider->name : "NULL");
                        g_detach(cp);
                }
                g_destroy_consumer(cp);
        }
        /* Destroy geom if there are no consumers left. */
        if (LIST_EMPTY(&gp->consumer)) {
                ZFS_LOG(1, "Destroyed geom %s.", gp->name);
                g_wither_geom(gp, ENXIO);
        }
}

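/*
 * Close the vdev's consumer.  Caller must hold the GEOM topology lock.
 */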
static void
vdev_geom_close_locked(vdev_t *vd)
{
        struct g_consumer *cp;

        g_topology_assert();

        cp = vd->vdev_tsd;
        vd->vdev_tsd = NULL;
        vd->vdev_delayed_close = B_FALSE;
        if (cp == NULL)
                return;

        ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

        vdev_geom_detach(cp, B_TRUE);
}

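/*
 * Extract the pool and vdev guids from a label's nvlist.  Missing
 * entries leave the caller-supplied values untouched.
 */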
static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

        (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
        (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.
 * Each I/O operation is described by parallel entries from each array.
 * Because large requests are split at maxio boundaries, more bios may
 * be issued than there are entries in the arrays.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
        struct bio **bios;
        u_char *p;
        off_t off, maxio, s, end;
        int i, n_bios, j;
        size_t bios_size;

        maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
        n_bios = 0;

        /* How many bios are required for all commands? */
        for (i = 0; i < ncmds; i++)
                n_bios += (sizes[i] + maxio - 1) / maxio;

        /* Allocate memory for the bios */
        bios_size = n_bios * sizeof(struct bio *);
        bios = kmem_zalloc(bios_size, KM_SLEEP);

        /* Prepare and issue all of the bios */
        for (i = j = 0; i < ncmds; i++) {
                off = offsets[i];
                p = datas[i];
                s = sizes[i];
                end = off + s;
                ASSERT((off % cp->provider->sectorsize) == 0);
                ASSERT((s % cp->provider->sectorsize) == 0);

                for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
                        bios[j] = g_alloc_bio();
                        bios[j]->bio_cmd = cmds[i];
                        bios[j]->bio_done = NULL;
                        bios[j]->bio_offset = off;
                        bios[j]->bio_length = MIN(s, maxio);
                        bios[j]->bio_data = p;
                        g_io_request(bios[j], cp);
                }
        }
        ASSERT(j == n_bios);

        /* Wait for all of the bios to complete, and clean them up */
        for (i = j = 0; i < ncmds; i++) {
                off = offsets[i];
                s = sizes[i];
                end = off + s;

                for (; off < end; off += maxio, s -= maxio, j++) {
                        errors[i] = biowait(bios[j], "vdev_geom_io") ||
                            errors[i];
                        g_destroy_bio(bios[j]);
                }
        }
        kmem_free(bios, bios_size);
}

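/*
 * Read the vdev config from the provider's label areas, returning the
 * first label whose pool state and txg checks pass.  Returns ENOENT if
 * no valid config could be read.
 */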
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
        struct g_provider *pp;
        vdev_phys_t *vdev_lists[VDEV_LABELS];
        char *buf;
        size_t buflen;
        uint64_t psize, state, txg;
        off_t offsets[VDEV_LABELS];
        off_t size;
        off_t sizes[VDEV_LABELS];
        int cmds[VDEV_LABELS];
        int errors[VDEV_LABELS];
        int l;

        g_topology_assert_not();

        pp = cp->provider;
        ZFS_LOG(1, "Reading config from %s...", pp->name);

        psize = pp->mediasize;
        psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

        size = sizeof(*vdev_lists[0]) + pp->sectorsize -
            ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

        buflen = sizeof(vdev_lists[0]->vp_nvlist);

        *config = NULL;
        /* Create all of the IO requests */
        for (l = 0; l < VDEV_LABELS; l++) {
                cmds[l] = BIO_READ;
                vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
                offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
                sizes[l] = size;
                errors[l] = 0;
                ASSERT(offsets[l] % pp->sectorsize == 0);
        }

        /* Issue the IO requests */
        vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
            VDEV_LABELS);

        /* Parse the labels */
        for (l = 0; l < VDEV_LABELS; l++) {
                if (errors[l] != 0)
                        continue;

                buf = vdev_lists[l]->vp_nvlist;

                if (nvlist_unpack(buf, buflen, config, 0) != 0)
                        continue;

                if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
                    &state) != 0 || state > POOL_STATE_L2CACHE) {
                        nvlist_free(*config);
                        *config = NULL;
                        continue;
                }

                if (state != POOL_STATE_SPARE &&
                    state != POOL_STATE_L2CACHE &&
                    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
                    &txg) != 0 || txg == 0)) {
                        nvlist_free(*config);
                        *config = NULL;
                        continue;
                }

                break;
        }

        /* Free the label storage */
        for (l = 0; l < VDEV_LABELS; l++)
                kmem_free(vdev_lists[l], size);

        return (*config == NULL ? ENOENT : 0);
}

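/*
 * Grow the configs array so that it can hold an entry at index id,
 * preserving any existing entries.
 */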
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
        nvlist_t **new_configs;
        uint64_t i;

        if (id < *count)
                return;
        new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
            KM_SLEEP);
        for (i = 0; i < *count; i++)
                new_configs[i] = (*configs)[i];
        if (*configs != NULL)
                kmem_free(*configs, *count * sizeof(void *));
        *configs = new_configs;
        *count = id + 1;
}

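/*
 * Record a vdev config in the configs array if it belongs to the named
 * pool and is newer (by txg) than any config already recorded for the
 * same top-level vdev id.  Consumes cfg: it is either stored or freed.
 */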
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
        nvlist_t *vdev_tree;
        uint64_t pool_guid;
        uint64_t vdev_guid;
        uint64_t id, txg, known_txg;
        char *pname;

        if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
            strcmp(pname, name) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
                goto ignore;

        if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
                goto ignore;

        VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

        if (*known_pool_guid != 0) {
                if (pool_guid != *known_pool_guid)
                        goto ignore;
        } else
                *known_pool_guid = pool_guid;

        resize_configs(configs, count, id);

        if ((*configs)[id] != NULL) {
                VERIFY(nvlist_lookup_uint64((*configs)[id],
                    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
                if (txg <= known_txg)
                        goto ignore;
                nvlist_free((*configs)[id]);
        }

        (*configs)[id] = cfg;
        return;

ignore:
        nvlist_free(cfg);
}

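/*
 * Taste every eligible GEOM provider in the system and collect the best
 * (highest-txg) config for each top-level vdev of the named pool.
 */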
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
        struct g_class *mp;
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *zcp;
        nvlist_t *vdev_cfg;
        uint64_t pool_guid;
        int error;

        DROP_GIANT();
        g_topology_lock();

        *configs = NULL;
        *count = 0;
        pool_guid = 0;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp == &zfs_vdev_class)
                        continue;
                LIST_FOREACH(gp, &mp->geom, geom) {
                        if (gp->flags & G_GEOM_WITHER)
                                continue;
                        LIST_FOREACH(pp, &gp->provider, provider) {
                                if (pp->flags & G_PF_WITHER)
                                        continue;
                                zcp = vdev_geom_attach(pp, NULL);
                                if (zcp == NULL)
                                        continue;
                                g_topology_unlock();
                                error = vdev_geom_read_config(zcp, &vdev_cfg);
                                g_topology_lock();
                                vdev_geom_detach(zcp, B_TRUE);
                                if (error)
                                        continue;
                                ZFS_LOG(1, "successfully read vdev config");

                                process_vdev_config(configs, count,
                                    vdev_cfg, name, &pool_guid);
                        }
                }
        }
        g_topology_unlock();
        PICKUP_GIANT();

        return (*count > 0 ? 0 : ENOENT);
}

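/*
 * Read the pool and vdev guids from the label of an attached consumer.
 * Both are returned as 0 if no valid config can be read.
 */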
static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
        nvlist_t *config;

        g_topology_assert_not();

        *pguid = 0;
        *vguid = 0;
        if (vdev_geom_read_config(cp, &config) == 0) {
                nvlist_get_guids(config, pguid, vguid);
                nvlist_free(config);
        }
}

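/*
 * Taste the given provider and decide whether its label matches the
 * vdev we are trying to open.
 */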
static boolean_t
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
        uint64_t pool_guid;
        uint64_t vdev_guid;
        struct g_consumer *zcp;

        zcp = vdev_geom_attach(pp, NULL);
        if (zcp == NULL) {
                ZFS_LOG(1, "Unable to attach tasting instance to %s.",
                    pp->name);
                return (B_FALSE);
        }
        g_topology_unlock();
        vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
        g_topology_lock();
        vdev_geom_detach(zcp, B_TRUE);

        /*
         * Check that the label's vdev guid matches the desired guid.  If the
         * label has a pool guid, check that it matches too. (Inactive spares
         * and L2ARCs do not have any pool guid in the label.)
         */
        if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
            vdev_guid == vd->vdev_guid) {
                ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
                return (B_TRUE);
        } else {
                ZFS_LOG(1, "guid mismatch for provider %s: "
                    "%ju:%ju != %ju:%ju.", vd->vdev_path,
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid,
                    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
                return (B_FALSE);
        }
}

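/*
 * Walk all GEOM providers looking for one whose label matches the
 * vdev's guids, and attach to the first match.
 */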
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
        struct g_class *mp;
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        cp = NULL;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp == &zfs_vdev_class)
                        continue;
                LIST_FOREACH(gp, &mp->geom, geom) {
                        if (gp->flags & G_GEOM_WITHER)
                                continue;
                        LIST_FOREACH(pp, &gp->provider, provider) {
                                if (!vdev_attach_ok(vd, pp))
                                        continue;
                                cp = vdev_geom_attach(pp, vd);
                                if (cp == NULL) {
                                        printf("ZFS WARNING: Unable to "
                                            "attach to %s.\n", pp->name);
                                        continue;
                                }
                                break;
                        }
                        if (cp != NULL)
                                break;
                }
                if (cp != NULL)
                        break;
        }
        return (cp);
}

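/*
 * Search for the vdev by guid and, on success, rewrite vdev_path to
 * point at the provider that was actually found.
 */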
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
        struct g_consumer *cp;
        char *buf;
        size_t len;

        g_topology_assert();

        ZFS_LOG(1, "Searching by guids [%ju:%ju].",
                (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
        cp = vdev_geom_attach_by_guids(vd);
        if (cp != NULL) {
                len = strlen(cp->provider->name) + strlen("/dev/") + 1;
                buf = kmem_alloc(len, KM_SLEEP);

                snprintf(buf, len, "/dev/%s", cp->provider->name);
                spa_strfree(vd->vdev_path);
                vd->vdev_path = buf;

                ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid, vd->vdev_path);
        } else {
                ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid);
        }

        return (cp);
}

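/*
 * Open the provider named by vdev_path, optionally verifying that its
 * label carries the expected guids.
 */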
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        cp = NULL;
        pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
        if (pp != NULL) {
                ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
                if (!check_guid || vdev_attach_ok(vd, pp))
                        cp = vdev_geom_attach(pp, vd);
        }

        return (cp);
}

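/*
 * Open a vdev: locate the backing provider (by path, then by guid),
 * acquire write access if needed, and report the device's size and
 * alignment to the caller.
 */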
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
        struct g_provider *pp;
        struct g_consumer *cp;
        int error;

        /* Set the TLS to indicate downstack that we should not access zvols */
        VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

        /*
         * We must have a pathname, and it must be absolute.
         */
        if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
                vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
                return (EINVAL);
        }

        vd->vdev_tsd = NULL;

        DROP_GIANT();
        g_topology_lock();
        error = 0;

        if (vd->vdev_spa->spa_splitting_newspa ||
            (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
             (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
              vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE))) {
                /*
                 * We are dealing with a vdev that hasn't been previously
                 * opened (since boot), and we are not loading an
                 * existing pool configuration.  This looks like a
                 * vdev add operation to a new or existing pool.
                 * Assume the user knows what he/she is doing and find
                 * GEOM provider by its name, ignoring GUID mismatches.
                 *
                 * XXPOLICY: It would be safer to only allow a device
                 *           that is unlabeled or labeled but missing
                 *           GUID information to be opened in this fashion,
                 *           unless we are doing a split, in which case we
                 *           should allow any guid.
                 */
                cp = vdev_geom_open_by_path(vd, 0);
        } else {
                /*
                 * Try using the recorded path for this device, but only
                 * accept it if its label data contains the expected GUIDs.
                 */
                cp = vdev_geom_open_by_path(vd, 1);
                if (cp == NULL) {
                        /*
                         * The device at vd->vdev_path doesn't have the
                         * expected GUIDs. The disks might have merely
                         * moved around so try all other GEOM providers
                         * to find one with the right GUIDs.
                         */
                        cp = vdev_geom_open_by_guids(vd);
                }
        }

        /* Clear the TLS now that tasting is done */
        VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

        if (cp == NULL) {
                ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
                error = ENOENT;
        } else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
            !ISP2(cp->provider->sectorsize)) {
                ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
                    vd->vdev_path);

                vdev_geom_close_locked(vd);
                error = EINVAL;
                cp = NULL;
        } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
                int i;

                for (i = 0; i < 5; i++) {
                        error = g_access(cp, 0, 1, 0);
                        if (error == 0)
                                break;
                        g_topology_unlock();
                        tsleep(vd, 0, "vdev", hz / 2);
                        g_topology_lock();
                }
                if (error != 0) {
                        printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
                            vd->vdev_path, error);
                        vdev_geom_close_locked(vd);
                        cp = NULL;
                }
        }

        /* Fetch initial physical path information for this device. */
        if (cp != NULL)
                vdev_geom_attrchanged(cp, "GEOM::physpath");

        g_topology_unlock();
        PICKUP_GIANT();
        if (cp == NULL) {
                vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
                return (error);
        }
        pp = cp->provider;

        /*
         * Determine the actual size of the device.
         */
        *max_psize = *psize = pp->mediasize;

        /*
         * Determine the device's minimum transfer size and preferred
         * transfer size.
         */
        *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
        *physical_ashift = 0;
        if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
            pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
                *physical_ashift = highbit(pp->stripesize) - 1;

        /*
         * Clear the nowritecache settings, so that on a vdev_reopen()
         * we will try again.
         */
        vd->vdev_nowritecache = B_FALSE;

        /*
         * Determine the device's rotation rate.
         */
        vdev_geom_set_rotation_rate(vd, cp);

        return (0);
}

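/*
 * Close the vdev, taking the GEOM topology lock around the real work.
 */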
static void
vdev_geom_close(vdev_t *vd)
{

        DROP_GIANT();
        g_topology_lock();
        vdev_geom_close_locked(vd);
        g_topology_unlock();
        PICKUP_GIANT();
}

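/*
 * bio completion handler: translate the bio's status into the zio's
 * error and note devices that do not support flush or TRIM.
 */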
static void
vdev_geom_io_intr(struct bio *bp)
{
        vdev_t *vd;
        zio_t *zio;

        zio = bp->bio_caller1;
        vd = zio->io_vd;
        zio->io_error = bp->bio_error;
        if (zio->io_error == 0 && bp->bio_resid != 0)
                zio->io_error = SET_ERROR(EIO);

        switch (zio->io_error) {
        case ENOTSUP:
                /*
                 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
                 * that future attempts will never succeed. In this case
                 * we set a persistent flag so that we don't bother with
                 * requests in the future.
                 */
                switch (bp->bio_cmd) {
                case BIO_FLUSH:
                        vd->vdev_nowritecache = B_TRUE;
                        break;
                case BIO_DELETE:
                        vd->vdev_notrim = B_TRUE;
                        break;
                }
                break;
        case ENXIO:
                if (!vd->vdev_remove_wanted) {
                        /*
                         * If provider's error is set we assume it is being
                         * removed.
                         */
                        if (bp->bio_to->error != 0) {
                                vd->vdev_remove_wanted = B_TRUE;
                                spa_async_request(zio->io_spa,
                                    SPA_ASYNC_REMOVE);
                        } else if (!vd->vdev_delayed_close) {
                                vd->vdev_delayed_close = B_TRUE;
                        }
                }
                break;
        }
        g_destroy_bio(bp);
        zio_delay_interrupt(zio);
}

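/*
 * Start an I/O: translate the zio into a bio and hand it to GEOM.
 * Flush and TRIM requests may be completed immediately when the device
 * does not support them or when they are administratively disabled.
 */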
static void
vdev_geom_io_start(zio_t *zio)
{
        vdev_t *vd;
        struct g_consumer *cp;
        struct bio *bp;

        vd = zio->io_vd;

        switch (zio->io_type) {
        case ZIO_TYPE_IOCTL:
                /* XXPOLICY */
                if (!vdev_readable(vd)) {
                        zio->io_error = SET_ERROR(ENXIO);
                        zio_interrupt(zio);
                        return;
                } else {
                        switch (zio->io_cmd) {
                        case DKIOCFLUSHWRITECACHE:
                                if (zfs_nocacheflush ||
                                    vdev_geom_bio_flush_disable)
                                        break;
                                if (vd->vdev_nowritecache) {
                                        zio->io_error = SET_ERROR(ENOTSUP);
                                        break;
                                }
                                goto sendreq;
                        default:
                                zio->io_error = SET_ERROR(ENOTSUP);
                        }
                }

                zio_execute(zio);
                return;
        case ZIO_TYPE_FREE:
                if (vd->vdev_notrim) {
                        zio->io_error = SET_ERROR(ENOTSUP);
                } else if (!vdev_geom_bio_delete_disable) {
                        goto sendreq;
                }
                zio_execute(zio);
                return;
        }
sendreq:
        ASSERT(zio->io_type == ZIO_TYPE_READ ||
            zio->io_type == ZIO_TYPE_WRITE ||
            zio->io_type == ZIO_TYPE_FREE ||
            zio->io_type == ZIO_TYPE_IOCTL);

        cp = vd->vdev_tsd;
        if (cp == NULL) {
                zio->io_error = SET_ERROR(ENXIO);
                zio_interrupt(zio);
                return;
        }
        bp = g_alloc_bio();
        bp->bio_caller1 = zio;
        switch (zio->io_type) {
        case ZIO_TYPE_READ:
        case ZIO_TYPE_WRITE:
                zio->io_target_timestamp = zio_handle_io_delay(zio);
                bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
                    BIO_READ : BIO_WRITE;
                bp->bio_data = zio->io_data;
                bp->bio_offset = zio->io_offset;
                bp->bio_length = zio->io_size;
                break;
        case ZIO_TYPE_FREE:
                bp->bio_cmd = BIO_DELETE;
                bp->bio_data = NULL;
                bp->bio_offset = zio->io_offset;
                bp->bio_length = zio->io_size;
                break;
        case ZIO_TYPE_IOCTL:
                bp->bio_cmd = BIO_FLUSH;
                bp->bio_flags |= BIO_ORDERED;
                bp->bio_data = NULL;
                bp->bio_offset = cp->provider->mediasize;
                bp->bio_length = 0;
                break;
        }
        bp->bio_done = vdev_geom_io_intr;

        g_io_request(bp, cp);
}

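/*
 * Nothing to do on completion; all work happens in vdev_geom_io_intr().
 */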
static void
vdev_geom_io_done(zio_t *zio)
{
}

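/*
 * The hold and rele vdev operations are no-ops for GEOM-backed vdevs.
 */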
static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

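/* Operation vector for GEOM-backed leaf vdevs. */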
vdev_ops_t vdev_geom_ops = {
        vdev_geom_open,
        vdev_geom_close,
        vdev_default_asize,
        vdev_geom_io_start,
        vdev_geom_io_done,
        NULL,
        vdev_geom_hold,
        vdev_geom_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};