/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
        .name = "ZFS::VDEV",
        .version = G_VERSION,
        .attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
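
/*
 * Usage note (illustrative): both knobs are CTLFLAG_RWTUN, so they may be
 * set as loader tunables or flipped at runtime, e.g.:
 *
 *	# sysctl vfs.zfs.vdev.bio_flush_disable=1
 *	# echo 'vfs.zfs.vdev.bio_delete_disable="1"' >> /boot/loader.conf
 *
 * The OID names follow from the parent node (_vfs_zfs_vdev) plus the leaf
 * names declared above.
 */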

/* Declare local functions. */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
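
/*
 * Illustrative sketch (assumed consumer, not part of this file): code that
 * must not be re-entered during tasting, such as the zvol driver, can test
 * the key to reject opens that originate from a vdev probe:
 *
 *	if (tsd_get(zfs_geom_probe_vdev_key) != NULL)
 *		return (EOPNOTSUPP);	// hypothetical error choice
 *
 * vdev_geom_open() below brackets its tasting with
 * tsd_set(zfs_geom_probe_vdev_key, vd) and tsd_set(..., NULL).
 */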

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
        int error;
        uint16_t rate;

        error = g_getattr("GEOM::rotation_rate", cp, &rate);
        if (error == 0)
                vd->vdev_rotation_rate = rate;
        else
                vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update)
{
        boolean_t needs_update = B_FALSE;
        vdev_t *vd;
        char *physpath;
        int error, physpath_len;

        if (g_access(cp, 1, 0, 0) != 0)
                return;

        vd = cp->private;
        physpath_len = MAXPATHLEN;
        physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
        error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
        g_access(cp, -1, 0, 0);
        if (error == 0) {
                char *old_physpath;

                /* g_topology lock ensures that vdev has not been closed */
                g_topology_assert();
                old_physpath = vd->vdev_physpath;
                vd->vdev_physpath = spa_strdup(physpath);

                if (old_physpath != NULL) {
                        needs_update = (strcmp(old_physpath,
                                                vd->vdev_physpath) != 0);
                        spa_strfree(old_physpath);
                } else
                        needs_update = do_null_update;
        }
        g_free(physpath);

        /*
         * If the physical path changed, update the config.
         * Only request an update for previously unset physpaths if
         * requested by the caller.
         */
        if (needs_update)
                spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
        vdev_t *vd;

        vd = cp->private;
        if (vd == NULL)
                return;

        if (strcmp(attr, "GEOM::rotation_rate") == 0) {
                vdev_geom_set_rotation_rate(vd, cp);
                return;
        }

        if (strcmp(attr, "GEOM::physpath") == 0) {
                vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE);
                return;
        }
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
        vdev_t *vd;

        g_topology_assert();

        vd = cp->private;
        if (vd == NULL) {
                /* Vdev close in progress.  Ignore the event. */
                return;
        }

        /*
         * Orphan callbacks occur from the GEOM event thread.
         * Concurrent with this call, new I/O requests may be
         * working their way through GEOM about to find out
         * (only once executed by the g_down thread) that we've
         * been orphaned from our disk provider.  These I/Os
         * must be retired before we can detach our consumer.
         * This is most easily achieved by acquiring the
         * SPA ZIO configuration lock as a writer, but doing
         * so with the GEOM topology lock held would cause
         * a lock order reversal.  Instead, rely on the SPA's
         * async removal support to invoke a close on this
         * vdev once it is safe to do so.
         */
        vd->vdev_remove_wanted = B_TRUE;
        spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}
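
/*
 * Sketch of the deferred close this arranges (simplified, assumed flow):
 * the SPA async thread consumes SPA_ASYNC_REMOVE and walks the vdev tree,
 * closing any vdev that has vdev_remove_wanted set, roughly:
 *
 *	if (tasks & SPA_ASYNC_REMOVE) {
 *		spa_vdev_state_enter(spa, SCL_NONE);
 *		spa_async_remove(spa, spa->spa_root_vdev);
 *		(void) spa_vdev_state_exit(spa, NULL, 0);
 *	}
 *
 * so the close happens from a context that can take the SPA locks first,
 * avoiding the lock order reversal described above.
 */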

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
        struct g_geom *gp;
        struct g_consumer *cp;
        int error;

        g_topology_assert();

        ZFS_LOG(1, "Attaching to %s.", pp->name);

        if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
                ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
                    pp->name, pp->sectorsize);
                return (NULL);
        } else if (pp->mediasize < SPA_MINDEVSIZE) {
                ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
                    pp->name, pp->mediasize);
                return (NULL);
        }

        /* Do we have geom already? No? Create one. */
        LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
                if (gp->flags & G_GEOM_WITHER)
                        continue;
                if (strcmp(gp->name, "zfs::vdev") != 0)
                        continue;
                break;
        }
        if (gp == NULL) {
                gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
                gp->orphan = vdev_geom_orphan;
                gp->attrchanged = vdev_geom_attrchanged;
                cp = g_new_consumer(gp);
                error = g_attach(cp, pp);
                if (error != 0) {
                        ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
                            __LINE__, error);
                        vdev_geom_detach(cp, B_FALSE);
                        return (NULL);
                }
                error = g_access(cp, 1, 0, 1);
                if (error != 0) {
                        ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
                            __LINE__, error);
                        vdev_geom_detach(cp, B_FALSE);
                        return (NULL);
                }
                ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
        } else {
                /* Check if we are already connected to this provider. */
                LIST_FOREACH(cp, &gp->consumer, consumer) {
                        if (cp->provider == pp) {
                                ZFS_LOG(1, "Found consumer for %s.", pp->name);
                                break;
                        }
                }
                if (cp == NULL) {
                        cp = g_new_consumer(gp);
                        error = g_attach(cp, pp);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
                                    __func__, __LINE__, error);
                                vdev_geom_detach(cp, B_FALSE);
                                return (NULL);
                        }
                        error = g_access(cp, 1, 0, 1);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
                                    __func__, __LINE__, error);
                                vdev_geom_detach(cp, B_FALSE);
                                return (NULL);
                        }
                        ZFS_LOG(1, "Created consumer for %s.", pp->name);
                } else {
                        error = g_access(cp, 1, 0, 1);
                        if (error != 0) {
                                ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
                                    __func__, __LINE__, error);
                                return (NULL);
                        }
                        ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
                }
        }

        /*
         * BUG: cp may already belong to a vdev.  This could happen if:
         * 1) That vdev is a shared spare, or
         * 2) We are trying to reopen a missing vdev and we are scanning by
         *    guid.  In that case, we'll ultimately fail to open this consumer,
         *    but not until after setting the private field.
         * The solution is to:
         * 1) Don't set the private field until after the open succeeds, and
         * 2) Set it to a linked list of vdevs, not just a single vdev
         */
        cp->private = vd;
        if (vd != NULL) {
                vd->vdev_tsd = cp;
                vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE);
        }

        cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
        return (cp);
}
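
/*
 * Note on the g_access(cp, 1, 0, 1) calls above: GEOM access counts are
 * deltas against the consumer's (acr, acw, ace) counters, so each
 * successful attach must eventually be balanced by g_access(cp, -1, 0, -1).
 * vdev_geom_detach() below does exactly that when open_for_read is set.
 * Illustrative pairing, as used by the tasting code in this file:
 *
 *	cp = vdev_geom_attach(pp, NULL);	// takes acr=1, ace=1
 *	...read labels through cp...
 *	vdev_geom_detach(cp, B_TRUE);		// drops both references
 */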

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
        struct g_geom *gp;

        g_topology_assert();

        ZFS_LOG(1, "Detaching consumer. Provider %s.",
            cp->provider && cp->provider->name ? cp->provider->name : "NULL");

        cp->private = NULL;

        gp = cp->geom;
        if (open_for_read)
                g_access(cp, -1, 0, -1);
        /* Destroy consumer on last close. */
        if (cp->acr == 0 && cp->ace == 0) {
                if (cp->acw > 0)
                        g_access(cp, 0, -cp->acw, 0);
                if (cp->provider != NULL) {
                        ZFS_LOG(1, "Destroying consumer to %s.",
                            cp->provider->name ? cp->provider->name : "NULL");
                        g_detach(cp);
                }
                g_destroy_consumer(cp);
        }
        /* Destroy geom if there are no consumers left. */
        if (LIST_EMPTY(&gp->consumer)) {
                ZFS_LOG(1, "Destroyed geom %s.", gp->name);
                g_wither_geom(gp, ENXIO);
        }
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
        struct g_consumer *cp;

        g_topology_assert();

        cp = vd->vdev_tsd;
        vd->vdev_tsd = NULL;
        vd->vdev_delayed_close = B_FALSE;
        if (cp == NULL)
                return;

        ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

        vdev_geom_detach(cp, B_TRUE);
}

static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

        (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
        (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
 * operation is described by parallel entries from each array.  There may be
 * more bios actually issued than entries in the array.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
        struct bio **bios;
        u_char *p;
        off_t off, maxio, s, end;
        int i, n_bios, j;
        size_t bios_size;

        maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
        n_bios = 0;

        /* How many bios are required for all commands? */
        for (i = 0; i < ncmds; i++)
                n_bios += (sizes[i] + maxio - 1) / maxio;

        /* Allocate memory for the bios. */
        bios_size = n_bios * sizeof(struct bio *);
        bios = kmem_zalloc(bios_size, KM_SLEEP);

        /* Prepare and issue all of the bios. */
        for (i = j = 0; i < ncmds; i++) {
                off = offsets[i];
                p = datas[i];
                s = sizes[i];
                end = off + s;
                ASSERT((off % cp->provider->sectorsize) == 0);
                ASSERT((s % cp->provider->sectorsize) == 0);

                for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
                        bios[j] = g_alloc_bio();
                        bios[j]->bio_cmd = cmds[i];
                        bios[j]->bio_done = NULL;
                        bios[j]->bio_offset = off;
                        bios[j]->bio_length = MIN(s, maxio);
                        bios[j]->bio_data = p;
                        g_io_request(bios[j], cp);
                }
        }
        ASSERT(j == n_bios);

        /* Wait for all of the bios to complete, and clean them up. */
        for (i = j = 0; i < ncmds; i++) {
                off = offsets[i];
                s = sizes[i];
                end = off + s;

                for (; off < end; off += maxio, s -= maxio, j++) {
                        errors[i] = biowait(bios[j], "vdev_geom_io") ||
                            errors[i];
                        g_destroy_bio(bios[j]);
                }
        }
        kmem_free(bios, bios_size);
}
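
/*
 * Minimal usage sketch for vdev_geom_io() (hypothetical values): read a
 * single sector-aligned 8 KB chunk from offset 0:
 *
 *	int cmd = BIO_READ, err = 0;
 *	off_t off = 0, size = 8192;
 *	void *buf = kmem_alloc(size, KM_SLEEP);
 *
 *	vdev_geom_io(cp, &cmd, &buf, &off, &size, &err, 1);
 *	...
 *	kmem_free(buf, size);
 *
 * vdev_geom_read_config() below is the real caller in this file; it
 * batches all VDEV_LABELS label reads into one call.
 */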

static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
        struct g_provider *pp;
        vdev_phys_t *vdev_lists[VDEV_LABELS];
        char *buf;
        size_t buflen;
        uint64_t psize, state, txg;
        off_t offsets[VDEV_LABELS];
        off_t size;
        off_t sizes[VDEV_LABELS];
        int cmds[VDEV_LABELS];
        int errors[VDEV_LABELS];
        int l;

        g_topology_assert_not();

        pp = cp->provider;
        ZFS_LOG(1, "Reading config from %s...", pp->name);

        psize = pp->mediasize;
        psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

        size = sizeof(*vdev_lists[0]) + pp->sectorsize -
            ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

        buflen = sizeof(vdev_lists[0]->vp_nvlist);

        *config = NULL;
        /* Create all of the IO requests. */
        for (l = 0; l < VDEV_LABELS; l++) {
                cmds[l] = BIO_READ;
                vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
                offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
                sizes[l] = size;
                errors[l] = 0;
                ASSERT(offsets[l] % pp->sectorsize == 0);
        }

        /* Issue the IO requests. */
        vdev_geom_io(cp, cmds, (void **)vdev_lists, offsets, sizes, errors,
            VDEV_LABELS);

        /* Parse the labels. */
        for (l = 0; l < VDEV_LABELS; l++) {
                if (errors[l] != 0)
                        continue;

                buf = vdev_lists[l]->vp_nvlist;

                if (nvlist_unpack(buf, buflen, config, 0) != 0)
                        continue;

                if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
                    &state) != 0 || state > POOL_STATE_L2CACHE) {
                        nvlist_free(*config);
                        *config = NULL;
                        continue;
                }

                if (state != POOL_STATE_SPARE &&
                    state != POOL_STATE_L2CACHE &&
                    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
                    &txg) != 0 || txg == 0)) {
                        nvlist_free(*config);
                        *config = NULL;
                        continue;
                }

                break;
        }

        /* Free the label storage. */
        for (l = 0; l < VDEV_LABELS; l++)
                kmem_free(vdev_lists[l], size);

        return (*config == NULL ? ENOENT : 0);
}
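
/*
 * Worked example for the offsets computed above (assuming the standard
 * on-disk layout: a 256 KB vdev_label_t, VDEV_LABELS = 4 with two labels
 * at each end of the device, and VDEV_SKIP_SIZE = 16 KB of padding ahead
 * of the packed nvlist): for an aligned psize of 1 GB, the four reads
 * land at
 *
 *	l = 0:              0 + 16 KB
 *	l = 1:         256 KB + 16 KB
 *	l = 2: psize - 512 KB + 16 KB
 *	l = 3: psize - 256 KB + 16 KB
 *
 * and `size' is sizeof(vdev_phys_t) rounded up to a whole number of
 * sectors, which is what the arithmetic on pp->sectorsize above computes.
 */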

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
        nvlist_t **new_configs;
        uint64_t i;

        if (id < *count)
                return;
        new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
            KM_SLEEP);
        for (i = 0; i < *count; i++)
                new_configs[i] = (*configs)[i];
        if (*configs != NULL)
                kmem_free(*configs, *count * sizeof(void *));
        *configs = new_configs;
        *count = id + 1;
}

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
        nvlist_t *vdev_tree;
        uint64_t pool_guid;
        uint64_t vdev_guid;
        uint64_t id, txg, known_txg;
        char *pname;

        if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
            strcmp(pname, name) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
                goto ignore;

        if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
                goto ignore;

        if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
                goto ignore;

        VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

        if (*known_pool_guid != 0) {
                if (pool_guid != *known_pool_guid)
                        goto ignore;
        } else
                *known_pool_guid = pool_guid;

        resize_configs(configs, count, id);

        if ((*configs)[id] != NULL) {
                VERIFY(nvlist_lookup_uint64((*configs)[id],
                    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
                if (txg <= known_txg)
                        goto ignore;
                nvlist_free((*configs)[id]);
        }

        (*configs)[id] = cfg;
        return;

ignore:
        nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
        struct g_class *mp;
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *zcp;
        nvlist_t *vdev_cfg;
        uint64_t pool_guid;
        int error;

        DROP_GIANT();
        g_topology_lock();

        *configs = NULL;
        *count = 0;
        pool_guid = 0;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp == &zfs_vdev_class)
                        continue;
                LIST_FOREACH(gp, &mp->geom, geom) {
                        if (gp->flags & G_GEOM_WITHER)
                                continue;
                        LIST_FOREACH(pp, &gp->provider, provider) {
                                if (pp->flags & G_PF_WITHER)
                                        continue;
                                zcp = vdev_geom_attach(pp, NULL);
                                if (zcp == NULL)
                                        continue;
                                g_topology_unlock();
                                error = vdev_geom_read_config(zcp, &vdev_cfg);
                                g_topology_lock();
                                vdev_geom_detach(zcp, B_TRUE);
                                if (error)
                                        continue;
                                ZFS_LOG(1, "successfully read vdev config");

                                process_vdev_config(configs, count,
                                    vdev_cfg, name, &pool_guid);
                        }
                }
        }
        g_topology_unlock();
        PICKUP_GIANT();

        return (*count > 0 ? 0 : ENOENT);
}
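
/*
 * Illustrative caller sketch (hypothetical; the real consumer is the pool
 * import path): gather the best label per top-level vdev for pool "tank",
 * then release everything:
 *
 *	nvlist_t **configs;
 *	uint64_t count, i;
 *
 *	if (vdev_geom_read_pool_label("tank", &configs, &count) == 0) {
 *		for (i = 0; i < count; i++) {
 *			if (configs[i] != NULL)
 *				nvlist_free(configs[i]);
 *		}
 *		kmem_free(configs, count * sizeof(nvlist_t *));
 *	}
 */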

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
        nvlist_t *config;

        g_topology_assert_not();

        *pguid = 0;
        *vguid = 0;
        if (vdev_geom_read_config(cp, &config) == 0) {
                nvlist_get_guids(config, pguid, vguid);
                nvlist_free(config);
        }
}

static boolean_t
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
        uint64_t pool_guid;
        uint64_t vdev_guid;
        struct g_consumer *zcp;

        zcp = vdev_geom_attach(pp, NULL);
        if (zcp == NULL) {
                ZFS_LOG(1, "Unable to attach tasting instance to %s.",
                    pp->name);
                return (B_FALSE);
        }
        g_topology_unlock();
        vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
        g_topology_lock();
        vdev_geom_detach(zcp, B_TRUE);

        /*
         * Check that the label's vdev guid matches the desired guid.  If the
         * label has a pool guid, check that it matches too. (Inactive spares
         * and L2ARCs do not have any pool guid in the label.)
         */
        if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
            vdev_guid == vd->vdev_guid) {
                ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
                return (B_TRUE);
        } else {
                ZFS_LOG(1, "guid mismatch for provider %s: "
                    "%ju:%ju != %ju:%ju.", vd->vdev_path,
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid,
                    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
                return (B_FALSE);
        }
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
        struct g_class *mp;
        struct g_geom *gp;
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        cp = NULL;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp == &zfs_vdev_class)
                        continue;
                LIST_FOREACH(gp, &mp->geom, geom) {
                        if (gp->flags & G_GEOM_WITHER)
                                continue;
                        LIST_FOREACH(pp, &gp->provider, provider) {
                                if (!vdev_attach_ok(vd, pp))
                                        continue;
                                cp = vdev_geom_attach(pp, vd);
                                if (cp == NULL) {
                                        printf("ZFS WARNING: Unable to "
                                            "attach to %s.\n", pp->name);
                                        continue;
                                }
                                break;
                        }
                        if (cp != NULL)
                                break;
                }
                if (cp != NULL)
                        break;
        }
        return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
        struct g_consumer *cp;
        char *buf;
        size_t len;

        g_topology_assert();

        ZFS_LOG(1, "Searching by guids [%ju:%ju].",
                (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
        cp = vdev_geom_attach_by_guids(vd);
        if (cp != NULL) {
                len = strlen(cp->provider->name) + strlen("/dev/") + 1;
                buf = kmem_alloc(len, KM_SLEEP);

                snprintf(buf, len, "/dev/%s", cp->provider->name);
                spa_strfree(vd->vdev_path);
                vd->vdev_path = buf;

                ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid, vd->vdev_path);
        } else {
                ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
                    (uintmax_t)spa_guid(vd->vdev_spa),
                    (uintmax_t)vd->vdev_guid);
        }

        return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
        struct g_provider *pp;
        struct g_consumer *cp;

        g_topology_assert();

        cp = NULL;
        pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
        if (pp != NULL) {
                ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
                if (!check_guid || vdev_attach_ok(vd, pp))
                        cp = vdev_geom_attach(pp, vd);
        }

        return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
        struct g_provider *pp;
        struct g_consumer *cp;
        int error;

        /*
         * We must have a pathname, and it must be absolute.  Check this
         * before setting the TLS key, so an early return cannot leave the
         * key set.
         */
        if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
                vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
                return (EINVAL);
        }

        /* Set the TLS to indicate downstack that we should not access zvols. */
        VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

        vd->vdev_tsd = NULL;

        DROP_GIANT();
        g_topology_lock();
        error = 0;

        if (vd->vdev_spa->spa_splitting_newspa ||
            (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
             vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) {
                /*
                 * We are dealing with a vdev that hasn't been previously
                 * opened (since boot), and we are not loading an
                 * existing pool configuration.  This looks like a
                 * vdev add operation to a new or existing pool.
                 * Assume the user knows what he/she is doing and find
                 * GEOM provider by its name, ignoring GUID mismatches.
                 *
                 * XXPOLICY: It would be safer to only allow a device
                 *           that is unlabeled or labeled but missing
                 *           GUID information to be opened in this fashion,
                 *           unless we are doing a split, in which case we
                 *           should allow any guid.
                 */
                cp = vdev_geom_open_by_path(vd, 0);
        } else {
                /*
                 * Try using the recorded path for this device, but only
                 * accept it if its label data contains the expected GUIDs.
                 */
                cp = vdev_geom_open_by_path(vd, 1);
                if (cp == NULL) {
                        /*
                         * The device at vd->vdev_path doesn't have the
                         * expected GUIDs. The disks might have merely
                         * moved around so try all other GEOM providers
                         * to find one with the right GUIDs.
                         */
                        cp = vdev_geom_open_by_guids(vd);
                }
        }

        /* Clear the TLS now that tasting is done. */
        VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

        if (cp == NULL) {
                ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
                error = ENOENT;
        } else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
            !ISP2(cp->provider->sectorsize)) {
                ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
                    vd->vdev_path);

                vdev_geom_close_locked(vd);
                error = EINVAL;
                cp = NULL;
        } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
                int i;

                for (i = 0; i < 5; i++) {
                        error = g_access(cp, 0, 1, 0);
                        if (error == 0)
                                break;
                        g_topology_unlock();
                        tsleep(vd, 0, "vdev", hz / 2);
                        g_topology_lock();
                }
                if (error != 0) {
                        printf("ZFS WARNING: Unable to open %s for writing "
                            "(error=%d).\n", vd->vdev_path, error);
                        vdev_geom_close_locked(vd);
                        cp = NULL;
                }
        }

        /* Fetch initial physical path information for this device. */
        if (cp != NULL)
                vdev_geom_attrchanged(cp, "GEOM::physpath");

        g_topology_unlock();
        PICKUP_GIANT();
        if (cp == NULL) {
                vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
                return (error);
        }
        pp = cp->provider;

        /*
         * Determine the actual size of the device.
         */
        *max_psize = *psize = pp->mediasize;

        /*
         * Determine the device's minimum transfer size and preferred
         * transfer size.
         */
        *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
        *physical_ashift = 0;
        if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
            pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
                *physical_ashift = highbit(pp->stripesize) - 1;
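
        /*
         * Example: a provider advertising 512-byte sectors and a 4 KB
         * stripesize yields *logical_ashift = 9 and *physical_ashift = 12
         * (highbit(4096) - 1 == 12).
         */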

        /*
         * Clear the nowritecache settings, so that on a vdev_reopen()
         * we will try again.
         */
        vd->vdev_nowritecache = B_FALSE;

        /*
         * Determine the device's rotation rate.
         */
        vdev_geom_set_rotation_rate(vd, cp);

        return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

        DROP_GIANT();
        g_topology_lock();
        vdev_geom_close_locked(vd);
        g_topology_unlock();
        PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
        vdev_t *vd;
        zio_t *zio;

        zio = bp->bio_caller1;
        vd = zio->io_vd;
        zio->io_error = bp->bio_error;
        if (zio->io_error == 0 && bp->bio_resid != 0)
                zio->io_error = SET_ERROR(EIO);

        switch (zio->io_error) {
        case ENOTSUP:
                /*
                 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
                 * that future attempts will never succeed. In this case
                 * we set a persistent flag so that we don't bother with
                 * requests in the future.
                 */
                switch (bp->bio_cmd) {
                case BIO_FLUSH:
                        vd->vdev_nowritecache = B_TRUE;
                        break;
                case BIO_DELETE:
                        vd->vdev_notrim = B_TRUE;
                        break;
                }
                break;
        case ENXIO:
                if (!vd->vdev_remove_wanted) {
                        /*
                         * If the provider's error is set we assume it is
                         * being removed.
                         */
                        if (bp->bio_to->error != 0) {
                                vd->vdev_remove_wanted = B_TRUE;
                                spa_async_request(zio->io_spa,
                                    SPA_ASYNC_REMOVE);
                        } else if (!vd->vdev_delayed_close) {
                                vd->vdev_delayed_close = B_TRUE;
                        }
                }
                break;
        }
        g_destroy_bio(bp);
        zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
        vdev_t *vd;
        struct g_consumer *cp;
        struct bio *bp;

        vd = zio->io_vd;

        switch (zio->io_type) {
        case ZIO_TYPE_IOCTL:
                /* XXPOLICY */
                if (!vdev_readable(vd)) {
                        zio->io_error = SET_ERROR(ENXIO);
                        zio_interrupt(zio);
                        return;
                } else {
                        switch (zio->io_cmd) {
                        case DKIOCFLUSHWRITECACHE:
                                if (zfs_nocacheflush ||
                                    vdev_geom_bio_flush_disable)
                                        break;
                                if (vd->vdev_nowritecache) {
                                        zio->io_error = SET_ERROR(ENOTSUP);
                                        break;
                                }
                                goto sendreq;
                        default:
                                zio->io_error = SET_ERROR(ENOTSUP);
                        }
                }

                zio_execute(zio);
                return;
        case ZIO_TYPE_FREE:
                if (vd->vdev_notrim) {
                        zio->io_error = SET_ERROR(ENOTSUP);
                } else if (!vdev_geom_bio_delete_disable) {
                        goto sendreq;
                }
                zio_execute(zio);
                return;
        }
sendreq:
        ASSERT(zio->io_type == ZIO_TYPE_READ ||
            zio->io_type == ZIO_TYPE_WRITE ||
            zio->io_type == ZIO_TYPE_FREE ||
            zio->io_type == ZIO_TYPE_IOCTL);

        cp = vd->vdev_tsd;
        if (cp == NULL) {
                zio->io_error = SET_ERROR(ENXIO);
                zio_interrupt(zio);
                return;
        }
        bp = g_alloc_bio();
        bp->bio_caller1 = zio;
        switch (zio->io_type) {
        case ZIO_TYPE_READ:
        case ZIO_TYPE_WRITE:
                zio->io_target_timestamp = zio_handle_io_delay(zio);
                bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
                    BIO_READ : BIO_WRITE;
                bp->bio_data = zio->io_data;
                bp->bio_offset = zio->io_offset;
                bp->bio_length = zio->io_size;
                break;
        case ZIO_TYPE_FREE:
                bp->bio_cmd = BIO_DELETE;
                bp->bio_data = NULL;
                bp->bio_offset = zio->io_offset;
                bp->bio_length = zio->io_size;
                break;
        case ZIO_TYPE_IOCTL:
                bp->bio_cmd = BIO_FLUSH;
                bp->bio_flags |= BIO_ORDERED;
                bp->bio_data = NULL;
                bp->bio_offset = cp->provider->mediasize;
                bp->bio_length = 0;
                break;
        }
        bp->bio_done = vdev_geom_io_intr;

        g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
        vdev_geom_open,
        vdev_geom_close,
        vdev_default_asize,
        vdev_geom_io_start,
        vdev_geom_io_done,
        NULL,                   /* vdev_op_state_change */
        vdev_geom_hold,
        vdev_geom_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};
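
/*
 * Dispatch sketch (simplified): the generic vdev code never calls the
 * functions above directly; it indirects through this ops vector, e.g.
 * vdev_open() does, roughly,
 *
 *	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 *	    &logical_ashift, &physical_ashift);
 *
 * Registering the table under VDEV_TYPE_DISK is what routes disk vdevs
 * on FreeBSD to GEOM; the NULL slot is the state-change hook, which this
 * backend does not implement.
 */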