/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

/*
 * Don't send BIO_FLUSH.
 */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");

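/*
 * Illustrative usage (not from the original source): the knob can be
 * preset as a loader(8) tunable or, since the sysctl is CTLFLAG_RW,
 * changed at runtime:
 *
 *	vfs.zfs.vdev.bio_flush_disable=1	(/boot/loader.conf)
 *	# sysctl vfs.zfs.vdev.bio_flush_disable=1
 */
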
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	zfs_post_remove(vd->vdev_spa, vd);
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

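/*
 * Attach a consumer to the given provider, reusing the shared
 * "zfs::vdev" geom and any consumer already connected to that provider.
 * On success the consumer holds read and exclusive access (r1w0e1);
 * returns NULL on failure.  Caller must hold the GEOM topology lock.
 */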
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);
	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		cp = g_new_consumer(gp);
		if (g_attach(cp, pp) != 0) {
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		if (g_access(cp, 1, 0, 1) != 0) {
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			if (g_attach(cp, pp) != 0) {
				g_destroy_consumer(cp);
				return (NULL);
			}
			if (g_access(cp, 1, 0, 1) != 0) {
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			if (g_access(cp, 1, 0, 1) != 0)
				return (NULL);
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}
	return (cp);
}

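/*
 * Undo one vdev_geom_attach() reference: drop read/exclusive access and,
 * on last close, destroy the consumer and (if no consumers remain) the
 * geom.  Runs with the topology lock held; vdev_geom_close() schedules
 * it as a GEOM event.
 */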
static void
vdev_geom_detach(void *arg, int flag __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();
	cp = arg;
	gp = cp->geom;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		g_detach(cp);
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

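/*
 * Return the value of the top-level "guid" member of an unpacked label
 * nvlist, or 0 if no such uint64 pair is present.
 */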
static uint64_t
nvlist_get_guid(nvlist_t *list)
{
	nvpair_t *elem = NULL;
	uint64_t value;

	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
		if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
		    strcmp(nvpair_name(elem), "guid") == 0) {
			VERIFY(nvpair_value_uint64(elem, &value) == 0);
			return (value);
		}
	}
	return (0);
}

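/*
 * Synchronous I/O helper: issue the transfer in MAXPHYS-sized,
 * sector-aligned chunks and wait for each bio to complete.
 */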
static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off, maxio;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;
	p = data;
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	error = 0;

	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, maxio);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}

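/*
 * Read the vdev labels from the provider and return the guid they
 * contain, or 0 if no label can be read and unpacked.  Must be called
 * without the topology lock, as the label reads sleep in biowait().
 */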
static uint64_t
vdev_geom_read_guid(struct g_consumer *cp)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t guid;
	int l;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading guid from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	guid = 0;
	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	for (l = 0; l < VDEV_LABELS; l++) {
		nvlist_t *config = NULL;

		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		guid = nvlist_get_guid(config);
		nvlist_free(config);
		if (guid != 0)
			break;
	}

	kmem_free(label, size);
	ZFS_LOG(1, "guid for %s is %ju", pp->name, (uintmax_t)guid);
	return (guid);
}

static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

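/*
 * Taste every provider of every GEOM class other than our own using a
 * temporary consumer, and attach to the first provider whose label guid
 * matches.  Returns the attached consumer, or NULL if none matches.
 */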
static struct g_consumer *
vdev_geom_attach_by_guid(uint64_t guid)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *cp, *zcp;
	uint64_t pguid;

	g_topology_assert();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				g_attach(zcp, pp);
				if (g_access(zcp, 1, 0, 0) != 0) {
					g_detach(zcp);
					continue;
				}
				g_topology_unlock();
				pguid = vdev_geom_read_guid(zcp);
				g_topology_lock();
				g_access(zcp, -1, 0, 0);
				g_detach(zcp);
				if (pguid != guid)
					continue;
				cp = vdev_geom_attach(pp);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to attach to %s.\n",
					    pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	return (cp);
}

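/*
 * Open the vdev by searching all providers for a label with a matching
 * guid, updating vd->vdev_path to point at the provider actually found.
 */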
static struct g_consumer *
vdev_geom_open_by_guid(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guid(vd->vdev_guid);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju] succeeded, provider %s.",
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju] failed.",
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

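/*
 * Open the vdev by its /dev path.  When check_guid is set, read the
 * label and reject the provider on a guid mismatch.
 */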
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	uint64_t guid;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		cp = vdev_geom_attach(pp);
		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
		    pp->sectorsize <= VDEV_PAD_SIZE) {
			g_topology_unlock();
			guid = vdev_geom_read_guid(cp);
			g_topology_lock();
			if (guid != vd->vdev_guid) {
				vdev_geom_detach(cp, 0);
				cp = NULL;
				ZFS_LOG(1, "guid mismatch for provider %s: "
				    "%ju != %ju.", vd->vdev_path,
				    (uintmax_t)vd->vdev_guid, (uintmax_t)guid);
			} else {
				ZFS_LOG(1, "guid match for provider %s.",
				    vd->vdev_path);
			}
		}
	}

	return (cp);
}

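/*
 * Open entry point: locate a provider by path (or, after a guid
 * mismatch, by guid), obtain write access when the pool is opened
 * read-write, and report the media size and minimum transfer size
 * (ashift) back to ZFS.
 */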
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	size_t bufsize;
	int error;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	/*
	 * If we're creating or splitting a pool, just find the GEOM provider
	 * by its name and ignore GUID mismatches.
	 */
	if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	    vd->vdev_spa->spa_splitting_newspa == B_TRUE)
		cp = vdev_geom_open_by_path(vd, 0);
	else {
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected guid.  The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right guid.
			 */
			cp = vdev_geom_open_by_guid(vd);
		}
	}

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);
		vdev_geom_detach(cp, 0);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
			    vd->vdev_path, error);
			vdev_geom_detach(cp, 0);
			cp = NULL;
		}
	}

	g_topology_unlock();
	PICKUP_GIANT();

	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	cp->private = vd;
	vd->vdev_tsd = cp;
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size.
	 */
	*ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	if (vd->vdev_physpath != NULL)
		spa_strfree(vd->vdev_physpath);
	bufsize = sizeof("/dev/") + strlen(pp->name);
	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);

	return (0);
}

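/*
 * Close entry point.  The detach itself requires the topology lock, so
 * it is handed to the GEOM event thread via g_post_event().
 */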
static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;

	cp = vd->vdev_tsd;
	if (cp == NULL)
		return;
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
}

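/*
 * bio completion callback: propagate bio status into the zio, latch
 * vdev_nowritecache on ENOTSUP from BIO_FLUSH, and request asynchronous
 * removal when the provider itself reports an error.
 */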
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = EIO;
	if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
		/*
		 * If we get ENOTSUP, we know that no future
		 * attempts will ever succeed.  In this case we
		 * set a persistent bit so that we don't bother
		 * with the ioctl in the future.
		 */
		vd->vdev_nowritecache = B_TRUE;
	}
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		/*
		 * If provider's error is set we assume it is being
		 * removed.
		 */
		if (bp->bio_to->error != 0) {
			/*
			 * We post the resource as soon as possible, instead of
			 * when the async removal actually happens, because the
			 * DE is using this information to discard previous I/O
			 * errors.
			 */
			/* XXX: zfs_post_remove() can sleep. */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
	g_destroy_bio(bp);
	zio_interrupt(zio);
}

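/*
 * I/O entry point: translate the zio into a struct bio and hand it to
 * GEOM.  DKIOCFLUSHWRITECACHE ioctls become BIO_FLUSH requests unless
 * cache flushing is administratively disabled or known unsupported.
 */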
static int
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = ENXIO;
			return (ZIO_PIPELINE_CONTINUE);
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:
			if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
				break;
			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}
			goto sendreq;
		default:
			zio->io_error = ENOTSUP;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}
sendreq:
	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = ENXIO;
		return (ZIO_PIPELINE_CONTINUE);
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);

	return (ZIO_PIPELINE_STOP);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,			/* vdev_op_state_change */
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};