sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  23  * All rights reserved.
  24  *
  25  * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/param.h>
  30 #include <sys/kernel.h>
  31 #include <sys/bio.h>
  32 #include <sys/disk.h>
  33 #include <sys/spa.h>
  34 #include <sys/spa_impl.h>
  35 #include <sys/vdev_impl.h>
  36 #include <sys/fs/zfs.h>
  37 #include <sys/zio.h>
  38 #include <geom/geom.h>
  39 #include <geom/geom_int.h>
  40
  41 /*
  42  * Virtual device vector for GEOM.
  43  */
  44
  45 struct g_class zfs_vdev_class = {
  46         .name = "ZFS::VDEV",
  47         .version = G_VERSION,
  48 };
  49
  50 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
  51
  52 /*
  53  * Don't send BIO_FLUSH.
  54  */
  55 static int vdev_geom_bio_flush_disable = 0;
  56 TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
  57 SYSCTL_DECL(_vfs_zfs_vdev);
  58 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
  59     &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
  60
  61 static void
  62 vdev_geom_orphan(struct g_consumer *cp)
  63 {
  64         vdev_t *vd;
  65
  66         g_topology_assert();
  67
  68         vd = cp->private;
  69
  70         /*
  71          * Orphan callbacks occur from the GEOM event thread.
  72          * Concurrent with this call, new I/O requests may be
  73          * working their way through GEOM about to find out
  74          * (only once executed by the g_down thread) that we've
  75          * been orphaned from our disk provider.  These I/Os
  76          * must be retired before we can detach our consumer.
  77          * This is most easily achieved by acquiring the
  78          * SPA ZIO configuration lock as a writer, but doing
  79          * so with the GEOM topology lock held would cause
  80          * a lock order reversal.  Instead, rely on the SPA's
  81          * async removal support to invoke a close on this
  82          * vdev once it is safe to do so.
  83          */
  84         zfs_post_remove(vd->vdev_spa, vd);
  85         vd->vdev_remove_wanted = B_TRUE;
  86         spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
  87 }
  88
  89 static struct g_consumer *
  90 vdev_geom_attach(struct g_provider *pp)
  91 {
  92         struct g_geom *gp;
  93         struct g_consumer *cp;
  94
  95         g_topology_assert();
  96
  97         ZFS_LOG(1, "Attaching to %s.", pp->name);
  98         /* Do we have geom already? No? Create one. */
  99         LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
 100                 if (gp->flags & G_GEOM_WITHER)
 101                         continue;
 102                 if (strcmp(gp->name, "zfs::vdev") != 0)
 103                         continue;
 104                 break;
 105         }
 106         if (gp == NULL) {
 107                 gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
 108                 gp->orphan = vdev_geom_orphan;
 109                 cp = g_new_consumer(gp);
 110                 if (g_attach(cp, pp) != 0) {
 111                         g_wither_geom(gp, ENXIO);
 112                         return (NULL);
 113                 }
 114                 if (g_access(cp, 1, 0, 1) != 0) {
 115                         g_wither_geom(gp, ENXIO);
 116                         return (NULL);
 117                 }
 118                 ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
 119         } else {
 120                 /* Check if we are already connected to this provider. */
 121                 LIST_FOREACH(cp, &gp->consumer, consumer) {
 122                         if (cp->provider == pp) {
 123                                 ZFS_LOG(1, "Found consumer for %s.", pp->name);
 124                                 break;
 125                         }
 126                 }
 127                 if (cp == NULL) {
 128                         cp = g_new_consumer(gp);
 129                         if (g_attach(cp, pp) != 0) {
 130                                 g_destroy_consumer(cp);
 131                                 return (NULL);
 132                         }
 133                         if (g_access(cp, 1, 0, 1) != 0) {
 134                                 g_detach(cp);
 135                                 g_destroy_consumer(cp);
 136                                 return (NULL);
 137                         }
 138                         ZFS_LOG(1, "Created consumer for %s.", pp->name);
 139                 } else {
 140                         if (g_access(cp, 1, 0, 1) != 0)
 141                                 return (NULL);
 142                         ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
 143                 }
 144         }
 145         return (cp);
 146 }
 147
 148 static void
 149 vdev_geom_detach(void *arg, int flag __unused)
 150 {
 151         struct g_geom *gp;
 152         struct g_consumer *cp;
 153
 154         g_topology_assert();
 155         cp = arg;
 156         gp = cp->geom;
 157
 158         ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
 159         g_access(cp, -1, 0, -1);
 160         /* Destroy consumer on last close. */
 161         if (cp->acr == 0 && cp->ace == 0) {
 162                 ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
 163                 if (cp->acw > 0)
 164                         g_access(cp, 0, -cp->acw, 0);
 165                 g_detach(cp);
 166                 g_destroy_consumer(cp);
 167         }
 168         /* Destroy geom if there are no consumers left. */
 169         if (LIST_EMPTY(&gp->consumer)) {
 170                 ZFS_LOG(1, "Destroyed geom %s.", gp->name);
 171                 g_wither_geom(gp, ENXIO);
 172         }
 173 }
 174
 175 static uint64_t
 176 nvlist_get_guid(nvlist_t *list)
 177 {
 178         nvpair_t *elem = NULL;
 179         uint64_t value;
 180
 181         while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
 182                 if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
 183                     strcmp(nvpair_name(elem), "guid") == 0) {
 184                         VERIFY(nvpair_value_uint64(elem, &value) == 0);
 185                         return (value);
 186                 }
 187         }
 188         return (0);
 189 }
 190
 191 static int
 192 vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
 193 {
 194         struct bio *bp;
 195         u_char *p;
 196         off_t off, maxio;
 197         int error;
 198
 199         ASSERT((offset % cp->provider->sectorsize) == 0);
 200         ASSERT((size % cp->provider->sectorsize) == 0);
 201
 202         bp = g_alloc_bio();
 203         off = offset;
 204         offset += size;
 205         p = data;
 206         maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
 207         error = 0;
 208
 209         for (; off < offset; off += maxio, p += maxio, size -= maxio) {
 210                 bzero(bp, sizeof(*bp));
 211                 bp->bio_cmd = cmd;
 212                 bp->bio_done = NULL;
 213                 bp->bio_offset = off;
 214                 bp->bio_length = MIN(size, maxio);
 215                 bp->bio_data = p;
 216                 g_io_request(bp, cp);
 217                 error = biowait(bp, "vdev_geom_io");
 218                 if (error != 0)
 219                         break;
 220         }
 221
 222         g_destroy_bio(bp);
 223         return (error);
 224 }
 225
 226 static uint64_t
 227 vdev_geom_read_guid(struct g_consumer *cp)
 228 {
 229         struct g_provider *pp;
 230         vdev_label_t *label;
 231         char *p, *buf;
 232         size_t buflen;
 233         uint64_t psize;
 234         off_t offset, size;
 235         uint64_t guid;
 236         int error, l, len;
 237
 238         g_topology_assert_not();
 239
 240         pp = cp->provider;
 241         ZFS_LOG(1, "Reading guid from %s...", pp->name);
 242
 243         psize = pp->mediasize;
 244         psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
 245
 246         size = sizeof(*label) + pp->sectorsize -
 247             ((sizeof(*label) - 1) % pp->sectorsize) - 1;
 248
 249         guid = 0;
 250         label = kmem_alloc(size, KM_SLEEP);
 251         buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
 252
 253         for (l = 0; l < VDEV_LABELS; l++) {
 254                 nvlist_t *config = NULL;
 255
 256                 offset = vdev_label_offset(psize, l, 0);
 257                 if ((offset % pp->sectorsize) != 0)
 258                         continue;
 259
 260                 if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
 261                         continue;
 262                 buf = label->vl_vdev_phys.vp_nvlist;
 263
 264                 if (nvlist_unpack(buf, buflen, &config, 0) != 0)
 265                         continue;
 266
 267                 guid = nvlist_get_guid(config);
 268                 nvlist_free(config);
 269                 if (guid != 0)
 270                         break;
 271         }
 272
 273         kmem_free(label, size);
 274         if (guid != 0)
 275                 ZFS_LOG(1, "guid for %s is %ju", pp->name, (uintmax_t)guid);
 276         return (guid);
 277 }
 278
 279 static void
 280 vdev_geom_taste_orphan(struct g_consumer *cp)
 281 {
 282
 283         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 284             cp->provider->name));
 285 }
 286
 287 static struct g_consumer *
 288 vdev_geom_attach_by_guid(uint64_t guid)
 289 {
 290         struct g_class *mp;
 291         struct g_geom *gp, *zgp;
 292         struct g_provider *pp;
 293         struct g_consumer *cp, *zcp;
 294         uint64_t pguid;
 295
 296         g_topology_assert();
 297
 298         zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
 299         /* This orphan function should be never called. */
 300         zgp->orphan = vdev_geom_taste_orphan;
 301         zcp = g_new_consumer(zgp);
 302
 303         cp = NULL;
 304         LIST_FOREACH(mp, &g_classes, class) {
 305                 if (mp == &zfs_vdev_class)
 306                         continue;
 307                 LIST_FOREACH(gp, &mp->geom, geom) {
 308                         if (gp->flags & G_GEOM_WITHER)
 309                                 continue;
 310                         LIST_FOREACH(pp, &gp->provider, provider) {
 311                                 if (pp->flags & G_PF_WITHER)
 312                                         continue;
 313                                 g_attach(zcp, pp);
 314                                 if (g_access(zcp, 1, 0, 0) != 0) {
 315                                         g_detach(zcp);
 316                                         continue;
 317                                 }
 318                                 g_topology_unlock();
 319                                 pguid = vdev_geom_read_guid(zcp);
 320                                 g_topology_lock();
 321                                 g_access(zcp, -1, 0, 0);
 322                                 g_detach(zcp);
 323                                 if (pguid != guid)
 324                                         continue;
 325                                 cp = vdev_geom_attach(pp);
 326                                 if (cp == NULL) {
 327                                         printf("ZFS WARNING: Unable to attach to %s.\n",
 328                                             pp->name);
 329                                         continue;
 330                                 }
 331                                 break;
 332                         }
 333                         if (cp != NULL)
 334                                 break;
 335                 }
 336                 if (cp != NULL)
 337                         break;
 338         }
 339 end:
 340         g_destroy_consumer(zcp);
 341         g_destroy_geom(zgp);
 342         return (cp);
 343 }
 344
 345 static struct g_consumer *
 346 vdev_geom_open_by_guid(vdev_t *vd)
 347 {
 348         struct g_consumer *cp;
 349         char *buf;
 350         size_t len;
 351
 352         g_topology_assert();
 353
 354         ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
 355         cp = vdev_geom_attach_by_guid(vd->vdev_guid);
 356         if (cp != NULL) {
 357                 len = strlen(cp->provider->name) + strlen("/dev/") + 1;
 358                 buf = kmem_alloc(len, KM_SLEEP);
 359
 360                 snprintf(buf, len, "/dev/%s", cp->provider->name);
 361                 spa_strfree(vd->vdev_path);
 362                 vd->vdev_path = buf;
 363
 364                 ZFS_LOG(1, "Attach by guid [%ju] succeeded, provider %s.",
 365                     (uintmax_t)vd->vdev_guid, vd->vdev_path);
 366         } else {
 367                 ZFS_LOG(1, "Search by guid [%ju] failed.",
 368                     (uintmax_t)vd->vdev_guid);
 369         }
 370
 371         return (cp);
 372 }
 373
 374 static struct g_consumer *
 375 vdev_geom_open_by_path(vdev_t *vd, int check_guid)
 376 {
 377         struct g_provider *pp;
 378         struct g_consumer *cp;
 379         uint64_t guid;
 380
 381         g_topology_assert();
 382
 383         cp = NULL;
 384         pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
 385         if (pp != NULL) {
 386                 ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
 387                 cp = vdev_geom_attach(pp);
 388                 if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
 389                     pp->sectorsize <= VDEV_PAD_SIZE) {
 390                         g_topology_unlock();
 391                         guid = vdev_geom_read_guid(cp);
 392                         g_topology_lock();
 393                         if (guid != vd->vdev_guid) {
 394                                 vdev_geom_detach(cp, 0);
 395                                 cp = NULL;
 396                                 ZFS_LOG(1, "guid mismatch for provider %s: "
 397                                     "%ju != %ju.", vd->vdev_path,
 398                                     (uintmax_t)vd->vdev_guid, (uintmax_t)guid);
 399                         } else {
 400                                 ZFS_LOG(1, "guid match for provider %s.",
 401                                     vd->vdev_path);
 402                         }
 403                 }
 404         }
 405
 406         return (cp);
 407 }
 408
 409 static int
 410 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
 411     uint64_t *ashift)
 412 {
 413         struct g_provider *pp;
 414         struct g_consumer *cp;
 415         size_t bufsize;
 416         int error;
 417
 418         /*
 419          * We must have a pathname, and it must be absolute.
 420          */
 421         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 422                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 423                 return (EINVAL);
 424         }
 425
 426         vd->vdev_tsd = NULL;
 427
 428         DROP_GIANT();
 429         g_topology_lock();
 430         error = 0;
 431
 432         /*
 433          * If we're creating or splitting a pool, just find the GEOM provider
 434          * by its name and ignore GUID mismatches.
 435          */
 436         if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
 437             vd->vdev_spa->spa_splitting_newspa == B_TRUE)
 438                 cp = vdev_geom_open_by_path(vd, 0);
 439         else {
 440                 cp = vdev_geom_open_by_path(vd, 1);
 441                 if (cp == NULL) {
 442                         /*
 443                          * The device at vd->vdev_path doesn't have the
 444                          * expected guid. The disks might have merely
 445                          * moved around so try all other GEOM providers
 446                          * to find one with the right guid.
 447                          */
 448                         cp = vdev_geom_open_by_guid(vd);
 449                 }
 450         }
 451
 452         if (cp == NULL) {
 453                 ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
 454                 error = ENOENT;
 455         } else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
 456             !ISP2(cp->provider->sectorsize)) {
 457                 ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
 458                     vd->vdev_path);
 459                 vdev_geom_detach(cp, 0);
 460                 error = EINVAL;
 461                 cp = NULL;
 462         } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
 463                 int i;
 464
 465                 for (i = 0; i < 5; i++) {
 466                         error = g_access(cp, 0, 1, 0);
 467                         if (error == 0)
 468                                 break;
 469                         g_topology_unlock();
 470                         tsleep(vd, 0, "vdev", hz / 2);
 471                         g_topology_lock();
 472                 }
 473                 if (error != 0) {
 474                         printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
 475                             vd->vdev_path, error);
 476                         vdev_geom_detach(cp, 0);
 477                         cp = NULL;
 478                 }
 479         }
 480         g_topology_unlock();
 481         PICKUP_GIANT();
 482         if (cp == NULL) {
 483                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 484                 return (error);
 485         }
 486
 487         cp->private = vd;
 488         vd->vdev_tsd = cp;
 489         pp = cp->provider;
 490
 491         /*
 492          * Determine the actual size of the device.
 493          */
 494         *max_psize = *psize = pp->mediasize;
 495
 496         /*
 497          * Determine the device's minimum transfer size.
 498          */
 499         *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
 500
 501         /*
 502          * Clear the nowritecache bit, so that on a vdev_reopen() we will
 503          * try again.
 504          */
 505         vd->vdev_nowritecache = B_FALSE;
 506
 507         if (vd->vdev_physpath != NULL)
 508                 spa_strfree(vd->vdev_physpath);
 509         bufsize = sizeof("/dev/") + strlen(pp->name);
 510         vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
 511         snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
 512
 513         return (0);
 514 }
 515
 516 static void
 517 vdev_geom_close(vdev_t *vd)
 518 {
 519         struct g_consumer *cp;
 520
 521         cp = vd->vdev_tsd;
 522         if (cp == NULL)
 523                 return;
 524         vd->vdev_tsd = NULL;
 525         vd->vdev_delayed_close = B_FALSE;
 526         g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
 527 }
 528
 529 static void
 530 vdev_geom_io_intr(struct bio *bp)
 531 {
 532         vdev_t *vd;
 533         zio_t *zio;
 534
 535         zio = bp->bio_caller1;
 536         vd = zio->io_vd;
 537         zio->io_error = bp->bio_error;
 538         if (zio->io_error == 0 && bp->bio_resid != 0)
 539                 zio->io_error = EIO;
 540         if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
 541                 /*
 542                  * If we get ENOTSUP, we know that no future
 543                  * attempts will ever succeed.  In this case we
 544                  * set a persistent bit so that we don't bother
 545                  * with the ioctl in the future.
 546                  */
 547                 vd->vdev_nowritecache = B_TRUE;
 548         }
 549         if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
 550                 /*
 551                  * If provider's error is set we assume it is being
 552                  * removed.
 553                  */
 554                 if (bp->bio_to->error != 0) {
 555                         /*
 556                          * We post the resource as soon as possible, instead of
 557                          * when the async removal actually happens, because the
 558                          * DE is using this information to discard previous I/O
 559                          * errors.
 560                          */
 561                         /* XXX: zfs_post_remove() can sleep. */
 562                         zfs_post_remove(zio->io_spa, vd);
 563                         vd->vdev_remove_wanted = B_TRUE;
 564                         spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 565                 } else if (!vd->vdev_delayed_close) {
 566                         vd->vdev_delayed_close = B_TRUE;
 567                 }
 568         }
 569         g_destroy_bio(bp);
 570         zio_interrupt(zio);
 571 }
 572
 573 static int
 574 vdev_geom_io_start(zio_t *zio)
 575 {
 576         vdev_t *vd;
 577         struct g_consumer *cp;
 578         struct bio *bp;
 579         int error;
 580
 581         vd = zio->io_vd;
 582
 583         if (zio->io_type == ZIO_TYPE_IOCTL) {
 584                 /* XXPOLICY */
 585                 if (!vdev_readable(vd)) {
 586                         zio->io_error = ENXIO;
 587                         return (ZIO_PIPELINE_CONTINUE);
 588                 }
 589
 590                 switch (zio->io_cmd) {
 591
 592                 case DKIOCFLUSHWRITECACHE:
 593
 594                         if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
 595                                 break;
 596
 597                         if (vd->vdev_nowritecache) {
 598                                 zio->io_error = ENOTSUP;
 599                                 break;
 600                         }
 601
 602                         goto sendreq;
 603                 default:
 604                         zio->io_error = ENOTSUP;
 605                 }
 606
 607                 return (ZIO_PIPELINE_CONTINUE);
 608         }
 609 sendreq:
 610         cp = vd->vdev_tsd;
 611         if (cp == NULL) {
 612                 zio->io_error = ENXIO;
 613                 return (ZIO_PIPELINE_CONTINUE);
 614         }
 615         bp = g_alloc_bio();
 616         bp->bio_caller1 = zio;
 617         switch (zio->io_type) {
 618         case ZIO_TYPE_READ:
 619         case ZIO_TYPE_WRITE:
 620                 bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
 621                 bp->bio_data = zio->io_data;
 622                 bp->bio_offset = zio->io_offset;
 623                 bp->bio_length = zio->io_size;
 624                 break;
 625         case ZIO_TYPE_IOCTL:
 626                 bp->bio_cmd = BIO_FLUSH;
 627                 bp->bio_flags |= BIO_ORDERED;
 628                 bp->bio_data = NULL;
 629                 bp->bio_offset = cp->provider->mediasize;
 630                 bp->bio_length = 0;
 631                 break;
 632         }
 633         bp->bio_done = vdev_geom_io_intr;
 634
 635         g_io_request(bp, cp);
 636
 637         return (ZIO_PIPELINE_STOP);
 638 }
 639
 640 static void
 641 vdev_geom_io_done(zio_t *zio)
 642 {
 643 }
 644
 645 static void
 646 vdev_geom_hold(vdev_t *vd)
 647 {
 648 }
 649
 650 static void
 651 vdev_geom_rele(vdev_t *vd)
 652 {
 653 }
 654
 655 vdev_ops_t vdev_geom_ops = {
 656         vdev_geom_open,
 657         vdev_geom_close,
 658         vdev_default_asize,
 659         vdev_geom_io_start,
 660         vdev_geom_io_done,
 661         NULL,
 662         vdev_geom_hold,
 663         vdev_geom_rele,
 664         VDEV_TYPE_DISK,         /* name of this vdev type */
 665         B_TRUE                  /* leaf vdev */
 666 };