/*-
 * Copyright (C) 2012 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"

static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
                                                struct nvme_async_event_request *aer);

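/*
 * Generic completion callback for the synchronous admin commands issued in
 *  this file: copy the completion entry into the caller-supplied buffer and
 *  wake up the thread sleeping on it.
 */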
static void
nvme_ctrlr_cb(void *arg, const struct nvme_completion *status)
{
        struct nvme_completion  *cpl = arg;
        struct mtx              *mtx;

        /*
         * Copy status into the argument passed by the caller, so that
         *  the caller can check the status to determine if the
         *  request passed or failed.
         */
        memcpy(cpl, status, sizeof(*cpl));
        mtx = mtx_pool_find(mtxpool_sleep, cpl);
        mtx_lock(mtx);
        wakeup(cpl);
        mtx_unlock(mtx);
}

static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

        /* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                ctrlr->resource_id = PCIR_BAR(2);
        else
                ctrlr->resource_id = PCIR_BAR(0);

        ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
            &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);

        if (ctrlr->resource == NULL) {
                device_printf(ctrlr->dev, "unable to allocate pci resource\n");
                return (ENOMEM);
        }

        ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
        ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
        ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

        /*
         * The NVMe spec allows for the MSI-X table to be placed behind
         *  BAR 4/5, separate from the control/doorbell registers.  Always
         *  try to map this BAR, because it must be mapped prior to calling
         *  pci_alloc_msix().  If the table isn't behind BAR 4/5,
         *  bus_alloc_resource() will just return NULL, which is OK.
         */
        ctrlr->bar4_resource_id = PCIR_BAR(4);
        ctrlr->bar4_resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
            &ctrlr->bar4_resource_id, 0, ~0, 1, RF_ACTIVE);

        return (0);
}

#ifdef CHATHAM2
static int
nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
{

        ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
        ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
            SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
            RF_ACTIVE);

        if (ctrlr->chatham_resource == NULL) {
                device_printf(ctrlr->dev, "unable to alloc pci resource\n");
                return (ENOMEM);
        }

        ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
        ctrlr->chatham_bus_handle =
            rman_get_bushandle(ctrlr->chatham_resource);

        return (0);
}

static void
nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
{
        uint64_t reg1, reg2, reg3;
        uint64_t temp1, temp2;
        uint32_t temp3;
        uint32_t use_flash_timings = 0;

        DELAY(10000);

        temp3 = chatham_read_4(ctrlr, 0x8080);

        device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);

        ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
        ctrlr->chatham_size = ctrlr->chatham_lbas * 512;

        device_printf(ctrlr->dev, "Chatham size: %jd\n",
            (intmax_t)ctrlr->chatham_size);

        reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;

        TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
        if (use_flash_timings) {
                device_printf(ctrlr->dev, "Chatham: using flash timings\n");
                temp1 = 0x00001b58000007d0LL;
                temp2 = 0x000000cb00000131LL;
        } else {
                device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
                temp1 = temp2 = 0x0LL;
        }

        chatham_write_8(ctrlr, 0x8000, reg1);
        chatham_write_8(ctrlr, 0x8008, reg2);
        chatham_write_8(ctrlr, 0x8010, reg3);

        chatham_write_8(ctrlr, 0x8020, temp1);
        temp3 = chatham_read_4(ctrlr, 0x8020);

        chatham_write_8(ctrlr, 0x8028, temp2);
        temp3 = chatham_read_4(ctrlr, 0x8028);

        chatham_write_8(ctrlr, 0x8030, temp1);
        chatham_write_8(ctrlr, 0x8038, temp2);
        chatham_write_8(ctrlr, 0x8040, temp1);
        chatham_write_8(ctrlr, 0x8048, temp2);
        chatham_write_8(ctrlr, 0x8050, temp1);
        chatham_write_8(ctrlr, 0x8058, temp2);

        DELAY(10000);
}

static void
nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
{
        struct nvme_controller_data *cdata;

        cdata = &ctrlr->cdata;

        cdata->vid = 0x8086;
        cdata->ssvid = 0x2011;

        /*
         * Chatham2 puts garbage data in these fields when we
         *  invoke IDENTIFY_CONTROLLER, so we need to re-zero
         *  the fields before copying in the values below.
         */
        memset(cdata->sn, 0, sizeof(cdata->sn));
        memcpy(cdata->sn, "2012", strlen("2012"));
        memset(cdata->mn, 0, sizeof(cdata->mn));
        memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
        memset(cdata->fr, 0, sizeof(cdata->fr));
        memcpy(cdata->fr, "0", strlen("0"));
        cdata->rab = 8;
        cdata->aerl = 3;
        cdata->lpa.ns_smart = 1;
        cdata->sqes.min = 6;
        cdata->sqes.max = 6;
        cdata->cqes.min = 4;
        cdata->cqes.max = 4;
        cdata->nn = 1;

        /* Chatham2 doesn't support the DSM command. */
        cdata->oncs.dsm = 0;

        cdata->vwc.present = 1;
}
#endif /* CHATHAM2 */

static void
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
        struct nvme_qpair       *qpair;
        uint32_t                num_entries;

        qpair = &ctrlr->adminq;

        num_entries = NVME_ADMIN_ENTRIES;
        TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
        /*
         * If admin_entries was overridden to an invalid value, revert it
         *  back to our default value.
         */
        if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
            num_entries > NVME_MAX_ADMIN_ENTRIES) {
                printf("nvme: invalid hw.nvme.admin_entries=%d specified\n",
                    num_entries);
                num_entries = NVME_ADMIN_ENTRIES;
        }

        /*
         * The admin queue's max xfer size is treated differently than the
         *  max I/O xfer size.  16KB is sufficient here - maybe even less?
         */
        nvme_qpair_construct(qpair,
                             0, /* qpair ID */
                             0, /* vector */
                             num_entries,
                             NVME_ADMIN_TRACKERS,
                             16*1024, /* max xfer size */
                             ctrlr);
}

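/*
 * Allocate and construct the I/O queue pairs, sizing each queue based on
 *  the controller's MQES limit and the hw.nvme.io_entries/io_trackers
 *  tunables.
 */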
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
        struct nvme_qpair       *qpair;
        union cap_lo_register   cap_lo;
        int                     i, num_entries, num_trackers;

        num_entries = NVME_IO_ENTRIES;
        TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

        /*
         * NVMe spec sets a hard limit of 64K max entries, but
         *  devices may specify a smaller limit, so we need to check
         *  the MQES field in the capabilities register.
         */
        cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
        num_entries = min(num_entries, cap_lo.bits.mqes+1);

        num_trackers = NVME_IO_TRACKERS;
        TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);

        num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
        num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
        /*
         * No need to have more trackers than entries in the submit queue.
         *  Note also that for a queue size of N, we can only have (N-1)
         *  commands outstanding, hence the "-1" here.
         */
        num_trackers = min(num_trackers, (num_entries-1));

        ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
        TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
        /*
         * Check that the tunable doesn't specify a size greater than what
         *  our driver supports, and that it is an even multiple of PAGE_SIZE.
         */
        if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
            ctrlr->max_xfer_size % PAGE_SIZE)
                ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;

        ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
            M_NVME, M_ZERO | M_NOWAIT);

        if (ctrlr->ioq == NULL)
                return (ENOMEM);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                qpair = &ctrlr->ioq[i];

                /*
                 * Admin queue has ID=0. IO queues start at ID=1 -
                 *  hence the 'i+1' here.
                 *
                 * For I/O queues, use the controller-wide max_xfer_size
                 *  calculated above.
                 */
                nvme_qpair_construct(qpair,
                                     i+1, /* qpair ID */
                                     ctrlr->msix_enabled ? i+1 : 0, /* vector */
                                     num_entries,
                                     num_trackers,
                                     ctrlr->max_xfer_size,
                                     ctrlr);

                if (ctrlr->per_cpu_io_queues)
                        bus_bind_intr(ctrlr->dev, qpair->res, i);
        }

        return (0);
}

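/*
 * Poll CSTS.RDY until the controller reports ready, or fail with ENXIO if
 *  it does not do so within the ready timeout derived from CAP.TO.
 */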
static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
{
        int ms_waited;
        union cc_register cc;
        union csts_register csts;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (!cc.bits.en) {
                device_printf(ctrlr->dev, "%s called with cc.en = 0\n",
                    __func__);
                return (ENXIO);
        }

        ms_waited = 0;

        while (!csts.bits.rdy) {
                DELAY(1000);
                if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
                        device_printf(ctrlr->dev, "controller did not become "
                            "ready within %d ms\n", ctrlr->ready_timeout_in_ms);
                        return (ENXIO);
                }
                csts.raw = nvme_mmio_read_4(ctrlr, csts);
        }

        return (0);
}

static void
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
        union cc_register cc;
        union csts_register csts;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (cc.bits.en == 1 && csts.bits.rdy == 0)
                nvme_ctrlr_wait_for_ready(ctrlr);

        cc.bits.en = 0;
        nvme_mmio_write_4(ctrlr, cc, cc.raw);
        DELAY(5000);
}

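/*
 * Program the admin queue base addresses and sizes, then set CC.EN along
 *  with the queue entry sizes and wait for the controller to report ready.
 *  If the controller is already enabled, just wait for ready.
 */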
static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
        union cc_register       cc;
        union csts_register     csts;
        union aqa_register      aqa;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (cc.bits.en == 1) {
                if (csts.bits.rdy == 1)
                        return (0);
                else
                        return (nvme_ctrlr_wait_for_ready(ctrlr));
        }

        nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
        DELAY(5000);
        nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
        DELAY(5000);

        aqa.raw = 0;
        /* acqs and asqs are 0-based. */
        aqa.bits.acqs = ctrlr->adminq.num_entries-1;
        aqa.bits.asqs = ctrlr->adminq.num_entries-1;
        nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
        DELAY(5000);

        cc.bits.en = 1;
        cc.bits.css = 0;
        cc.bits.ams = 0;
        cc.bits.shn = 0;
        cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
        cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */

        /*
         * MPS encodes the memory page size as 2^(12+MPS), so for 4KB pages
         *  this evaluates to 0, as the spec requires.
         */
        cc.bits.mps = (PAGE_SIZE >> 13);

        nvme_mmio_write_4(ctrlr, cc, cc.raw);
        DELAY(5000);

        return (nvme_ctrlr_wait_for_ready(ctrlr));
}

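/*
 * Perform a controller-level reset: quiesce the admin and I/O queue pairs,
 *  then disable and re-enable the controller.
 */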
int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
        int i;

        nvme_admin_qpair_disable(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_io_qpair_disable(&ctrlr->ioq[i]);

        DELAY(100*1000);

        nvme_ctrlr_disable(ctrlr);
        return (nvme_ctrlr_enable(ctrlr));
}

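/*
 * Schedule an asynchronous controller reset on the controller's taskqueue.
 *  The is_resetting flag ensures only one reset is queued at a time.
 */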
void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
        int cmpset;

        cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);

        if (cmpset == 0)
                /* Controller is already resetting. */
                return;

        taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}

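/*
 * Issue IDENTIFY CONTROLLER synchronously and cache the returned data in
 *  ctrlr->cdata, clamping max_xfer_size to the reported MDTS value.
 */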
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
        struct mtx              *mtx;
        struct nvme_completion  cpl;
        int                     status;

        mtx = mtx_pool_find(mtxpool_sleep, &cpl);

        mtx_lock(mtx);
        nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
            nvme_ctrlr_cb, &cpl);
        status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
        mtx_unlock(mtx);
        if ((status != 0) || nvme_completion_is_error(&cpl)) {
                printf("nvme_identify_controller failed!\n");
                return (ENXIO);
        }

#ifdef CHATHAM2
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                nvme_chatham_populate_cdata(ctrlr);
#endif

        /*
         * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
         *  controller supports.
         */
        if (ctrlr->cdata.mdts > 0)
                ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
                    ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));

        return (0);
}

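/*
 * Ask the controller for the desired number of I/O queue pairs, and fall
 *  back to a single I/O queue if it grants fewer than requested.
 */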
static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
        struct mtx              *mtx;
        struct nvme_completion  cpl;
        int                     cq_allocated, sq_allocated, status;

        mtx = mtx_pool_find(mtxpool_sleep, &cpl);

        mtx_lock(mtx);
        nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
            nvme_ctrlr_cb, &cpl);
        status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
        mtx_unlock(mtx);
        if ((status != 0) || nvme_completion_is_error(&cpl)) {
                printf("nvme_set_num_queues failed!\n");
                return (ENXIO);
        }

        /*
         * Data in cdw0 is 0-based.
         * Lower 16-bits indicate number of submission queues allocated.
         * Upper 16-bits indicate number of completion queues allocated.
         */
        sq_allocated = (cpl.cdw0 & 0xFFFF) + 1;
        cq_allocated = (cpl.cdw0 >> 16) + 1;

        /*
         * Check that the controller was able to allocate the number of
         *  queues we requested.  If not, revert to one IO queue.
         */
        if (sq_allocated < ctrlr->num_io_queues ||
            cq_allocated < ctrlr->num_io_queues) {
                ctrlr->num_io_queues = 1;
                ctrlr->per_cpu_io_queues = 0;

                /*
                 * TODO: destroy extra queues that were created
                 *  previously but now found to be not needed.
                 */
        }

        return (0);
}

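/*
 * Create the I/O completion and submission queues on the controller, one
 *  pair at a time, using synchronous admin commands.
 */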
static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
        struct mtx              *mtx;
        struct nvme_qpair       *qpair;
        struct nvme_completion  cpl;
        int                     i, status;

        mtx = mtx_pool_find(mtxpool_sleep, &cpl);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                qpair = &ctrlr->ioq[i];

                mtx_lock(mtx);
                nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
                    nvme_ctrlr_cb, &cpl);
                status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
                mtx_unlock(mtx);
                if ((status != 0) || nvme_completion_is_error(&cpl)) {
                        printf("nvme_create_io_cq failed!\n");
                        return (ENXIO);
                }

                mtx_lock(mtx);
                nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
                    nvme_ctrlr_cb, &cpl);
                status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
                mtx_unlock(mtx);
                if ((status != 0) || nvme_completion_is_error(&cpl)) {
                        printf("nvme_create_io_sq failed!\n");
                        return (ENXIO);
                }
        }

        return (0);
}

static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
        struct nvme_namespace   *ns;
        int                     i, status;

        for (i = 0; i < ctrlr->cdata.nn; i++) {
                ns = &ctrlr->ns[i];
                status = nvme_ns_construct(ns, i+1, ctrlr);
                if (status != 0)
                        return (status);
        }

        return (0);
}

static boolean_t
is_log_page_id_valid(uint8_t page_id)
{

        switch (page_id) {
        case NVME_LOG_ERROR:
        case NVME_LOG_HEALTH_INFORMATION:
        case NVME_LOG_FIRMWARE_SLOT:
                return (TRUE);
        }

        return (FALSE);
}

static uint32_t
nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
{
        uint32_t        log_page_size;

        switch (page_id) {
        case NVME_LOG_ERROR:
                log_page_size = min(
                    sizeof(struct nvme_error_information_entry) *
                    ctrlr->cdata.elpe,
                    NVME_MAX_AER_LOG_SIZE);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                log_page_size = sizeof(struct nvme_health_information_page);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                log_page_size = sizeof(struct nvme_firmware_page);
                break;
        default:
                log_page_size = 0;
                break;
        }

        return (log_page_size);
}

static void
nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
{
        struct nvme_async_event_request *aer = arg;

        /*
         * If the log page fetch for some reason completed with an error,
         *  don't pass log page data to the consumers.  In practice, this case
         *  should never happen.
         */
        if (nvme_completion_is_error(cpl))
                nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
                    aer->log_page_id, NULL, 0);
        else
                /*
                 * Pass the cpl data from the original async event completion,
                 *  not the log page fetch.
                 */
                nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
                    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);

        /*
         * Repost another asynchronous event request to replace the one
         *  that just completed.
         */
        nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
}

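/*
 * Completion callback for asynchronous event requests: fetch the associated
 *  log page if the event reports a valid one, otherwise notify consumers
 *  and repost the request immediately.
 */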
static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
        struct nvme_async_event_request *aer = arg;

        if (cpl->status.sc == NVME_SC_ABORTED_SQ_DELETION) {
                /*
                 *  This is simulated when the controller is being shut down,
                 *  to effectively abort outstanding asynchronous event
                 *  requests and make sure all memory is freed.  Do not
                 *  repost the request in this case.
                 */
                return;
        }

        printf("Asynchronous event occurred.\n");

        /* Associated log page is in bits 23:16 of completion entry dw0. */
        aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;

        if (is_log_page_id_valid(aer->log_page_id)) {
                aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
                    aer->log_page_id);
                memcpy(&aer->cpl, cpl, sizeof(*cpl));
                nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
                    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
                    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
                    aer);
                /* Wait to notify consumers until after log page is fetched. */
        } else {
                nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
                    NULL, 0);

                /*
                 * Repost another asynchronous event request to replace the one
                 *  that just completed.
                 */
                nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
        }
}

static void
nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
    struct nvme_async_event_request *aer)
{
        struct nvme_request *req;

        aer->ctrlr = ctrlr;
        req = nvme_allocate_request(NULL, 0, nvme_ctrlr_async_event_cb, aer);
        aer->req = req;

        /*
         * Disable timeout here, since asynchronous event requests should by
         *  nature never be timed out.
         */
        req->timeout = FALSE;
        req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
        nvme_ctrlr_submit_admin_request(ctrlr, req);
}

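/*
 * Enable asynchronous event notifications for all critical warning types
 *  and post the initial asynchronous event requests, bounded by the
 *  controller's AERL value (none at all on Chatham).
 */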
static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
        union nvme_critical_warning_state       state;
        struct nvme_async_event_request         *aer;
        uint32_t                                i;

        state.raw = 0xFF;
        state.bits.reserved = 0;
        nvme_ctrlr_cmd_set_async_event_config(ctrlr, state, NULL, NULL);

        /* aerl is a zero-based value, so we need to add 1 here. */
        ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

        /* Chatham doesn't support AERs. */
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                ctrlr->num_aers = 0;

        for (i = 0; i < ctrlr->num_aers; i++) {
                aer = &ctrlr->aer[i];
                nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
        }
}

static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

        ctrlr->int_coal_time = 0;
        TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
            &ctrlr->int_coal_time);

        ctrlr->int_coal_threshold = 0;
        TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
            &ctrlr->int_coal_threshold);

        nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
            ctrlr->int_coal_threshold, NULL, NULL);
}

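/*
 * Bring the controller to a fully operational state: reset and enable the
 *  queue pairs, identify the controller, negotiate and create the I/O
 *  queues, construct namespaces, and configure AERs and interrupt
 *  coalescing.  Called both at attach time and after a controller reset.
 */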
static void
nvme_ctrlr_start(void *ctrlr_arg)
{
        struct nvme_controller *ctrlr = ctrlr_arg;
        int i;

        nvme_qpair_reset(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_qpair_reset(&ctrlr->ioq[i]);

        nvme_admin_qpair_enable(&ctrlr->adminq);

        if (nvme_ctrlr_identify(ctrlr) != 0)
                return;

        if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
                return;

        if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
                return;

        if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
                return;

        nvme_ctrlr_configure_aer(ctrlr);
        nvme_ctrlr_configure_int_coalescing(ctrlr);

        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_io_qpair_enable(&ctrlr->ioq[i]);
}

void
nvme_ctrlr_start_config_hook(void *arg)
{
        struct nvme_controller *ctrlr = arg;

        nvme_ctrlr_start(ctrlr);
        config_intrhook_disestablish(&ctrlr->config_hook);
}

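/*
 * Taskqueue handler for controller resets: perform the hardware reset,
 *  restart the controller if the reset succeeded, and clear the
 *  is_resetting flag so future resets can be scheduled.
 */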
static void
nvme_ctrlr_reset_task(void *arg, int pending)
{
        struct nvme_controller  *ctrlr = arg;
        int                     status;

        device_printf(ctrlr->dev, "resetting controller\n");
        status = nvme_ctrlr_hw_reset(ctrlr);
        /*
         * Use pause instead of DELAY, so that we yield to any nvme interrupt
         *  handlers on this CPU that were blocked on a qpair lock. We want
         *  all nvme interrupts completed before proceeding with restarting the
         *  controller.
         *
         * XXX - any way to guarantee the interrupt handlers have quiesced?
         */
        pause("nvmereset", hz / 10);
        if (status == 0)
                nvme_ctrlr_start(ctrlr);

        atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}

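/*
 * Legacy (INTx) interrupt handler: mask the interrupt, process completions
 *  on the admin and first I/O queues, then unmask.
 */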
static void
nvme_ctrlr_intx_handler(void *arg)
{
        struct nvme_controller *ctrlr = arg;

        nvme_mmio_write_4(ctrlr, intms, 1);

        nvme_qpair_process_completions(&ctrlr->adminq);

        if (ctrlr->ioq[0].cpl)
                nvme_qpair_process_completions(&ctrlr->ioq[0]);

        nvme_mmio_write_4(ctrlr, intmc, 1);
}

static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

        ctrlr->num_io_queues = 1;
        ctrlr->per_cpu_io_queues = 0;
        ctrlr->rid = 0;
        ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
            &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

        if (ctrlr->res == NULL) {
                device_printf(ctrlr->dev, "unable to allocate shared IRQ\n");
                return (ENOMEM);
        }

        bus_setup_intr(ctrlr->dev, ctrlr->res,
            INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
            ctrlr, &ctrlr->tag);

        if (ctrlr->tag == NULL) {
                device_printf(ctrlr->dev,
                    "unable to setup legacy interrupt handler\n");
                return (ENOMEM);
        }

        return (0);
}

static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
        struct nvme_controller  *ctrlr;
        struct nvme_completion  cpl;
        struct mtx              *mtx;

        ctrlr = cdev->si_drv1;

        switch (cmd) {
        case NVME_IDENTIFY_CONTROLLER:
#ifdef CHATHAM2
                /*
                 * Don't refresh data on Chatham, since Chatham returns
                 *  garbage on IDENTIFY anyway.
                 */
                if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
                        memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
                        break;
                }
#endif
                /* Refresh data before returning to user. */
                mtx = mtx_pool_find(mtxpool_sleep, &cpl);
                mtx_lock(mtx);
                nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
                    nvme_ctrlr_cb, &cpl);
                msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
                mtx_unlock(mtx);
                if (nvme_completion_is_error(&cpl))
                        return (ENXIO);
                memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
                break;
        case NVME_RESET_CONTROLLER:
                nvme_ctrlr_reset(ctrlr);
                break;
        default:
                return (ENOTTY);
        }

        return (0);
}

static struct cdevsw nvme_ctrlr_cdevsw = {
        .d_version =    D_VERSION,
        .d_flags =      0,
        .d_ioctl =      nvme_ctrlr_ioctl
};

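/*
 * One-time controller initialization at attach: map the BARs, read the
 *  capability registers, fetch tunables, set up MSI-X or INTx interrupts,
 *  construct the admin and I/O queue pairs, create the character device,
 *  and set up the reset taskqueue.
 */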
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
        union cap_lo_register   cap_lo;
        union cap_hi_register   cap_hi;
        int                     num_vectors, per_cpu_io_queues, status = 0;
        int                     timeout_period;

        ctrlr->dev = dev;

        status = nvme_ctrlr_allocate_bar(ctrlr);

        if (status != 0)
                return (status);

#ifdef CHATHAM2
        if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
                status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
                if (status != 0)
                        return (status);
                nvme_ctrlr_setup_chatham(ctrlr);
        }
#endif

        /*
         * Software emulators may set the doorbell stride to something
         *  other than zero, but this driver is not set up to handle that.
         */
        cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
        if (cap_hi.bits.dstrd != 0)
                return (ENXIO);

        ctrlr->min_page_size = 1 << (12 + cap_hi.bits.mpsmin);

        /* Get ready timeout value from controller, in units of 500ms. */
        cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
        ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

        timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
        TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
        timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
        timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
        ctrlr->timeout_period = timeout_period;

        nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
        TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);

        per_cpu_io_queues = 1;
        TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
        ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;

        if (ctrlr->per_cpu_io_queues)
                ctrlr->num_io_queues = mp_ncpus;
        else
                ctrlr->num_io_queues = 1;

        ctrlr->force_intx = 0;
        TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

        ctrlr->enable_aborts = 0;
        TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);

        ctrlr->msix_enabled = 1;

        if (ctrlr->force_intx) {
                ctrlr->msix_enabled = 0;
                goto intx;
        }

        /* One vector per IO queue, plus one vector for admin queue. */
        num_vectors = ctrlr->num_io_queues + 1;

        if (pci_msix_count(dev) < num_vectors) {
                ctrlr->msix_enabled = 0;
                goto intx;
        }

        if (pci_alloc_msix(dev, &num_vectors) != 0)
                ctrlr->msix_enabled = 0;

intx:

        if (!ctrlr->msix_enabled)
                nvme_ctrlr_configure_intx(ctrlr);

        nvme_ctrlr_construct_admin_qpair(ctrlr);

        status = nvme_ctrlr_construct_io_qpairs(ctrlr);

        if (status != 0)
                return (status);

        ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
            "nvme%d", device_get_unit(dev));

        if (ctrlr->cdev == NULL)
                return (ENXIO);

        ctrlr->cdev->si_drv1 = (void *)ctrlr;

        TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
        ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
            taskqueue_thread_enqueue, &ctrlr->taskqueue);
        taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");

        ctrlr->is_resetting = 0;

        return (0);
}

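/*
 * Tear down everything set up in nvme_ctrlr_construct: the taskqueue,
 *  namespaces, character device, queue pairs, BAR mappings, and interrupt
 *  resources.
 */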
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
        int                             i;

        taskqueue_free(ctrlr->taskqueue);

        for (i = 0; i < NVME_MAX_NAMESPACES; i++)
                nvme_ns_destruct(&ctrlr->ns[i]);

        if (ctrlr->cdev)
                destroy_dev(ctrlr->cdev);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                nvme_io_qpair_destroy(&ctrlr->ioq[i]);
        }

        free(ctrlr->ioq, M_NVME);

        nvme_admin_qpair_destroy(&ctrlr->adminq);

        if (ctrlr->resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->resource_id, ctrlr->resource);
        }

        if (ctrlr->bar4_resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
        }

#ifdef CHATHAM2
        if (ctrlr->chatham_resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->chatham_resource_id, ctrlr->chatham_resource);
        }
#endif

        if (ctrlr->tag)
                bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);

        if (ctrlr->res)
                bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
                    rman_get_rid(ctrlr->res), ctrlr->res);

        if (ctrlr->msix_enabled)
                pci_release_msi(dev);
}

void
nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

        nvme_qpair_submit_request(&ctrlr->adminq, req);
}

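/*
 * Submit an I/O request on the queue pair associated with the current CPU
 *  when per-CPU queues are enabled, otherwise on the single shared I/O
 *  queue pair.
 */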
void
nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{
        struct nvme_qpair       *qpair;

        if (ctrlr->per_cpu_io_queues)
                qpair = &ctrlr->ioq[curcpu];
        else
                qpair = &ctrlr->ioq[0];

        nvme_qpair_submit_request(qpair, req);
}

device_t
nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
{

        return (ctrlr->dev);
}

const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
{

        return (&ctrlr->cdata);
}