/*-
 * Copyright (C) 2012 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"
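/*
 * Controller-level setup and configuration for the nvme(4) driver:
 * BAR mapping, admin and I/O queue pair construction, the controller
 * enable handshake, and the character device used for management
 * ioctls.  Code guarded by CHATHAM2 supports Chatham, an early Intel
 * NVMe prototype board that predates production controllers and
 * needs several workarounds.
 */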
static void
nvme_ctrlr_cb(void *arg, const struct nvme_completion *status)
{
	struct nvme_completion	*cpl = arg;
	struct mtx		*mtx;

	/*
	 * Copy status into the argument passed by the caller, so that
	 *  the caller can check the status to determine if the request
	 *  passed or failed.
	 */
	memcpy(cpl, status, sizeof(*cpl));
	mtx = mtx_pool_find(mtxpool_sleep, cpl);
	mtx_lock(mtx);
	wakeup(cpl);
	mtx_unlock(mtx);
}
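/*
 * nvme_ctrlr_cb() is the completion side of the synchronous command
 * pattern used throughout this file.  The submitting thread follows
 * roughly this shape (a sketch of the convention used below, not a
 * separate API):
 *
 *	struct nvme_completion cpl;
 *	struct mtx *mtx = mtx_pool_find(mtxpool_sleep, &cpl);
 *
 *	mtx_lock(mtx);
 *	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
 *	    nvme_ctrlr_cb, &cpl);
 *	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
 *	mtx_unlock(mtx);
 *
 * The stack address of the completion structure doubles as the sleep
 * channel and as the key used to pick a mutex from mtxpool_sleep, so
 * the callback can find the same mutex and wake the sleeping thread.
 */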
static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

	/* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
	if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
		ctrlr->resource_id = PCIR_BAR(2);
	else
		ctrlr->resource_id = PCIR_BAR(0);

	ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
	    &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);

	if (ctrlr->resource == NULL) {
		device_printf(ctrlr->dev, "unable to allocate pci resource\n");
		return (ENOMEM);
	}

	ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
	ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
	ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

	return (0);
}
#ifdef CHATHAM2
static int
nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
{

	ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
	ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
	    SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
	    RF_ACTIVE);

	if (ctrlr->chatham_resource == NULL) {
		device_printf(ctrlr->dev, "unable to alloc pci resource\n");
		return (ENOMEM);
	}

	ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
	ctrlr->chatham_bus_handle =
	    rman_get_bushandle(ctrlr->chatham_resource);

	return (0);
}
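/*
 * Note: the hw.nvme.* knobs fetched via TUNABLE_INT_FETCH() throughout
 * this file are standard FreeBSD loader tunables, settable from
 * /boot/loader.conf, e.g.:
 *
 *	hw.nvme.use_flash_timings="1"
 *	hw.nvme.per_cpu_io_queues="0"
 */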
static void
nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
{
	uint64_t	reg1, reg2, reg3;
	uint64_t	temp1, temp2;
	uint32_t	temp3;
	uint32_t	use_flash_timings = 0;

	DELAY(10000);

	temp3 = chatham_read_4(ctrlr, 0x8080);

	device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);

	ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
	ctrlr->chatham_size = ctrlr->chatham_lbas * 512;

	device_printf(ctrlr->dev, "Chatham size: %lld\n",
	    (long long)ctrlr->chatham_size);

	reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;

	TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
	if (use_flash_timings) {
		device_printf(ctrlr->dev, "Chatham: using flash timings\n");
		temp1 = 0x00001b58000007d0LL;
		temp2 = 0x000000cb00000131LL;
	} else {
		device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
		temp1 = temp2 = 0x0LL;
	}

	chatham_write_8(ctrlr, 0x8000, reg1);
	chatham_write_8(ctrlr, 0x8008, reg2);
	chatham_write_8(ctrlr, 0x8010, reg3);

	chatham_write_8(ctrlr, 0x8020, temp1);
	temp3 = chatham_read_4(ctrlr, 0x8020);

	chatham_write_8(ctrlr, 0x8028, temp2);
	temp3 = chatham_read_4(ctrlr, 0x8028);

	chatham_write_8(ctrlr, 0x8030, temp1);
	chatham_write_8(ctrlr, 0x8038, temp2);
	chatham_write_8(ctrlr, 0x8040, temp1);
	chatham_write_8(ctrlr, 0x8048, temp2);
	chatham_write_8(ctrlr, 0x8050, temp1);
	chatham_write_8(ctrlr, 0x8058, temp2);

	DELAY(10000);
}
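/*
 * The sn/mn/fr fields populated below are fixed-size ASCII byte arrays
 * in the IDENTIFY CONTROLLER data structure (not NUL-terminated C
 * strings), which is why they are zeroed and then filled with memcpy()
 * rather than copied with strcpy().
 */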
static void
nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
{
	struct nvme_controller_data *cdata;

	cdata = &ctrlr->cdata;

	cdata->vid = 0x8086;
	cdata->ssvid = 0x2011;

	/*
	 * Chatham2 puts garbage data in these fields when we
	 *  invoke IDENTIFY_CONTROLLER, so we need to re-zero
	 *  the fields before calling bcopy().
	 */
	memset(cdata->sn, 0, sizeof(cdata->sn));
	memcpy(cdata->sn, "2012", strlen("2012"));
	memset(cdata->mn, 0, sizeof(cdata->mn));
	memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
	memset(cdata->fr, 0, sizeof(cdata->fr));
	memcpy(cdata->fr, "0", strlen("0"));

	cdata->lpa.ns_smart = 1;

	cdata->sqes.min = 6;
	cdata->sqes.max = 6;
	cdata->cqes.min = 4;
	cdata->cqes.max = 4;
	cdata->nn = 1;

	/* Chatham2 doesn't support DSM command */
	cdata->oncs.dsm = 0;

	cdata->vwc.present = 1;
}
#endif /* CHATHAM2 */
static void
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	uint32_t		num_entries;

	qpair = &ctrlr->adminq;

	num_entries = NVME_ADMIN_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
	/*
	 * If admin_entries was overridden to an invalid value, revert it
	 *  back to our default value.
	 */
	if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
	    num_entries > NVME_MAX_ADMIN_ENTRIES) {
		printf("nvme: invalid hw.nvme.admin_entries=%d specified\n",
		    num_entries);
		num_entries = NVME_ADMIN_ENTRIES;
	}

	/*
	 * The admin queue's max xfer size is treated differently than the
	 *  max I/O xfer size.  16KB is sufficient here - maybe even less?
	 */
	nvme_qpair_construct(qpair, 0, 0, num_entries, 16*1024, ctrlr);
}
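/*
 * As can be read off the call above and the one in
 * nvme_ctrlr_construct_io_qpairs() below, nvme_qpair_construct()
 * (declared in nvme_private.h) takes the qpair, its queue ID, its
 * interrupt vector, the number of entries, the maximum transfer size,
 * and the owning controller; the admin queue uses ID 0 and vector 0.
 */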
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	union cap_lo_register	cap_lo;
	int			i, num_entries;

	num_entries = NVME_IO_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

	num_entries = max(num_entries, NVME_MIN_IO_ENTRIES);

	/*
	 * NVMe spec sets a hard limit of 64K max entries, but
	 *  devices may specify a smaller limit, so we need to check
	 *  the MQES field in the capabilities register.
	 */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	num_entries = min(num_entries, cap_lo.bits.mqes+1);

	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
	TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
	/*
	 * Check that tunable doesn't specify a size greater than what our
	 *  driver supports, and is an even PAGE_SIZE multiple.
	 */
	if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
	    ctrlr->max_xfer_size % PAGE_SIZE)
		ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;

	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
	    M_NVME, M_ZERO | M_NOWAIT);

	if (ctrlr->ioq == NULL)
		return (ENOMEM);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		/*
		 * Admin queue has ID=0. IO queues start at ID=1 -
		 *  hence the 'i+1' here.
		 *
		 * For I/O queues, use the controller-wide max_xfer_size
		 *  calculated in nvme_attach().
		 */
		nvme_qpair_construct(qpair,
		    i+1, /* qpair ID */
		    ctrlr->msix_enabled ? i+1 : 0, /* vector */
		    num_entries,
		    ctrlr->max_xfer_size,
		    ctrlr);

		if (ctrlr->per_cpu_io_queues)
			bus_bind_intr(ctrlr->dev, qpair->res, i);
	}

	return (0);
}
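/*
 * The following functions implement the controller enable handshake
 * from the NVMe spec: after CC.EN is set, software polls CSTS.RDY
 * until it reads 1, bounded by the timeout the controller advertises
 * in CAP.TO (units of 500 ms; see nvme_ctrlr_construct(), which
 * stores it as ready_timeout_in_ms).
 */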
static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
{
	int ms_waited;
	union cc_register cc;
	union csts_register csts;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (!cc.bits.en) {
		device_printf(ctrlr->dev, "%s called with cc.en = 0\n",
		    __func__);
		return (ENXIO);
	}

	ms_waited = 0;

	while (!csts.bits.rdy) {
		DELAY(1000);
		if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
			device_printf(ctrlr->dev, "controller did not become "
			    "ready within %d ms\n", ctrlr->ready_timeout_in_ms);
			return (ENXIO);
		}
		csts.raw = nvme_mmio_read_4(ctrlr, csts);
	}

	return (0);
}
static void
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
	union cc_register cc;
	union csts_register csts;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (cc.bits.en == 1 && csts.bits.rdy == 0)
		nvme_ctrlr_wait_for_ready(ctrlr);

	cc.bits.en = 0;
	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	DELAY(5000);
}
static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
	union cc_register	cc;
	union csts_register	csts;
	union aqa_register	aqa;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (cc.bits.en == 1) {
		if (csts.bits.rdy == 1)
			return (0);
		else
			return (nvme_ctrlr_wait_for_ready(ctrlr));
	}

	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
	DELAY(5000);
	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
	DELAY(5000);

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = ctrlr->adminq.num_entries-1;
	aqa.bits.asqs = ctrlr->adminq.num_entries-1;
	nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
	DELAY(5000);

	cc.bits.en = 1;
	cc.bits.css = 0;
	cc.bits.ams = 0;
	cc.bits.shn = 0;
	cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
	cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */

	/* This evaluates to 0, which is according to spec. */
	cc.bits.mps = (PAGE_SIZE >> 13);

	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	DELAY(5000);

	return (nvme_ctrlr_wait_for_ready(ctrlr));
}
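/*
 * Re the CC.MPS computation above: the spec encodes the memory page
 * size as 2^(12 + MPS), so PAGE_SIZE == 4096 must be written as
 * MPS == 0, which is exactly what (PAGE_SIZE >> 13) == (4096 >> 13)
 * evaluates to.
 */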
int
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{

	nvme_ctrlr_disable(ctrlr);
	return (nvme_ctrlr_enable(ctrlr));
}
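/*
 * Each asynchronous event completion consumes one of the outstanding
 * asynchronous event requests, so the callback below immediately
 * submits a replacement to keep the controller supplied with request
 * slots.
 */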
static void
nvme_async_event_cb(void *arg, const struct nvme_completion *status)
{
	struct nvme_controller *ctrlr = arg;

	printf("Asynchronous event occurred.\n");

	/* TODO: decode async event type based on status */
	/* TODO: check status for any error bits */

	/*
	 * Repost an asynchronous event request so that it can be
	 *  used again by the controller.
	 */
	nvme_ctrlr_cmd_asynchronous_event_request(ctrlr, nvme_async_event_cb,
	    ctrlr);
}
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_completion	cpl;
	int			status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	mtx_lock(mtx);
	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
	    nvme_ctrlr_cb, &cpl);
	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
	mtx_unlock(mtx);
	if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
		printf("nvme_identify_controller failed!\n");
		return (ENXIO);
	}

#ifdef CHATHAM2
	if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
		nvme_chatham_populate_cdata(ctrlr);
#endif

	return (0);
}
static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_completion	cpl;
	int			cq_allocated, sq_allocated, status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	mtx_lock(mtx);
	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
	    nvme_ctrlr_cb, &cpl);
	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
	mtx_unlock(mtx);
	if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
		printf("nvme_set_num_queues failed!\n");
		return (ENXIO);
	}

	/*
	 * Data in cdw0 is 0-based.
	 * Lower 16-bits indicate number of submission queues allocated.
	 * Upper 16-bits indicate number of completion queues allocated.
	 */
	sq_allocated = (cpl.cdw0 & 0xFFFF) + 1;
	cq_allocated = (cpl.cdw0 >> 16) + 1;
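	/*
	 * For example, a returned cdw0 of 0x00030003 means the
	 * controller granted four submission queues and four completion
	 * queues ((0x0003 + 1) == 4 for each 16-bit half).
	 */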
	/*
	 * Check that the controller was able to allocate the number of
	 *  queues we requested.  If not, revert to one IO queue.
	 */
	if (sq_allocated < ctrlr->num_io_queues ||
	    cq_allocated < ctrlr->num_io_queues) {
		ctrlr->num_io_queues = 1;
		ctrlr->per_cpu_io_queues = 0;

		/* TODO: destroy extra queues that were created
		 *  previously but now found to be not needed.
		 */
	}

	return (0);
}
static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_qpair	*qpair;
	struct nvme_completion	cpl;
	int			i, status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		mtx_lock(mtx);
		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
		    nvme_ctrlr_cb, &cpl);
		status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
		mtx_unlock(mtx);
		if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
			printf("nvme_create_io_cq failed!\n");
			return (ENXIO);
		}

		mtx_lock(mtx);
		nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
		    nvme_ctrlr_cb, &cpl);
		status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
		mtx_unlock(mtx);
		if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
			printf("nvme_create_io_sq failed!\n");
			return (ENXIO);
		}
	}

	return (0);
}
static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
	struct nvme_namespace	*ns;
	int			i, status;

	for (i = 0; i < ctrlr->cdata.nn; i++) {
		ns = &ctrlr->ns[i];
		status = nvme_ns_construct(ns, i+1, ctrlr);
		if (status != 0)
			return (status);
	}

	return (0);
}
static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
	union nvme_critical_warning_state	state;
	uint8_t					num_async_events;

	state.raw = 0xFF;
	state.bits.reserved = 0;
	nvme_ctrlr_cmd_set_asynchronous_event_config(ctrlr, state, NULL, NULL);

	/* aerl is a zero-based value, so we need to add 1 here. */
	num_async_events = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

	/*
	 * Disable this code for now, since Chatham doesn't support
	 *  AERs so I have no good way to test them.
	 */
#if 0
	for (int i = 0; i < num_async_events; i++)
		nvme_ctrlr_cmd_asynchronous_event_request(ctrlr,
		    nvme_async_event_cb, ctrlr);
#endif
}
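/*
 * Both interrupt coalescing parameters below default to zero, i.e. no
 * aggregation delay, so coalescing only takes effect when overridden
 * via the hw.nvme.int_coal_time and hw.nvme.int_coal_threshold
 * tunables.
 */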
static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

	ctrlr->int_coal_time = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
	    &ctrlr->int_coal_time);

	ctrlr->int_coal_threshold = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
	    &ctrlr->int_coal_threshold);

	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
	    ctrlr->int_coal_threshold, NULL, NULL);
}
void
nvme_ctrlr_start(void *ctrlr_arg)
{
	struct nvme_controller *ctrlr = ctrlr_arg;

	if (nvme_ctrlr_identify(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
		goto err;

	nvme_ctrlr_configure_aer(ctrlr);
	nvme_ctrlr_configure_int_coalescing(ctrlr);

	ctrlr->is_started = TRUE;

err:
	/*
	 * Initialize sysctls, even if controller failed to start, to
	 *  assist with debugging admin queue pair.
	 */
	nvme_sysctl_initialize_ctrlr(ctrlr);
	config_intrhook_disestablish(&ctrlr->config_hook);
}
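/*
 * nvme_ctrlr_start() runs from the config_intrhook installed elsewhere
 * in the driver (see ctrlr->config_hook), so it executes once
 * interrupts are live and is allowed to sleep; the hook must be
 * disestablished exactly once, which is why that happens above even
 * when startup fails.
 */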
static void
nvme_ctrlr_intx_task(void *arg, int pending)
{
	struct nvme_controller *ctrlr = arg;

	nvme_qpair_process_completions(&ctrlr->adminq);

	if (ctrlr->ioq[0].cpl)
		nvme_qpair_process_completions(&ctrlr->ioq[0]);

	nvme_mmio_write_4(ctrlr, intmc, 1);
}
static void
nvme_ctrlr_intx_handler(void *arg)
{
	struct nvme_controller *ctrlr = arg;

	nvme_mmio_write_4(ctrlr, intms, 1);
	taskqueue_enqueue_fast(ctrlr->taskqueue, &ctrlr->task);
}
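/*
 * Legacy INTx handling above follows the usual mask-and-defer scheme:
 * the interrupt handler masks the vector via the INTMS register and
 * queues the task, and the task unmasks via INTMC once the admin and
 * I/O completion queues have been drained.
 */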
static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

	ctrlr->num_io_queues = 1;
	ctrlr->per_cpu_io_queues = 0;
	ctrlr->rid = 0;
	ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
	    &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

	if (ctrlr->res == NULL) {
		device_printf(ctrlr->dev, "unable to allocate shared IRQ\n");
		return (ENOMEM);
	}

	bus_setup_intr(ctrlr->dev, ctrlr->res,
	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
	    ctrlr, &ctrlr->tag);

	if (ctrlr->tag == NULL) {
		device_printf(ctrlr->dev,
		    "unable to setup legacy interrupt handler\n");
		return (ENOMEM);
	}

	TASK_INIT(&ctrlr->task, 0, nvme_ctrlr_intx_task, ctrlr);
	ctrlr->taskqueue = taskqueue_create_fast("nvme_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_NET,
	    "%s intx taskq", device_get_nameunit(ctrlr->dev));

	return (0);
}
static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvme_controller	*ctrlr;
	struct nvme_completion	cpl;
	struct mtx		*mtx;

	ctrlr = cdev->si_drv1;

	switch (cmd) {
	case NVME_IDENTIFY_CONTROLLER:
#ifdef CHATHAM2
		/*
		 * Don't refresh data on Chatham, since Chatham returns
		 *  garbage on IDENTIFY anyways.
		 */
		if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
			memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
			break;
		}
#endif
		/* Refresh data before returning to user. */
		mtx = mtx_pool_find(mtxpool_sleep, &cpl);
		mtx_lock(mtx);
		nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
		    nvme_ctrlr_cb, &cpl);
		msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
		mtx_unlock(mtx);
		if (cpl.sf_sc || cpl.sf_sct)
			return (ENXIO);
		memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
		break;
	default:
		return (ENOTTY);
	}

	return (0);
}
static struct cdevsw nvme_ctrlr_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	nvme_ctrlr_ioctl
};
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
	union cap_lo_register	cap_lo;
	union cap_hi_register	cap_hi;
	int			num_vectors, per_cpu_io_queues, status = 0;

	ctrlr->dev = dev;
	ctrlr->is_started = FALSE;

	status = nvme_ctrlr_allocate_bar(ctrlr);

	if (status != 0)
		return (status);

#ifdef CHATHAM2
	if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
		status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
		if (status != 0)
			return (status);
		nvme_ctrlr_setup_chatham(ctrlr);
	}
#endif

	/*
	 * Software emulators may set the doorbell stride to something
	 *  other than zero, but this driver is not set up to handle that.
	 */
	cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
	if (cap_hi.bits.dstrd != 0)
		return (ENXIO);

	/* Get ready timeout value from controller, in units of 500ms. */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

	per_cpu_io_queues = 1;
	TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
	ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;

	if (ctrlr->per_cpu_io_queues)
		ctrlr->num_io_queues = mp_ncpus;
	else
		ctrlr->num_io_queues = 1;

	ctrlr->force_intx = 0;
	TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

	ctrlr->msix_enabled = 1;

	if (ctrlr->force_intx) {
		ctrlr->msix_enabled = 0;
		goto intx;
	}

	/* One vector per IO queue, plus one vector for admin queue. */
	num_vectors = ctrlr->num_io_queues + 1;

	if (pci_msix_count(dev) < num_vectors) {
		ctrlr->msix_enabled = 0;
		goto intx;
	}

	if (pci_alloc_msix(dev, &num_vectors) != 0)
		ctrlr->msix_enabled = 0;

intx:
	if (!ctrlr->msix_enabled)
		nvme_ctrlr_configure_intx(ctrlr);

	nvme_ctrlr_construct_admin_qpair(ctrlr);

	status = nvme_ctrlr_construct_io_qpairs(ctrlr);

	if (status != 0)
		return (status);

	ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
	    "nvme%d", device_get_unit(dev));

	if (ctrlr->cdev == NULL)
		return (ENXIO);

	ctrlr->cdev->si_drv1 = (void *)ctrlr;

	return (0);
}