/*-
 * Copyright (C) 2012 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"

static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
                                                struct nvme_async_event_request *aer);

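/*
 * Generic completion callback for the synchronous admin commands issued in
 *  this file: copy the completion entry into the caller-supplied buffer and
 *  wake up the thread sleeping on it.
 */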
static void
nvme_ctrlr_cb(void *arg, const struct nvme_completion *status)
{
        struct nvme_completion  *cpl = arg;
        struct mtx              *mtx;

        /*
         * Copy status into the argument passed by the caller, so that
         *  the caller can check the status to determine if the
         *  request passed or failed.
         */
        memcpy(cpl, status, sizeof(*cpl));
        mtx = mtx_pool_find(mtxpool_sleep, cpl);
        mtx_lock(mtx);
        wakeup(cpl);
        mtx_unlock(mtx);
}

static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

        /* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                ctrlr->resource_id = PCIR_BAR(2);
        else
                ctrlr->resource_id = PCIR_BAR(0);

        ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
            &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);

        if (ctrlr->resource == NULL) {
                device_printf(ctrlr->dev, "unable to allocate pci resource\n");
                return (ENOMEM);
        }

        ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
        ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
        ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

        /*
         * The NVMe spec allows for the MSI-X table to be placed behind
         *  BAR 4/5, separate from the control/doorbell registers.  Always
         *  try to map this BAR, because it must be mapped prior to calling
         *  pci_alloc_msix().  If the table isn't behind BAR 4/5,
         *  bus_alloc_resource() will just return NULL, which is OK.
         */
        ctrlr->bar4_resource_id = PCIR_BAR(4);
        ctrlr->bar4_resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
            &ctrlr->bar4_resource_id, 0, ~0, 1, RF_ACTIVE);

        return (0);
}

#ifdef CHATHAM2
static int
nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
{

        ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
        ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
            SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
            RF_ACTIVE);

        if (ctrlr->chatham_resource == NULL) {
                device_printf(ctrlr->dev, "unable to alloc pci resource\n");
                return (ENOMEM);
        }

        ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
        ctrlr->chatham_bus_handle =
            rman_get_bushandle(ctrlr->chatham_resource);

        return (0);
}

static void
nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
{
        uint64_t reg1, reg2, reg3;
        uint64_t temp1, temp2;
        uint32_t temp3;
        uint32_t use_flash_timings = 0;

        DELAY(10000);

        temp3 = chatham_read_4(ctrlr, 0x8080);

        device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);

        ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
        ctrlr->chatham_size = ctrlr->chatham_lbas * 512;

        device_printf(ctrlr->dev, "Chatham size: %jd\n",
            (intmax_t)ctrlr->chatham_size);

        reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;

        TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
        if (use_flash_timings) {
                device_printf(ctrlr->dev, "Chatham: using flash timings\n");
                temp1 = 0x00001b58000007d0LL;
                temp2 = 0x000000cb00000131LL;
        } else {
                device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
                temp1 = temp2 = 0x0LL;
        }

        chatham_write_8(ctrlr, 0x8000, reg1);
        chatham_write_8(ctrlr, 0x8008, reg2);
        chatham_write_8(ctrlr, 0x8010, reg3);

        chatham_write_8(ctrlr, 0x8020, temp1);
        temp3 = chatham_read_4(ctrlr, 0x8020);

        chatham_write_8(ctrlr, 0x8028, temp2);
        temp3 = chatham_read_4(ctrlr, 0x8028);

        chatham_write_8(ctrlr, 0x8030, temp1);
        chatham_write_8(ctrlr, 0x8038, temp2);
        chatham_write_8(ctrlr, 0x8040, temp1);
        chatham_write_8(ctrlr, 0x8048, temp2);
        chatham_write_8(ctrlr, 0x8050, temp1);
        chatham_write_8(ctrlr, 0x8058, temp2);

        DELAY(10000);
}

static void
nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
{
        struct nvme_controller_data *cdata;

        cdata = &ctrlr->cdata;

        cdata->vid = 0x8086;
        cdata->ssvid = 0x2011;

        /*
         * Chatham2 puts garbage data in these fields when we
         *  invoke IDENTIFY_CONTROLLER, so we need to re-zero
         *  the fields before copying in the values below.
         */
        memset(cdata->sn, 0, sizeof(cdata->sn));
        memcpy(cdata->sn, "2012", strlen("2012"));
        memset(cdata->mn, 0, sizeof(cdata->mn));
        memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
        memset(cdata->fr, 0, sizeof(cdata->fr));
        memcpy(cdata->fr, "0", strlen("0"));
        cdata->rab = 8;
        cdata->aerl = 3;
        cdata->lpa.ns_smart = 1;
        cdata->sqes.min = 6;
        cdata->sqes.max = 6;
        cdata->cqes.min = 4;
        cdata->cqes.max = 4;
        cdata->nn = 1;

        /* Chatham2 doesn't support the DSM command. */
        cdata->oncs.dsm = 0;

        cdata->vwc.present = 1;
}
#endif /* CHATHAM2 */

static void
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
        struct nvme_qpair       *qpair;
        uint32_t                num_entries;

        qpair = &ctrlr->adminq;

        num_entries = NVME_ADMIN_ENTRIES;
        TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
        /*
         * If admin_entries was overridden to an invalid value, revert it
         *  back to our default value.
         */
        if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
            num_entries > NVME_MAX_ADMIN_ENTRIES) {
                printf("nvme: invalid hw.nvme.admin_entries=%d specified\n",
                    num_entries);
                num_entries = NVME_ADMIN_ENTRIES;
        }

        /*
         * The admin queue's max xfer size is treated differently than the
         *  max I/O xfer size.  16KB is sufficient here - maybe even less?
         */
        nvme_qpair_construct(qpair,
                             0, /* qpair ID */
                             0, /* vector */
                             num_entries,
                             NVME_ADMIN_TRACKERS,
                             16*1024, /* max xfer size */
                             ctrlr);
}

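/*
 * Allocate and construct the I/O queue pairs, sizing each queue based on
 *  the controller's MQES limit and the hw.nvme.io_entries/io_trackers
 *  tunables.
 */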
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
        struct nvme_qpair       *qpair;
        union cap_lo_register   cap_lo;
        int                     i, num_entries, num_trackers;

        num_entries = NVME_IO_ENTRIES;
        TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

        /*
         * NVMe spec sets a hard limit of 64K max entries, but
         *  devices may specify a smaller limit, so we need to check
         *  the MQES field in the capabilities register.
         */
        cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
        num_entries = min(num_entries, cap_lo.bits.mqes+1);

        num_trackers = NVME_IO_TRACKERS;
        TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);

        num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
        num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
        /*
         * No need to have more trackers than entries in the submit queue.
         *  Note also that for a queue size of N, we can only have (N-1)
         *  commands outstanding, hence the "-1" here.
         */
        num_trackers = min(num_trackers, (num_entries-1));

        ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
        TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
        /*
         * Check that the tunable doesn't specify a size greater than what
         *  our driver supports, and that it is an even multiple of PAGE_SIZE.
         */
        if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
            ctrlr->max_xfer_size % PAGE_SIZE)
                ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;

        ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
            M_NVME, M_ZERO | M_NOWAIT);

        if (ctrlr->ioq == NULL)
                return (ENOMEM);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                qpair = &ctrlr->ioq[i];

                /*
                 * Admin queue has ID=0. IO queues start at ID=1 -
                 *  hence the 'i+1' here.
                 *
                 * For I/O queues, use the controller-wide max_xfer_size
                 *  calculated above.
                 */
                nvme_qpair_construct(qpair,
                                     i+1, /* qpair ID */
                                     ctrlr->msix_enabled ? i+1 : 0, /* vector */
                                     num_entries,
                                     num_trackers,
                                     ctrlr->max_xfer_size,
                                     ctrlr);

                if (ctrlr->per_cpu_io_queues)
                        bus_bind_intr(ctrlr->dev, qpair->res, i);
        }

        return (0);
}

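/*
 * Poll CSTS.RDY until the controller reports ready, or fail with ENXIO if
 *  it does not do so within the ready timeout derived from CAP.TO.
 */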
static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
{
        int ms_waited;
        union cc_register cc;
        union csts_register csts;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (!cc.bits.en) {
                device_printf(ctrlr->dev, "%s called with cc.en = 0\n",
                    __func__);
                return (ENXIO);
        }

        ms_waited = 0;

        while (!csts.bits.rdy) {
                DELAY(1000);
                if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
                        device_printf(ctrlr->dev, "controller did not become "
                            "ready within %d ms\n", ctrlr->ready_timeout_in_ms);
                        return (ENXIO);
                }
                csts.raw = nvme_mmio_read_4(ctrlr, csts);
        }

        return (0);
}

static void
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
        union cc_register cc;
        union csts_register csts;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (cc.bits.en == 1 && csts.bits.rdy == 0)
                nvme_ctrlr_wait_for_ready(ctrlr);

        cc.bits.en = 0;
        nvme_mmio_write_4(ctrlr, cc, cc.raw);
        DELAY(5000);
}

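/*
 * Program the admin queue base addresses and sizes, then set CC.EN along
 *  with the queue entry sizes and wait for the controller to report ready.
 *  If the controller is already enabled, just wait for ready.
 */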
static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
        union cc_register       cc;
        union csts_register     csts;
        union aqa_register      aqa;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (cc.bits.en == 1) {
                if (csts.bits.rdy == 1)
                        return (0);
                else
                        return (nvme_ctrlr_wait_for_ready(ctrlr));
        }

        nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
        DELAY(5000);
        nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
        DELAY(5000);

        aqa.raw = 0;
        /* acqs and asqs are 0-based. */
        aqa.bits.acqs = ctrlr->adminq.num_entries-1;
        aqa.bits.asqs = ctrlr->adminq.num_entries-1;
        nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
        DELAY(5000);

        cc.bits.en = 1;
        cc.bits.css = 0;
        cc.bits.ams = 0;
        cc.bits.shn = 0;
        cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
        cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */

        /*
         * MPS encodes the memory page size as 2^(12+MPS), so for 4KB pages
         *  this evaluates to 0, as the spec requires.
         */
        cc.bits.mps = (PAGE_SIZE >> 13);

        nvme_mmio_write_4(ctrlr, cc, cc.raw);
        DELAY(5000);

        return (nvme_ctrlr_wait_for_ready(ctrlr));
}

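/*
 * Perform a controller-level reset: quiesce the admin and I/O queue pairs,
 *  then disable and re-enable the controller.
 */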
int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
        int i;

        nvme_admin_qpair_disable(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_io_qpair_disable(&ctrlr->ioq[i]);

        DELAY(100*1000);

        nvme_ctrlr_disable(ctrlr);
        return (nvme_ctrlr_enable(ctrlr));
}

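/*
 * Schedule an asynchronous controller reset on the controller's taskqueue.
 *  The is_resetting flag ensures only one reset is queued at a time.
 */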
void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
        int cmpset;

        cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);

        if (cmpset == 0)
                /* Controller is already resetting. */
                return;

        taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}

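/*
 * Issue IDENTIFY CONTROLLER synchronously and cache the returned data in
 *  ctrlr->cdata, clamping max_xfer_size to the reported MDTS value.
 */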
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
        struct mtx              *mtx;
        struct nvme_completion  cpl;
        int                     status;

        mtx = mtx_pool_find(mtxpool_sleep, &cpl);

        mtx_lock(mtx);
        nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
            nvme_ctrlr_cb, &cpl);
        status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
        mtx_unlock(mtx);
        if ((status != 0) || nvme_completion_is_error(&cpl)) {
                printf("nvme_identify_controller failed!\n");
                return (ENXIO);
        }

#ifdef CHATHAM2
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                nvme_chatham_populate_cdata(ctrlr);
#endif

        /*
         * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
         *  controller supports.
         */
        if (ctrlr->cdata.mdts > 0)
                ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
                    ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));

        return (0);
}

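/*
 * Ask the controller for the desired number of I/O queue pairs, and fall
 *  back to a single I/O queue if it grants fewer than requested.
 */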
static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
        struct mtx              *mtx;
        struct nvme_completion  cpl;
        int                     cq_allocated, sq_allocated, status;

        mtx = mtx_pool_find(mtxpool_sleep, &cpl);

        mtx_lock(mtx);
        nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
            nvme_ctrlr_cb, &cpl);
        status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
        mtx_unlock(mtx);
        if ((status != 0) || nvme_completion_is_error(&cpl)) {
                printf("nvme_set_num_queues failed!\n");
                return (ENXIO);
        }

        /*
         * Data in cdw0 is 0-based.
         * Lower 16-bits indicate number of submission queues allocated.
         * Upper 16-bits indicate number of completion queues allocated.
         */
        sq_allocated = (cpl.cdw0 & 0xFFFF) + 1;
        cq_allocated = (cpl.cdw0 >> 16) + 1;

        /*
         * Check that the controller was able to allocate the number of
         *  queues we requested.  If not, revert to one IO queue.
         */
        if (sq_allocated < ctrlr->num_io_queues ||
            cq_allocated < ctrlr->num_io_queues) {
                ctrlr->num_io_queues = 1;
                ctrlr->per_cpu_io_queues = 0;

                /*
                 * TODO: destroy extra queues that were created
                 *  previously but now found to be not needed.
                 */
        }

        return (0);
}

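/*
 * Create the I/O completion and submission queues on the controller, one
 *  pair at a time, using synchronous admin commands.
 */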
static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
        struct mtx              *mtx;
        struct nvme_qpair       *qpair;
        struct nvme_completion  cpl;
        int                     i, status;

        mtx = mtx_pool_find(mtxpool_sleep, &cpl);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                qpair = &ctrlr->ioq[i];

                mtx_lock(mtx);
                nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
                    nvme_ctrlr_cb, &cpl);
                status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
                mtx_unlock(mtx);
                if ((status != 0) || nvme_completion_is_error(&cpl)) {
                        printf("nvme_create_io_cq failed!\n");
                        return (ENXIO);
                }

                mtx_lock(mtx);
                nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
                    nvme_ctrlr_cb, &cpl);
                status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
                mtx_unlock(mtx);
                if ((status != 0) || nvme_completion_is_error(&cpl)) {
                        printf("nvme_create_io_sq failed!\n");
                        return (ENXIO);
                }
        }

        return (0);
}

static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
        struct nvme_namespace   *ns;
        int                     i, status;

        for (i = 0; i < ctrlr->cdata.nn; i++) {
                ns = &ctrlr->ns[i];
                status = nvme_ns_construct(ns, i+1, ctrlr);
                if (status != 0)
                        return (status);
        }

        return (0);
}

static boolean_t
is_log_page_id_valid(uint8_t page_id)
{

        switch (page_id) {
        case NVME_LOG_ERROR:
        case NVME_LOG_HEALTH_INFORMATION:
        case NVME_LOG_FIRMWARE_SLOT:
                return (TRUE);
        }

        return (FALSE);
}

static uint32_t
nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
{
        uint32_t        log_page_size;

        switch (page_id) {
        case NVME_LOG_ERROR:
                log_page_size = min(
                    sizeof(struct nvme_error_information_entry) *
                    ctrlr->cdata.elpe,
                    NVME_MAX_AER_LOG_SIZE);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                log_page_size = sizeof(struct nvme_health_information_page);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                log_page_size = sizeof(struct nvme_firmware_page);
                break;
        default:
                log_page_size = 0;
                break;
        }

        return (log_page_size);
}

static void
nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
{
        struct nvme_async_event_request *aer = arg;

        /*
         * If the log page fetch for some reason completed with an error,
         *  don't pass log page data to the consumers.  In practice, this case
         *  should never happen.
         */
        if (nvme_completion_is_error(cpl))
                nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
                    aer->log_page_id, NULL, 0);
        else
                /*
                 * Pass the cpl data from the original async event completion,
                 *  not the log page fetch.
                 */
                nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
                    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);

        /*
         * Repost another asynchronous event request to replace the one
         *  that just completed.
         */
        nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
}

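/*
 * Completion callback for asynchronous event requests: fetch the associated
 *  log page if the event reports a valid one, otherwise notify consumers
 *  and repost the request immediately.
 */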
static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
        struct nvme_async_event_request *aer = arg;

        if (cpl->status.sc == NVME_SC_ABORTED_SQ_DELETION) {
                /*
                 *  This is simulated when the controller is being shut down,
                 *  to effectively abort outstanding asynchronous event
                 *  requests and make sure all memory is freed.  Do not
                 *  repost the request in this case.
                 */
                return;
        }

        printf("Asynchronous event occurred.\n");

        /* Associated log page is in bits 23:16 of completion entry dw0. */
        aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;

        if (is_log_page_id_valid(aer->log_page_id)) {
                aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
                    aer->log_page_id);
                memcpy(&aer->cpl, cpl, sizeof(*cpl));
                nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
                    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
                    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
                    aer);
                /* Wait to notify consumers until after log page is fetched. */
        } else {
                nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
                    NULL, 0);

                /*
                 * Repost another asynchronous event request to replace the one
                 *  that just completed.
                 */
                nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
        }
}

static void
nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
    struct nvme_async_event_request *aer)
{
        struct nvme_request *req;

        aer->ctrlr = ctrlr;
        req = nvme_allocate_request(NULL, 0, nvme_ctrlr_async_event_cb, aer);
        aer->req = req;

        /*
         * Disable timeout here, since asynchronous event requests should by
         *  nature never be timed out.
         */
        req->timeout = FALSE;
        req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
        nvme_ctrlr_submit_admin_request(ctrlr, req);
}

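/*
 * Enable asynchronous event notifications for all critical warning types
 *  and post the initial asynchronous event requests, bounded by the
 *  controller's AERL value (none at all on Chatham).
 */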
static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
        union nvme_critical_warning_state       state;
        struct nvme_async_event_request         *aer;
        uint32_t                                i;

        state.raw = 0xFF;
        state.bits.reserved = 0;
        nvme_ctrlr_cmd_set_async_event_config(ctrlr, state, NULL, NULL);

        /* aerl is a zero-based value, so we need to add 1 here. */
        ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

        /* Chatham doesn't support AERs. */
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                ctrlr->num_aers = 0;

        for (i = 0; i < ctrlr->num_aers; i++) {
                aer = &ctrlr->aer[i];
                nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
        }
}

static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

        ctrlr->int_coal_time = 0;
        TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
            &ctrlr->int_coal_time);

        ctrlr->int_coal_threshold = 0;
        TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
            &ctrlr->int_coal_threshold);

        nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
            ctrlr->int_coal_threshold, NULL, NULL);
}

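/*
 * Bring the controller to a fully operational state: reset and enable the
 *  queue pairs, identify the controller, negotiate and create the I/O
 *  queues, construct namespaces, and configure AERs and interrupt
 *  coalescing.  Called both at attach time and after a controller reset.
 */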
static void
nvme_ctrlr_start(void *ctrlr_arg)
{
        struct nvme_controller *ctrlr = ctrlr_arg;
        int i;

        nvme_qpair_reset(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_qpair_reset(&ctrlr->ioq[i]);

        nvme_admin_qpair_enable(&ctrlr->adminq);

        if (nvme_ctrlr_identify(ctrlr) != 0)
                return;

        if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
                return;

        if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
                return;

        if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
                return;

        nvme_ctrlr_configure_aer(ctrlr);
        nvme_ctrlr_configure_int_coalescing(ctrlr);

        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_io_qpair_enable(&ctrlr->ioq[i]);
}

void
nvme_ctrlr_start_config_hook(void *arg)
{
        struct nvme_controller *ctrlr = arg;

        nvme_ctrlr_start(ctrlr);
        config_intrhook_disestablish(&ctrlr->config_hook);
}

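/*
 * Taskqueue handler for controller resets: perform the hardware reset,
 *  restart the controller if the reset succeeded, and clear the
 *  is_resetting flag so future resets can be scheduled.
 */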
static void
nvme_ctrlr_reset_task(void *arg, int pending)
{
        struct nvme_controller  *ctrlr = arg;
        int                     status;

        device_printf(ctrlr->dev, "resetting controller\n");
        status = nvme_ctrlr_hw_reset(ctrlr);
        /*
         * Use pause instead of DELAY, so that we yield to any nvme interrupt
         *  handlers on this CPU that were blocked on a qpair lock. We want
         *  all nvme interrupts completed before proceeding with restarting the
         *  controller.
         *
         * XXX - any way to guarantee the interrupt handlers have quiesced?
         */
        pause("nvmereset", hz / 10);
        if (status == 0)
                nvme_ctrlr_start(ctrlr);

        atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}

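/*
 * Legacy (INTx) interrupt handler: mask the interrupt, process completions
 *  on the admin and first I/O queues, then unmask.
 */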
static void
nvme_ctrlr_intx_handler(void *arg)
{
        struct nvme_controller *ctrlr = arg;

        nvme_mmio_write_4(ctrlr, intms, 1);

        nvme_qpair_process_completions(&ctrlr->adminq);

        if (ctrlr->ioq[0].cpl)
                nvme_qpair_process_completions(&ctrlr->ioq[0]);

        nvme_mmio_write_4(ctrlr, intmc, 1);
}

static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

        ctrlr->num_io_queues = 1;
        ctrlr->per_cpu_io_queues = 0;
        ctrlr->rid = 0;
        ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
            &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

        if (ctrlr->res == NULL) {
                device_printf(ctrlr->dev, "unable to allocate shared IRQ\n");
                return (ENOMEM);
        }

        bus_setup_intr(ctrlr->dev, ctrlr->res,
            INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
            ctrlr, &ctrlr->tag);

        if (ctrlr->tag == NULL) {
                device_printf(ctrlr->dev,
                    "unable to setup legacy interrupt handler\n");
                return (ENOMEM);
        }

        return (0);
}

static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
        struct nvme_controller  *ctrlr;
        struct nvme_completion  cpl;
        struct mtx              *mtx;

        ctrlr = cdev->si_drv1;

        switch (cmd) {
        case NVME_IDENTIFY_CONTROLLER:
#ifdef CHATHAM2
                /*
                 * Don't refresh data on Chatham, since Chatham returns
                 *  garbage on IDENTIFY anyway.
                 */
                if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
                        memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
                        break;
                }
#endif
                /* Refresh data before returning to user. */
                mtx = mtx_pool_find(mtxpool_sleep, &cpl);
                mtx_lock(mtx);
                nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
                    nvme_ctrlr_cb, &cpl);
                msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
                mtx_unlock(mtx);
                if (nvme_completion_is_error(&cpl))
                        return (ENXIO);
                memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
                break;
        case NVME_RESET_CONTROLLER:
                nvme_ctrlr_reset(ctrlr);
                break;
        default:
                return (ENOTTY);
        }

        return (0);
}

static struct cdevsw nvme_ctrlr_cdevsw = {
        .d_version =    D_VERSION,
        .d_flags =      0,
        .d_ioctl =      nvme_ctrlr_ioctl
};

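/*
 * One-time controller initialization at attach: map the BARs, read the
 *  capability registers, fetch tunables, set up MSI-X or INTx interrupts,
 *  construct the admin and I/O queue pairs, create the character device,
 *  and set up the reset taskqueue.
 */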
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
        union cap_lo_register   cap_lo;
        union cap_hi_register   cap_hi;
        int                     num_vectors, per_cpu_io_queues, status = 0;
        int                     timeout_period;

        ctrlr->dev = dev;

        status = nvme_ctrlr_allocate_bar(ctrlr);

        if (status != 0)
                return (status);

#ifdef CHATHAM2
        if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
                status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
                if (status != 0)
                        return (status);
                nvme_ctrlr_setup_chatham(ctrlr);
        }
#endif

        /*
         * Software emulators may set the doorbell stride to something
         *  other than zero, but this driver is not set up to handle that.
         */
        cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
        if (cap_hi.bits.dstrd != 0)
                return (ENXIO);

        ctrlr->min_page_size = 1 << (12 + cap_hi.bits.mpsmin);

        /* Get ready timeout value from controller, in units of 500ms. */
        cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
        ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

        timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
        TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
        timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
        timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
        ctrlr->timeout_period = timeout_period;

        nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
        TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);

        per_cpu_io_queues = 1;
        TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
        ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;

        if (ctrlr->per_cpu_io_queues)
                ctrlr->num_io_queues = mp_ncpus;
        else
                ctrlr->num_io_queues = 1;

        ctrlr->force_intx = 0;
        TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

        ctrlr->enable_aborts = 0;
        TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);

        ctrlr->msix_enabled = 1;

        if (ctrlr->force_intx) {
                ctrlr->msix_enabled = 0;
                goto intx;
        }

        /* One vector per IO queue, plus one vector for admin queue. */
        num_vectors = ctrlr->num_io_queues + 1;

        if (pci_msix_count(dev) < num_vectors) {
                ctrlr->msix_enabled = 0;
                goto intx;
        }

        if (pci_alloc_msix(dev, &num_vectors) != 0)
                ctrlr->msix_enabled = 0;

intx:

        if (!ctrlr->msix_enabled)
                nvme_ctrlr_configure_intx(ctrlr);

        nvme_ctrlr_construct_admin_qpair(ctrlr);

        status = nvme_ctrlr_construct_io_qpairs(ctrlr);

        if (status != 0)
                return (status);

        ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
            "nvme%d", device_get_unit(dev));

        if (ctrlr->cdev == NULL)
                return (ENXIO);

        ctrlr->cdev->si_drv1 = (void *)ctrlr;

        TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
        ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
            taskqueue_thread_enqueue, &ctrlr->taskqueue);
        taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");

        ctrlr->is_resetting = 0;

        return (0);
}

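/*
 * Tear down everything set up in nvme_ctrlr_construct: the taskqueue,
 *  namespaces, character device, queue pairs, BAR mappings, and interrupt
 *  resources.
 */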
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
        int                             i;

        taskqueue_free(ctrlr->taskqueue);

        for (i = 0; i < NVME_MAX_NAMESPACES; i++)
                nvme_ns_destruct(&ctrlr->ns[i]);

        if (ctrlr->cdev)
                destroy_dev(ctrlr->cdev);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                nvme_io_qpair_destroy(&ctrlr->ioq[i]);
        }

        free(ctrlr->ioq, M_NVME);

        nvme_admin_qpair_destroy(&ctrlr->adminq);

        if (ctrlr->resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->resource_id, ctrlr->resource);
        }

        if (ctrlr->bar4_resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
        }

#ifdef CHATHAM2
        if (ctrlr->chatham_resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->chatham_resource_id, ctrlr->chatham_resource);
        }
#endif

        if (ctrlr->tag)
                bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);

        if (ctrlr->res)
                bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
                    rman_get_rid(ctrlr->res), ctrlr->res);

        if (ctrlr->msix_enabled)
                pci_release_msi(dev);
}

void
nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

        nvme_qpair_submit_request(&ctrlr->adminq, req);
}

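/*
 * Submit an I/O request on the queue pair associated with the current CPU
 *  when per-CPU queues are enabled, otherwise on the single shared I/O
 *  queue pair.
 */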
void
nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{
        struct nvme_qpair       *qpair;

        if (ctrlr->per_cpu_io_queues)
                qpair = &ctrlr->ioq[curcpu];
        else
                qpair = &ctrlr->ioq[0];

        nvme_qpair_submit_request(qpair, req);
}

device_t
nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
{

        return (ctrlr->dev);
}

const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
{

        return (&ctrlr->cdata);
}