/*-
 * Copyright (C) 2012 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"

static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
                                                struct nvme_async_event_request *aer);

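/*
 * Map the controller's memory-mapped register BAR and, if present, the
 *  separate BAR that may hold the MSI-X table.
 */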
static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

        /* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                ctrlr->resource_id = PCIR_BAR(2);
        else
                ctrlr->resource_id = PCIR_BAR(0);

        ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
            &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);

        if (ctrlr->resource == NULL) {
                nvme_printf(ctrlr, "unable to allocate pci resource\n");
                return (ENOMEM);
        }

        ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
        ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
        ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

        /*
         * The NVMe spec allows for the MSI-X table to be placed behind
         *  BAR 4/5, separate from the control/doorbell registers.  Always
         *  try to map this bar, because it must be mapped prior to calling
         *  pci_alloc_msix().  If the table isn't behind BAR 4/5,
         *  bus_alloc_resource() will just return NULL which is OK.
         */
        ctrlr->bar4_resource_id = PCIR_BAR(4);
        ctrlr->bar4_resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
            &ctrlr->bar4_resource_id, 0, ~0, 1, RF_ACTIVE);

        return (0);
}

#ifdef CHATHAM2
static int
nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
{

        ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
        ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
            SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
            RF_ACTIVE);

        if (ctrlr->chatham_resource == NULL) {
                nvme_printf(ctrlr, "unable to alloc pci resource\n");
                return (ENOMEM);
        }

        ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
        ctrlr->chatham_bus_handle =
            rman_get_bushandle(ctrlr->chatham_resource);

        return (0);
}

static void
nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
{
        uint64_t reg1, reg2, reg3;
        uint64_t temp1, temp2;
        uint32_t temp3;
        uint32_t use_flash_timings = 0;

        DELAY(10000);

        temp3 = chatham_read_4(ctrlr, 0x8080);

        device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);

        ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
        ctrlr->chatham_size = ctrlr->chatham_lbas * 512;

        device_printf(ctrlr->dev, "Chatham size: %jd\n",
            (intmax_t)ctrlr->chatham_size);

        reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;

        TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
        if (use_flash_timings) {
                device_printf(ctrlr->dev, "Chatham: using flash timings\n");
                temp1 = 0x00001b58000007d0LL;
                temp2 = 0x000000cb00000131LL;
        } else {
                device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
                temp1 = temp2 = 0x0LL;
        }

        chatham_write_8(ctrlr, 0x8000, reg1);
        chatham_write_8(ctrlr, 0x8008, reg2);
        chatham_write_8(ctrlr, 0x8010, reg3);

        chatham_write_8(ctrlr, 0x8020, temp1);
        temp3 = chatham_read_4(ctrlr, 0x8020);

        chatham_write_8(ctrlr, 0x8028, temp2);
        temp3 = chatham_read_4(ctrlr, 0x8028);

        chatham_write_8(ctrlr, 0x8030, temp1);
        chatham_write_8(ctrlr, 0x8038, temp2);
        chatham_write_8(ctrlr, 0x8040, temp1);
        chatham_write_8(ctrlr, 0x8048, temp2);
        chatham_write_8(ctrlr, 0x8050, temp1);
        chatham_write_8(ctrlr, 0x8058, temp2);

        DELAY(10000);
}

static void
nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
{
        struct nvme_controller_data *cdata;

        cdata = &ctrlr->cdata;

        cdata->vid = 0x8086;
        cdata->ssvid = 0x2011;

        /*
         * Chatham2 puts garbage data in these fields when we
         *  invoke IDENTIFY_CONTROLLER, so we need to re-zero
         *  the fields before copying in the replacement values.
         */
        memset(cdata->sn, 0, sizeof(cdata->sn));
        memcpy(cdata->sn, "2012", strlen("2012"));
        memset(cdata->mn, 0, sizeof(cdata->mn));
        memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
        memset(cdata->fr, 0, sizeof(cdata->fr));
        memcpy(cdata->fr, "0", strlen("0"));
        cdata->rab = 8;
        cdata->aerl = 3;
        cdata->lpa.ns_smart = 1;
        cdata->sqes.min = 6;
        cdata->sqes.max = 6;
        cdata->cqes.min = 4;
        cdata->cqes.max = 4;
        cdata->nn = 1;

        /* Chatham2 doesn't support the DSM command. */
        cdata->oncs.dsm = 0;

        cdata->vwc.present = 1;
}
#endif /* CHATHAM2 */

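/*
 * Construct the admin queue pair, sizing it from the hw.nvme.admin_entries
 *  tunable (clamped to the driver's supported range).
 */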
static void
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
        struct nvme_qpair       *qpair;
        uint32_t                num_entries;

        qpair = &ctrlr->adminq;

        num_entries = NVME_ADMIN_ENTRIES;
        TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
        /*
         * If admin_entries was overridden to an invalid value, revert it
         *  back to our default value.
         */
        if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
            num_entries > NVME_MAX_ADMIN_ENTRIES) {
                nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
                    "specified\n", num_entries);
                num_entries = NVME_ADMIN_ENTRIES;
        }

        /*
         * The admin queue's max xfer size is treated differently than the
         *  max I/O xfer size.  16KB is sufficient here - maybe even less?
         */
        nvme_qpair_construct(qpair,
                             0, /* qpair ID */
                             0, /* vector */
                             num_entries,
                             NVME_ADMIN_TRACKERS,
                             16*1024, /* max xfer size */
                             ctrlr);
}

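/*
 * Construct all I/O queue pairs, honoring the hw.nvme.io_entries,
 *  hw.nvme.io_trackers and hw.nvme.max_xfer_size tunables as well as the
 *  queue size limit (MQES) advertised by the controller.
 */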
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
        struct nvme_qpair       *qpair;
        union cap_lo_register   cap_lo;
        int                     i, num_entries, num_trackers;

        num_entries = NVME_IO_ENTRIES;
        TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

        /*
         * NVMe spec sets a hard limit of 64K max entries, but
         *  devices may specify a smaller limit, so we need to check
         *  the MQES field in the capabilities register.
         */
        cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
        num_entries = min(num_entries, cap_lo.bits.mqes+1);

        num_trackers = NVME_IO_TRACKERS;
        TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);

        num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
        num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
        /*
         * No need to have more trackers than entries in the submit queue.
         *  Note also that for a queue size of N, we can only have (N-1)
         *  commands outstanding, hence the "-1" here.
         */
        num_trackers = min(num_trackers, (num_entries-1));

        ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
        TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
        /*
         * Check that tunable doesn't specify a size greater than what our
         *  driver supports, and is an even PAGE_SIZE multiple.
         */
        if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
            ctrlr->max_xfer_size % PAGE_SIZE)
                ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;

        ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
            M_NVME, M_ZERO | M_WAITOK);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                qpair = &ctrlr->ioq[i];

                /*
                 * Admin queue has ID=0. IO queues start at ID=1 -
                 *  hence the 'i+1' here.
                 *
                 * For I/O queues, use the controller-wide max_xfer_size
                 *  calculated in nvme_attach().
                 */
                nvme_qpair_construct(qpair,
                                     i+1, /* qpair ID */
                                     ctrlr->msix_enabled ? i+1 : 0, /* vector */
                                     num_entries,
                                     num_trackers,
                                     ctrlr->max_xfer_size,
                                     ctrlr);

                if (ctrlr->per_cpu_io_queues)
                        bus_bind_intr(ctrlr->dev, qpair->res, i);
        }

        return (0);
}

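/*
 * Mark the controller as failed, fail all of its queue pairs, and notify
 *  any registered consumers of the failure.
 */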
static void
nvme_ctrlr_fail(struct nvme_controller *ctrlr)
{
        int i;

        ctrlr->is_failed = TRUE;
        nvme_qpair_fail(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_qpair_fail(&ctrlr->ioq[i]);
        nvme_notify_fail_consumers(ctrlr);
}

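/*
 * Queue a request for deferred failure completion via the controller's
 *  taskqueue.
 */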
void
nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

        mtx_lock(&ctrlr->fail_req_lock);
        STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
        mtx_unlock(&ctrlr->fail_req_lock);
        taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
}

static void
nvme_ctrlr_fail_req_task(void *arg, int pending)
{
        struct nvme_controller  *ctrlr = arg;
        struct nvme_request     *req;

        mtx_lock(&ctrlr->fail_req_lock);
        while (!STAILQ_EMPTY(&ctrlr->fail_req)) {
                req = STAILQ_FIRST(&ctrlr->fail_req);
                STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
                nvme_qpair_manual_complete_request(req->qpair, req,
                    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE);
        }
        mtx_unlock(&ctrlr->fail_req_lock);
}

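/*
 * Poll CSTS.RDY until the controller reports ready, or return ENXIO if the
 *  controller-specified ready timeout expires first.
 */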
static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
{
        int ms_waited;
        union cc_register cc;
        union csts_register csts;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (!cc.bits.en) {
                nvme_printf(ctrlr, "%s called with cc.en = 0\n", __func__);
                return (ENXIO);
        }

        ms_waited = 0;

        while (!csts.bits.rdy) {
                DELAY(1000);
                if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
                        nvme_printf(ctrlr, "controller did not become ready "
                            "within %d ms\n", ctrlr->ready_timeout_in_ms);
                        return (ENXIO);
                }
                csts.raw = nvme_mmio_read_4(ctrlr, csts);
        }

        return (0);
}

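/*
 * Clear CC.EN to disable the controller, waiting for a pending enable to
 *  complete first if necessary.
 */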
static void
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
        union cc_register cc;
        union csts_register csts;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (cc.bits.en == 1 && csts.bits.rdy == 0)
                nvme_ctrlr_wait_for_ready(ctrlr);

        cc.bits.en = 0;
        nvme_mmio_write_4(ctrlr, cc, cc.raw);
        DELAY(5000);
}

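/*
 * Program the admin queue addresses and attributes, then set CC.EN and wait
 *  for the controller to report ready.
 */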
static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
        union cc_register       cc;
        union csts_register     csts;
        union aqa_register      aqa;

        cc.raw = nvme_mmio_read_4(ctrlr, cc);
        csts.raw = nvme_mmio_read_4(ctrlr, csts);

        if (cc.bits.en == 1) {
                if (csts.bits.rdy == 1)
                        return (0);
                else
                        return (nvme_ctrlr_wait_for_ready(ctrlr));
        }

        nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
        DELAY(5000);
        nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
        DELAY(5000);

        aqa.raw = 0;
        /* acqs and asqs are 0-based. */
        aqa.bits.acqs = ctrlr->adminq.num_entries-1;
        aqa.bits.asqs = ctrlr->adminq.num_entries-1;
        nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
        DELAY(5000);

        cc.bits.en = 1;
        cc.bits.css = 0;
        cc.bits.ams = 0;
        cc.bits.shn = 0;
        cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
        cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */

        /*
         * MPS encodes a memory page size of 2^(12+MPS) bytes, so for a
         *  4KB PAGE_SIZE this evaluates to 0, as the spec requires.
         */
        cc.bits.mps = (PAGE_SIZE >> 13);

        nvme_mmio_write_4(ctrlr, cc, cc.raw);
        DELAY(5000);

        return (nvme_ctrlr_wait_for_ready(ctrlr));
}

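/*
 * Perform a controller-level reset: disable all queue pairs, then disable
 *  and re-enable the controller hardware.
 */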
int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
        int i;

        nvme_admin_qpair_disable(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_io_qpair_disable(&ctrlr->ioq[i]);

        DELAY(100*1000);

        nvme_ctrlr_disable(ctrlr);
        return (nvme_ctrlr_enable(ctrlr));
}

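/*
 * Schedule an asynchronous controller reset, unless a reset is already in
 *  progress or the controller has already been failed.
 */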
void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
        int cmpset;

        cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);

        if (cmpset == 0 || ctrlr->is_failed)
                /*
                 * Controller is already resetting or has failed.  Return
                 *  immediately since there is no need to kick off another
                 *  reset in these cases.
                 */
                return;

        taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}

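/*
 * Issue IDENTIFY CONTROLLER and poll for completion, then adjust
 *  max_xfer_size based on the reported MDTS value.
 */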
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
        struct nvme_completion_poll_status      status;

        status.done = FALSE;
        nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
            nvme_completion_poll_cb, &status);
        while (status.done == FALSE)
                DELAY(5);
        if (nvme_completion_is_error(&status.cpl)) {
                nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
                return (ENXIO);
        }

#ifdef CHATHAM2
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                nvme_chatham_populate_cdata(ctrlr);
#endif

        /*
         * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
         *  controller supports.
         */
        if (ctrlr->cdata.mdts > 0)
                ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
                    ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));

        return (0);
}

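/*
 * Request the desired number of I/O queue pairs from the controller via
 *  SET FEATURES (Number of Queues), falling back to a single queue pair if
 *  the controller grants fewer than requested.
 */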
static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
        struct nvme_completion_poll_status      status;
        int                                     cq_allocated, i, sq_allocated;

        status.done = FALSE;
        nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
            nvme_completion_poll_cb, &status);
        while (status.done == FALSE)
                DELAY(5);
        if (nvme_completion_is_error(&status.cpl)) {
                nvme_printf(ctrlr, "nvme_set_num_queues failed!\n");
                return (ENXIO);
        }

        /*
         * Data in cdw0 is 0-based.
         * Lower 16-bits indicate number of submission queues allocated.
         * Upper 16-bits indicate number of completion queues allocated.
         */
        sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
        cq_allocated = (status.cpl.cdw0 >> 16) + 1;

        /*
         * Check that the controller was able to allocate the number of
         *  queues we requested.  If not, revert to one IO queue pair.
         */
        if (sq_allocated < ctrlr->num_io_queues ||
            cq_allocated < ctrlr->num_io_queues) {

                /*
                 * Destroy extra IO queue pairs that were created at
                 *  controller construction time but are no longer
                 *  needed.  This will only happen when a controller
                 *  supports fewer queues than MSI-X vectors.  This
                 *  is not the normal case, but does occur with the
                 *  Chatham prototype board.
                 */
                for (i = 1; i < ctrlr->num_io_queues; i++)
                        nvme_io_qpair_destroy(&ctrlr->ioq[i]);

                ctrlr->num_io_queues = 1;
                ctrlr->per_cpu_io_queues = 0;
        }

        return (0);
}

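/*
 * Create the I/O completion and submission queues on the controller, one
 *  pair at a time, polling each admin command for completion.
 */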
static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
        struct nvme_completion_poll_status      status;
        struct nvme_qpair                       *qpair;
        int                                     i;

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                qpair = &ctrlr->ioq[i];

                status.done = FALSE;
                nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
                    nvme_completion_poll_cb, &status);
                while (status.done == FALSE)
                        DELAY(5);
                if (nvme_completion_is_error(&status.cpl)) {
                        nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
                        return (ENXIO);
                }

                status.done = FALSE;
                nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
                    nvme_completion_poll_cb, &status);
                while (status.done == FALSE)
                        DELAY(5);
                if (nvme_completion_is_error(&status.cpl)) {
                        nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
                        return (ENXIO);
                }
        }

        return (0);
}

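/*
 * Construct a namespace structure for each namespace reported by
 *  IDENTIFY CONTROLLER (namespace IDs are 1-based).
 */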
static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
        struct nvme_namespace   *ns;
        int                     i, status;

        for (i = 0; i < ctrlr->cdata.nn; i++) {
                ns = &ctrlr->ns[i];
                status = nvme_ns_construct(ns, i+1, ctrlr);
                if (status != 0)
                        return (status);
        }

        return (0);
}

static boolean_t
is_log_page_id_valid(uint8_t page_id)
{

        switch (page_id) {
        case NVME_LOG_ERROR:
        case NVME_LOG_HEALTH_INFORMATION:
        case NVME_LOG_FIRMWARE_SLOT:
                return (TRUE);
        }

        return (FALSE);
}

static uint32_t
nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
{
        uint32_t        log_page_size;

        switch (page_id) {
        case NVME_LOG_ERROR:
                log_page_size = min(
                    sizeof(struct nvme_error_information_entry) *
                    ctrlr->cdata.elpe,
                    NVME_MAX_AER_LOG_SIZE);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                log_page_size = sizeof(struct nvme_health_information_page);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                log_page_size = sizeof(struct nvme_firmware_page);
                break;
        default:
                log_page_size = 0;
                break;
        }

        return (log_page_size);
}

static void
nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
{
        struct nvme_async_event_request *aer = arg;

        /*
         * If the log page fetch for some reason completed with an error,
         *  don't pass log page data to the consumers.  In practice, this case
         *  should never happen.
         */
        if (nvme_completion_is_error(cpl))
                nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
                    aer->log_page_id, NULL, 0);
        else
                /*
                 * Pass the cpl data from the original async event completion,
                 *  not the log page fetch.
                 */
                nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
                    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);

        /*
         * Repost another asynchronous event request to replace the one
         *  that just completed.
         */
        nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
}

static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
        struct nvme_async_event_request *aer = arg;

        if (cpl->status.sc == NVME_SC_ABORTED_SQ_DELETION) {
                /*
                 *  This is simulated when controller is being shut down, to
                 *  effectively abort outstanding asynchronous event requests
                 *  and make sure all memory is freed.  Do not repost the
                 *  request in this case.
                 */
                return;
        }

        /* Associated log page is in bits 23:16 of completion entry dw0. */
        aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;

        nvme_printf(aer->ctrlr, "async event occurred (log page id=0x%x)\n",
            aer->log_page_id);

        if (is_log_page_id_valid(aer->log_page_id)) {
                aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
                    aer->log_page_id);
                memcpy(&aer->cpl, cpl, sizeof(*cpl));
                nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
                    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
                    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
                    aer);
                /* Wait to notify consumers until after log page is fetched. */
        } else {
                nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
                    NULL, 0);

                /*
                 * Repost another asynchronous event request to replace the one
                 *  that just completed.
                 */
                nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
        }
}

static void
nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
    struct nvme_async_event_request *aer)
{
        struct nvme_request *req;

        aer->ctrlr = ctrlr;
        req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
        aer->req = req;

        /*
         * Disable timeout here, since asynchronous event requests should by
         *  nature never be timed out.
         */
        req->timeout = FALSE;
        req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
        nvme_ctrlr_submit_admin_request(ctrlr, req);
}

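/*
 * Enable notification of all critical warning states, then post as many
 *  asynchronous event requests as the controller supports (per cdata.aerl).
 */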
static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
        union nvme_critical_warning_state       state;
        struct nvme_async_event_request         *aer;
        uint32_t                                i;

        state.raw = 0xFF;
        state.bits.reserved = 0;
        nvme_ctrlr_cmd_set_async_event_config(ctrlr, state, NULL, NULL);

        /* aerl is a zero-based value, so we need to add 1 here. */
        ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

        /* Chatham doesn't support AERs. */
        if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
                ctrlr->num_aers = 0;

        for (i = 0; i < ctrlr->num_aers; i++) {
                aer = &ctrlr->aer[i];
                nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
        }
}

static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

        ctrlr->int_coal_time = 0;
        TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
            &ctrlr->int_coal_time);

        ctrlr->int_coal_threshold = 0;
        TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
            &ctrlr->int_coal_threshold);

        nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
            ctrlr->int_coal_threshold, NULL, NULL);
}

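/*
 * Bring the controller to a fully operational state: reset and enable the
 *  queue pairs, identify the controller, set up the I/O queues and
 *  namespaces, and configure asynchronous events and interrupt coalescing.
 */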
static void
nvme_ctrlr_start(void *ctrlr_arg)
{
        struct nvme_controller *ctrlr = ctrlr_arg;
        int i;

        nvme_qpair_reset(&ctrlr->adminq);
        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_qpair_reset(&ctrlr->ioq[i]);

        nvme_admin_qpair_enable(&ctrlr->adminq);

        if (nvme_ctrlr_identify(ctrlr) != 0) {
                nvme_ctrlr_fail(ctrlr);
                return;
        }

        if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
                nvme_ctrlr_fail(ctrlr);
                return;
        }

        if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
                nvme_ctrlr_fail(ctrlr);
                return;
        }

        if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
                nvme_ctrlr_fail(ctrlr);
                return;
        }

        nvme_ctrlr_configure_aer(ctrlr);
        nvme_ctrlr_configure_int_coalescing(ctrlr);

        for (i = 0; i < ctrlr->num_io_queues; i++)
                nvme_io_qpair_enable(&ctrlr->ioq[i]);

        /*
         * Clear software progress marker to 0, to indicate to pre-boot
         *  software that OS driver load was successful.
         *
         * Chatham does not support this feature.
         */
        if (pci_get_devid(ctrlr->dev) != CHATHAM_PCI_ID)
                nvme_ctrlr_cmd_set_feature(ctrlr,
                    NVME_FEAT_SOFTWARE_PROGRESS_MARKER, 0, NULL, 0, NULL, NULL);
}

void
nvme_ctrlr_start_config_hook(void *arg)
{
        struct nvme_controller *ctrlr = arg;

        nvme_ctrlr_start(ctrlr);
        config_intrhook_disestablish(&ctrlr->config_hook);
}

static void
nvme_ctrlr_reset_task(void *arg, int pending)
{
        struct nvme_controller  *ctrlr = arg;
        int                     status;

        nvme_printf(ctrlr, "resetting controller\n");
        status = nvme_ctrlr_hw_reset(ctrlr);
        /*
         * Use pause instead of DELAY, so that we yield to any nvme interrupt
         *  handlers on this CPU that were blocked on a qpair lock. We want
         *  all nvme interrupts completed before proceeding with restarting the
         *  controller.
         *
         * XXX - any way to guarantee the interrupt handlers have quiesced?
         */
        pause("nvmereset", hz / 10);
        if (status == 0)
                nvme_ctrlr_start(ctrlr);
        else
                nvme_ctrlr_fail(ctrlr);

        atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}

static void
nvme_ctrlr_intx_handler(void *arg)
{
        struct nvme_controller *ctrlr = arg;

        nvme_mmio_write_4(ctrlr, intms, 1);

        nvme_qpair_process_completions(&ctrlr->adminq);

        if (ctrlr->ioq[0].cpl)
                nvme_qpair_process_completions(&ctrlr->ioq[0]);

        nvme_mmio_write_4(ctrlr, intmc, 1);
}

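/*
 * Fall back to a single shared INTx interrupt when MSI-X is unavailable or
 *  disabled via the hw.nvme.force_intx tunable.
 */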
static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

        ctrlr->num_io_queues = 1;
        ctrlr->per_cpu_io_queues = 0;
        ctrlr->rid = 0;
        ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
            &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

        if (ctrlr->res == NULL) {
                nvme_printf(ctrlr, "unable to allocate shared IRQ\n");
                return (ENOMEM);
        }

        bus_setup_intr(ctrlr->dev, ctrlr->res,
            INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
            ctrlr, &ctrlr->tag);

        if (ctrlr->tag == NULL) {
                nvme_printf(ctrlr, "unable to setup intx handler\n");
                return (ENOMEM);
        }

        return (0);
}

static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
        struct nvme_completion_poll_status      status;
        struct nvme_controller                  *ctrlr;

        ctrlr = cdev->si_drv1;

        switch (cmd) {
        case NVME_IDENTIFY_CONTROLLER:
#ifdef CHATHAM2
                /*
                 * Don't refresh data on Chatham, since Chatham returns
                 *  garbage on IDENTIFY anyways.
                 */
                if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
                        memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
                        break;
                }
#endif
                /* Refresh data before returning to user. */
                status.done = FALSE;
                nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
                    nvme_completion_poll_cb, &status);
                while (status.done == FALSE)
                        DELAY(5);
                if (nvme_completion_is_error(&status.cpl))
                        return (ENXIO);
                memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
                break;
        case NVME_RESET_CONTROLLER:
                nvme_ctrlr_reset(ctrlr);
                break;
        default:
                return (ENOTTY);
        }

        return (0);
}

static struct cdevsw nvme_ctrlr_cdevsw = {
        .d_version =    D_VERSION,
        .d_flags =      0,
        .d_ioctl =      nvme_ctrlr_ioctl
};

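/*
 * One-time controller construction during attach: map registers, read
 *  capabilities, process tunables, set up interrupts, construct the queue
 *  pairs, and create the per-controller character device and taskqueue.
 */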
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
        union cap_lo_register   cap_lo;
        union cap_hi_register   cap_hi;
        int                     num_vectors, per_cpu_io_queues, status = 0;
        int                     timeout_period;

        ctrlr->dev = dev;

        status = nvme_ctrlr_allocate_bar(ctrlr);

        if (status != 0)
                return (status);

#ifdef CHATHAM2
        if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
                status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
                if (status != 0)
                        return (status);
                nvme_ctrlr_setup_chatham(ctrlr);
        }
#endif

        /*
         * Software emulators may set the doorbell stride to something
         *  other than zero, but this driver is not set up to handle that.
         */
        cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
        if (cap_hi.bits.dstrd != 0)
                return (ENXIO);

        ctrlr->min_page_size = 1 << (12 + cap_hi.bits.mpsmin);

        /* Get ready timeout value from controller, in units of 500ms. */
        cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
        ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

        timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
        TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
        timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
        timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
        ctrlr->timeout_period = timeout_period;

        nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
        TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);

        per_cpu_io_queues = 1;
        TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
        ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;

        if (ctrlr->per_cpu_io_queues)
                ctrlr->num_io_queues = mp_ncpus;
        else
                ctrlr->num_io_queues = 1;

        ctrlr->force_intx = 0;
        TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

        ctrlr->enable_aborts = 0;
        TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);

        ctrlr->msix_enabled = 1;

        if (ctrlr->force_intx) {
                ctrlr->msix_enabled = 0;
                goto intx;
        }

        /* One vector per IO queue, plus one vector for admin queue. */
        num_vectors = ctrlr->num_io_queues + 1;

        if (pci_msix_count(dev) < num_vectors) {
                ctrlr->msix_enabled = 0;
                goto intx;
        }

        if (pci_alloc_msix(dev, &num_vectors) != 0)
                ctrlr->msix_enabled = 0;

intx:

        if (!ctrlr->msix_enabled)
                nvme_ctrlr_configure_intx(ctrlr);

        nvme_ctrlr_construct_admin_qpair(ctrlr);

        status = nvme_ctrlr_construct_io_qpairs(ctrlr);

        if (status != 0)
                return (status);

        ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
            "nvme%d", device_get_unit(dev));

        if (ctrlr->cdev == NULL)
                return (ENXIO);

        ctrlr->cdev->si_drv1 = (void *)ctrlr;

        ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
            taskqueue_thread_enqueue, &ctrlr->taskqueue);
        taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");

        ctrlr->is_resetting = 0;
        TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);

        TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
        mtx_init(&ctrlr->fail_req_lock, "nvme ctrlr fail req lock", NULL,
            MTX_DEF);
        STAILQ_INIT(&ctrlr->fail_req);
        ctrlr->is_failed = FALSE;

        return (0);
}

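/*
 * Tear down the controller: destroy namespaces and queue pairs, and release
 *  the character device, interrupt, and bus resources acquired during
 *  construction.
 */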
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
        int                             i;

        nvme_ctrlr_disable(ctrlr);
        taskqueue_free(ctrlr->taskqueue);

        for (i = 0; i < NVME_MAX_NAMESPACES; i++)
                nvme_ns_destruct(&ctrlr->ns[i]);

        if (ctrlr->cdev)
                destroy_dev(ctrlr->cdev);

        for (i = 0; i < ctrlr->num_io_queues; i++) {
                nvme_io_qpair_destroy(&ctrlr->ioq[i]);
        }

        free(ctrlr->ioq, M_NVME);

        nvme_admin_qpair_destroy(&ctrlr->adminq);

        if (ctrlr->resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->resource_id, ctrlr->resource);
        }

        if (ctrlr->bar4_resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
        }

#ifdef CHATHAM2
        if (ctrlr->chatham_resource != NULL) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                    ctrlr->chatham_resource_id, ctrlr->chatham_resource);
        }
#endif

        if (ctrlr->tag)
                bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);

        if (ctrlr->res)
                bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
                    rman_get_rid(ctrlr->res), ctrlr->res);

        if (ctrlr->msix_enabled)
                pci_release_msi(dev);
}

void
nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

        nvme_qpair_submit_request(&ctrlr->adminq, req);
}

void
nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{
        struct nvme_qpair       *qpair;

        if (ctrlr->per_cpu_io_queues)
                qpair = &ctrlr->ioq[curcpu];
        else
                qpair = &ctrlr->ioq[0];

        nvme_qpair_submit_request(qpair, req);
}

device_t
nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
{

        return (ctrlr->dev);
}

const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
{

        return (&ctrlr->cdata);
}