1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 /*
30  * bhyve PCIe-NVMe device emulation.
31  *
32  * options:
33  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
34  *
35  *  accepted devpath:
36  *    /dev/blockdev
37  *    /path/to/image
38  *    ram=size_in_MiB
39  *
40  *  maxq    = max number of queues
41  *  qsz     = max elements in each queue
42  *  ioslots = max number of concurrent io requests
43  *  sectsz  = sector size (defaults to blockif sector size)
44  *  ser     = serial number (20-chars max)
45  *
46  */
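/*
 * Example slot configuration (illustrative only -- the device path and
 * serial number below are placeholders, not defaults):
 *
 *   -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=4,qsz=512,ioslots=16,ser=NVMEEMU01
 */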
47
48 /* TODO:
49     - create async event for smart and log
50     - intr coalesce
51  */
52
53 #include <sys/cdefs.h>
54 __FBSDID("$FreeBSD$");
55
56 #include <sys/types.h>
57
58 #include <assert.h>
59 #include <pthread.h>
60 #include <semaphore.h>
61 #include <stdbool.h>
62 #include <stddef.h>
63 #include <stdint.h>
64 #include <stdio.h>
65 #include <stdlib.h>
66 #include <string.h>
67
68 #include <machine/atomic.h>
69 #include <machine/vmm.h>
70 #include <vmmapi.h>
71
72 #include <dev/nvme/nvme.h>
73
74 #include "bhyverun.h"
75 #include "block_if.h"
76 #include "pci_emul.h"
77
78
79 static int nvme_debug = 0;
80 #define DPRINTF(params) if (nvme_debug) printf params
81 #define WPRINTF(params) printf params
82
83 /* defaults; can be overridden */
84 #define NVME_MSIX_BAR           4
85
86 #define NVME_IOSLOTS            8
87
88 #define NVME_QUEUES             16
89 #define NVME_MAX_QENTRIES       2048
90
91 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
92 #define NVME_MAX_BLOCKIOVS      512
93
94 /* helpers */
95
96 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
97
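/*
 * Byte offsets of the memory-mapped controller registers at the start of
 * BAR 0; the per-queue doorbells follow at NVME_DOORBELL_OFFSET.
 */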
98 enum nvme_controller_register_offsets {
99         NVME_CR_CAP_LOW = 0x00,
100         NVME_CR_CAP_HI  = 0x04,
101         NVME_CR_VS      = 0x08,
102         NVME_CR_INTMS   = 0x0c,
103         NVME_CR_INTMC   = 0x10,
104         NVME_CR_CC      = 0x14,
105         NVME_CR_CSTS    = 0x1c,
106         NVME_CR_NSSR    = 0x20,
107         NVME_CR_AQA     = 0x24,
108         NVME_CR_ASQ_LOW = 0x28,
109         NVME_CR_ASQ_HI  = 0x2c,
110         NVME_CR_ACQ_LOW = 0x30,
111         NVME_CR_ACQ_HI  = 0x34,
112 };
113
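/*
 * CDW11 fields of the Create I/O Submission/Completion Queue commands:
 * PC = physically contiguous, IEN = interrupts enabled (CQ only),
 * IV = MSI-X interrupt vector (CQ only).
 */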
114 enum nvme_cmd_cdw11 {
115         NVME_CMD_CDW11_PC  = 0x0001,
116         NVME_CMD_CDW11_IEN = 0x0002,
117         NVME_CMD_CDW11_IV  = 0xFFFF0000,
118 };
119
120 #define NVME_CMD_GET_OPC(opc) \
121         ((opc) >> NVME_CMD_OPC_SHIFT & NVME_CMD_OPC_MASK)
122
123 #define NVME_CQ_INTEN   0x01
124 #define NVME_CQ_INTCOAL 0x02
125
126 struct nvme_completion_queue {
127         struct nvme_completion *qbase;
128         uint32_t        size;
129         uint16_t        tail; /* nvme progress */
130         uint16_t        head; /* guest progress */
131         uint16_t        intr_vec;
132         uint32_t        intr_en;
133         pthread_mutex_t mtx;
134 };
135
136 struct nvme_submission_queue {
137         struct nvme_command *qbase;
138         uint32_t        size;
139         uint16_t        head; /* nvme progress */
140         uint16_t        tail; /* guest progress */
141         uint16_t        cqid; /* completion queue id */
142         int             busy; /* queue is being processed */
143         int             qpriority;
144 };
145
146 enum nvme_storage_type {
147         NVME_STOR_BLOCKIF = 0,
148         NVME_STOR_RAM = 1,
149 };
150
151 struct pci_nvme_blockstore {
152         enum nvme_storage_type type;
153         void            *ctx;
154         uint64_t        size;
155         uint32_t        sectsz;
156         uint32_t        sectsz_bits;
157 };
158
159 struct pci_nvme_ioreq {
160         struct pci_nvme_softc *sc;
161         struct pci_nvme_ioreq *next;
162         struct nvme_submission_queue *nvme_sq;
163         uint16_t        sqid;
164
165         /* command information */
166         uint16_t        opc;
167         uint16_t        cid;
168         uint32_t        nsid;
169
170         uint64_t        prev_gpaddr;
171         size_t          prev_size;
172
173         /*
174          * lock if all iovs consumed (big IO);
175          * complete transaction before continuing
176          */
177         pthread_mutex_t mtx;
178         pthread_cond_t  cv;
179
180         struct blockif_req io_req;
181
182         /* pad to fit up to 512 page descriptors from guest IO request */
183         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
184 };
185
186 struct pci_nvme_softc {
187         struct pci_devinst *nsc_pi;
188
189         pthread_mutex_t mtx;
190
191         struct nvme_registers regs;
192
193         struct nvme_namespace_data  nsdata;
194         struct nvme_controller_data ctrldata;
195
196         struct pci_nvme_blockstore nvstore;
197
198         uint16_t        max_qentries; /* max entries per queue */
199         uint32_t        max_queues;
200         uint32_t        num_cqueues;
201         uint32_t        num_squeues;
202
203         struct pci_nvme_ioreq *ioreqs;
204         struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
205         uint32_t        pending_ios;
206         uint32_t        ioslots;
207         sem_t           iosemlock;
208
209         /* status and guest memory mapped queues */
210         struct nvme_completion_queue *compl_queues;
211         struct nvme_submission_queue *submit_queues;
212
213         /* controller features */
214         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
215         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
216         uint32_t        async_ev_config;         /* 0x0B: async event config */
217 };
218
219
220 static void pci_nvme_io_partial(struct blockif_req *br, int err);
221
222 /* Controller Configuration utils */
223 #define NVME_CC_GET_EN(cc) \
224         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
225 #define NVME_CC_GET_CSS(cc) \
226         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
227 #define NVME_CC_GET_SHN(cc) \
228         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
229 #define NVME_CC_GET_IOSQES(cc) \
230         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
231 #define NVME_CC_GET_IOCQES(cc) \
232         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
233
234 #define NVME_CC_WRITE_MASK \
235         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
236          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
237          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
238
239 #define NVME_CC_NEN_WRITE_MASK \
240         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
241          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
242          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
243
244 /* Controller Status utils */
245 #define NVME_CSTS_GET_RDY(sts) \
246         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
247
248 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
249
250 /* Completion Queue status word utils */
251 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
252 #define NVME_STATUS_MASK \
253         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
254          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
255
256 static __inline void
257 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
258 {
259
260         *status &= ~NVME_STATUS_MASK;
261         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
262                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
263 }
264
265 static __inline void
266 pci_nvme_status_genc(uint16_t *status, uint16_t code)
267 {
268
269         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
270 }
271
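/*
 * The Phase Tag bit of a completion entry is inverted on every pass through
 * the completion queue, which is how the guest tells freshly posted entries
 * apart from stale ones.
 */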
272 static __inline void
273 pci_nvme_toggle_phase(uint16_t *status, int prev)
274 {
275
276         if (prev)
277                 *status &= ~NVME_STATUS_P;
278         else
279                 *status |= NVME_STATUS_P;
280 }
281
282 static void
283 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
284 {
285         struct nvme_controller_data *cd = &sc->ctrldata;
286
287         cd->vid = 0xFB5D;
288         cd->ssvid = 0x0000;
289
290         cd->mn[0] = 'b';
291         cd->mn[1] = 'h';
292         cd->mn[2] = 'y';
293         cd->mn[3] = 'v';
294         cd->mn[4] = 'e';
295         cd->mn[5] = '-';
296         cd->mn[6] = 'N';
297         cd->mn[7] = 'V';
298         cd->mn[8] = 'M';
299         cd->mn[9] = 'e';
300
301         cd->fr[0] = '1';
302         cd->fr[1] = '.';
303         cd->fr[2] = '0';
304
305         /* Num of submission commands that we can handle at a time (2^rab) */
306         cd->rab   = 4;
307
308         /* FreeBSD OUI */
309         cd->ieee[0] = 0x58;
310         cd->ieee[1] = 0x9c;
311         cd->ieee[2] = 0xfc;
312
313         cd->mic = 0;
314
315         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
316
317         cd->ver = 0x00010300;
318
319         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
320         cd->acl = 2;
321         cd->aerl = 4;
322
323         cd->lpa = 0;    /* TODO: support some simple things like SMART */
324         cd->elpe = 0;   /* max error log page entries */
325         cd->npss = 1;   /* number of power states supported */
326
327         /* Warning Composite Temperature Threshold */
328         cd->wctemp = 0x0157;
329
330         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
331             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
332         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
333             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
334         cd->nn = 1;     /* number of namespaces */
335
336         cd->fna = 0x03;
337
338         cd->power_state[0].mp = 10;
339 }
340
341 static void
342 pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
343 {
344         struct nvme_namespace_data *nd;
345
346         nd = &sc->nsdata;
347
348         nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
349         nd->ncap = nd->nsze;
350         nd->nuse = nd->nsze;
351
352         /* Get LBA and backstore information from backing store */
353         nd->nlbaf = 1;
354         /* LBA data-sz = 2^lbads */
355         nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
356
357         nd->flbas = 0;
358 }
359
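/*
 * Reset controller state: CAP advertises the maximum queue entries, that
 * queues must be physically contiguous (CQR) and a ready timeout (TO, in
 * 500 ms units), and all I/O queues are torn down.  The admin queue mapping
 * is left alone so the emulation stays in sync with the guest.
 */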
360 static void
361 pci_nvme_reset(struct pci_nvme_softc *sc)
362 {
363         DPRINTF(("%s\r\n", __func__));
364
365         sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) |
366             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
367             (60 << NVME_CAP_LO_REG_TO_SHIFT);
368
369         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
370
371         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
372
373         sc->regs.cc = 0;
374         sc->regs.csts = 0;
375
376         if (sc->submit_queues != NULL) {
377                 pthread_mutex_lock(&sc->mtx);
378                 sc->num_cqueues = sc->num_squeues = sc->max_queues;
379
380                 for (int i = 0; i <= sc->max_queues; i++) {
381                         /*
382                          * The Admin Submission Queue is at index 0.
383                          * It must not be changed at reset otherwise the
384                          * emulation will be out of sync with the guest.
385                          */
386                         if (i != 0) {
387                                 sc->submit_queues[i].qbase = NULL;
388                                 sc->submit_queues[i].size = 0;
389                                 sc->submit_queues[i].cqid = 0;
390
391                                 sc->compl_queues[i].qbase = NULL;
392                                 sc->compl_queues[i].size = 0;
393                         }
394                         sc->submit_queues[i].tail = 0;
395                         sc->submit_queues[i].head = 0;
396                         sc->submit_queues[i].busy = 0;
397
398                         sc->compl_queues[i].tail = 0;
399                         sc->compl_queues[i].head = 0;
400                 }
401
402                 pthread_mutex_unlock(&sc->mtx);
403         } else
404                 sc->submit_queues = calloc(sc->max_queues + 1,
405                                         sizeof(struct nvme_submission_queue));
406
407         if (sc->compl_queues == NULL) {
408                 sc->compl_queues = calloc(sc->max_queues + 1,
409                                         sizeof(struct nvme_completion_queue));
410
411                 for (int i = 0; i <= sc->num_cqueues; i++)
412                         pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
413         }
414 }
415
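/*
 * Map the admin queues configured by the guest: AQA holds the 0's based
 * sizes of the admin submission and completion queues, ASQ/ACQ their guest
 * physical base addresses.
 */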
416 static void
417 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
418 {
419         uint16_t acqs, asqs;
420
421         DPRINTF(("%s\r\n", __func__));
422
423         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
424         sc->submit_queues[0].size = asqs;
425         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
426                     sizeof(struct nvme_command) * asqs);
427
428         DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
429                 __func__, sc->regs.asq, sc->submit_queues[0].qbase));
430
431         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
432             NVME_AQA_REG_ACQS_MASK) + 1;
433         sc->compl_queues[0].size = acqs;
434         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
435                  sizeof(struct nvme_completion) * acqs);
436         DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
437                 __func__, sc->regs.acq, sc->compl_queues[0].qbase));
438 }
439
440 static int
441 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
442         struct nvme_completion* compl)
443 {
444         uint16_t qid = command->cdw10 & 0xffff;
445
446         DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
447         if (qid == 0 || qid > sc->num_squeues) {
448                 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
449                         __func__, qid, sc->num_squeues));
450                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
451                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
452                 return (1);
453         }
454
455         sc->submit_queues[qid].qbase = NULL;
456         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
457         return (1);
458 }
459
460 static int
461 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
462         struct nvme_completion* compl)
463 {
464         if (command->cdw11 & NVME_CMD_CDW11_PC) {
465                 uint16_t qid = command->cdw10 & 0xffff;
466                 struct nvme_submission_queue *nsq;
467
468                 if (qid > sc->num_squeues) {
469                         WPRINTF(("%s queue index %u > num_squeues %u\r\n",
470                                 __func__, qid, sc->num_squeues));
471                         pci_nvme_status_tc(&compl->status,
472                             NVME_SCT_COMMAND_SPECIFIC,
473                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
474                         return (1);
475                 }
476
477                 nsq = &sc->submit_queues[qid];
478                 nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
479
480                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
481                               sizeof(struct nvme_command) * (size_t)nsq->size);
482                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
483                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
484
485                 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
486                         qid, nsq->size, nsq->qbase, nsq->cqid));
487
488                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
489
490                 DPRINTF(("%s completed creating IOSQ qid %u\r\n",
491                          __func__, qid));
492         } else {
493                 /*
494                  * Guest requested a non-contiguous submission queue.
495                  * This setting is unsupported by this emulation.
496                  */
497                 WPRINTF(("%s unsupported non-contig (list-based) "
498                          "create i/o submission queue\r\n", __func__));
499
500                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
501         }
502         return (1);
503 }
504
505 static int
506 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
507         struct nvme_completion* compl)
508 {
509         uint16_t qid = command->cdw10 & 0xffff;
510
511         DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
512         if (qid == 0 || qid > sc->num_cqueues) {
513                 WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
514                         __func__, qid, sc->num_cqueues));
515                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
516                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
517                 return (1);
518         }
519
520         sc->compl_queues[qid].qbase = NULL;
521         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
522         return (1);
523 }
524
525 static int
526 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
527         struct nvme_completion* compl)
528 {
529         if (command->cdw11 & NVME_CMD_CDW11_PC) {
530                 uint16_t qid = command->cdw10 & 0xffff;
531                 struct nvme_completion_queue *ncq;
532
533                 if (qid > sc->num_cqueues) {
534                         WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
535                                 __func__, qid, sc->num_cqueues));
536                         pci_nvme_status_tc(&compl->status,
537                             NVME_SCT_COMMAND_SPECIFIC,
538                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
539                         return (1);
540                 }
541
542                 ncq = &sc->compl_queues[qid];
543                 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
544                 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
545                 ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
546
547                 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
548                              command->prp1,
549                              sizeof(struct nvme_completion) * (size_t)ncq->size);
550
551                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
552         } else {
553                 /* 
554                  * Non-contiguous completion queues are unsupported.
555                  */
556                 WPRINTF(("%s unsupported non-contig (list-based) "
557                          "create i/o completion queue\r\n",
558                          __func__));
559
560                 /* 0x12 = Invalid Use of Controller Memory Buffer */
561                 pci_nvme_status_genc(&compl->status, 0x12);
562         }
563
564         return (1);
565 }
566
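/*
 * The mandatory log pages (0x01 Error Information, 0x02 SMART / Health,
 * 0x03 Firmware Slot) are currently returned zero-filled; any other page
 * fails with Invalid Log Page.
 */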
567 static int
568 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
569         struct nvme_completion* compl)
570 {
571         uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
572         uint8_t logpage = command->cdw10 & 0xFF;
573         void *data;
574
575         DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
576
577         if (logpage >= 1 && logpage <= 3)
578                 data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
579                                   PAGE_SIZE);
580
581         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
582
583         switch (logpage) {
584         case 0x01: /* Error information */
585                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
586                 break;
587         case 0x02: /* SMART/Health information */
588                 /* TODO: present some smart info */
589                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
590                 break;
591         case 0x03: /* Firmware slot information */
592                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
593                 break;
594         default:
595                 WPRINTF(("%s get log page %x command not supported\r\n",
596                         __func__, logpage));
597
598                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
599                     NVME_SC_INVALID_LOG_PAGE);
600         }
601
602         return (1);
603 }
604
605 static int
606 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
607         struct nvme_completion* compl)
608 {
609         void *dest;
610
611         DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
612                 command->cdw10 & 0xFF, command->nsid));
613
614         switch (command->cdw10 & 0xFF) {
615         case 0x00: /* return Identify Namespace data structure */
616                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
617                                   sizeof(sc->nsdata));
618                 memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
619                 break;
620         case 0x01: /* return Identify Controller data structure */
621                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
622                                   sizeof(sc->ctrldata));
623                 memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
624                 break;
625         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
626                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
627                                   sizeof(uint32_t) * 1024);
628                 ((uint32_t *)dest)[0] = 1;
629                 ((uint32_t *)dest)[1] = 0;
630                 break;
631         case 0x11:
632                 pci_nvme_status_genc(&compl->status,
633                     NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
634                 return (1);
635         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
636         case 0x10:
637         case 0x12:
638         case 0x13:
639         case 0x14:
640         case 0x15:
641         default:
642                 DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
643                          __func__, command->cdw10 & 0xFF));
644                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
645                 return (1);
646         }
647
648         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
649         return (1);
650 }
651
652 static int
653 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
654         struct nvme_completion* compl)
655 {
656         int feature = command->cdw10 & 0xFF;
657         uint32_t iv;
658
659         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
660         compl->cdw0 = 0;
661
662         switch (feature) {
663         case NVME_FEAT_ARBITRATION:
664                 DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
665                 break;
666         case NVME_FEAT_POWER_MANAGEMENT:
667                 DPRINTF(("  power management 0x%x\r\n", command->cdw11));
668                 break;
669         case NVME_FEAT_LBA_RANGE_TYPE:
670                 DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
671                 break;
672         case NVME_FEAT_TEMPERATURE_THRESHOLD:
673                 DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
674                 break;
675         case NVME_FEAT_ERROR_RECOVERY:
676                 DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
677                 break;
678         case NVME_FEAT_VOLATILE_WRITE_CACHE:
679                 DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
680                 break;
681         case NVME_FEAT_NUMBER_OF_QUEUES:
682                 sc->num_squeues = command->cdw11 & 0xFFFF;
683                 sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF;
684                 DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
685                         sc->num_squeues, sc->num_cqueues));
686
687                 if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues)
688                         sc->num_squeues = sc->max_queues;
689                 if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues)
690                         sc->num_cqueues = sc->max_queues;
691
692                 compl->cdw0 = (sc->num_squeues & 0xFFFF) |
693                               ((sc->num_cqueues & 0xFFFF) << 16);
694
695                 break;
696         case NVME_FEAT_INTERRUPT_COALESCING:
697                 DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
698
699                 /* in uS */
700                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
701
702                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
703                 break;
704         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
705                 iv = command->cdw11 & 0xFFFF;
706
707                 DPRINTF(("  interrupt vector configuration 0x%x\r\n",
708                         command->cdw11));
709
710                 for (uint32_t i = 0; i <= sc->num_cqueues; i++) {
711                         if (sc->compl_queues[i].intr_vec == iv) {
712                                 if (command->cdw11 & (1 << 16))
713                                         sc->compl_queues[i].intr_en |=
714                                                               NVME_CQ_INTCOAL;  
715                                 else
716                                         sc->compl_queues[i].intr_en &=
717                                                              ~NVME_CQ_INTCOAL;  
718                         }
719                 }
720                 break;
721         case NVME_FEAT_WRITE_ATOMICITY:
722                 DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
723                 break;
724         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
725                 DPRINTF(("  async event configuration 0x%x\r\n",
726                         command->cdw11));
727                 sc->async_ev_config = command->cdw11;
728                 break;
729         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
730                 DPRINTF(("  software progress marker 0x%x\r\n",
731                         command->cdw11));
732                 break;
733         case 0x0C:
734                 DPRINTF(("  autonomous power state transition 0x%x\r\n",
735                         command->cdw11));
736                 break;
737         default:
738                 WPRINTF(("%s invalid feature\r\n", __func__));
739                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
740                 return (1);
741         }
742
743         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
744         return (1);
745 }
746
747 static int
748 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
749         struct nvme_completion* compl)
750 {
751         int feature = command->cdw10 & 0xFF;
752
753         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
754
755         compl->cdw0 = 0;
756
757         switch (feature) {
758         case NVME_FEAT_ARBITRATION:
759                 DPRINTF(("  arbitration\r\n"));
760                 break;
761         case NVME_FEAT_POWER_MANAGEMENT:
762                 DPRINTF(("  power management\r\n"));
763                 break;
764         case NVME_FEAT_LBA_RANGE_TYPE:
765                 DPRINTF(("  lba range\r\n"));
766                 break;
767         case NVME_FEAT_TEMPERATURE_THRESHOLD:
768                 DPRINTF(("  temperature threshold\r\n"));
769                 switch ((command->cdw11 >> 20) & 0x3) {
770                 case 0:
771                         /* Over temp threshold */
772                         compl->cdw0 = 0xFFFF;
773                         break;
774                 case 1:
775                         /* Under temp threshold */
776                         compl->cdw0 = 0;
777                         break;
778                 default:
779                         WPRINTF(("  invalid threshold type select\r\n"));
780                         pci_nvme_status_genc(&compl->status,
781                             NVME_SC_INVALID_FIELD);
782                         return (1);
783                 }
784                 break;
785         case NVME_FEAT_ERROR_RECOVERY:
786                 DPRINTF(("  error recovery\r\n"));
787                 break;
788         case NVME_FEAT_VOLATILE_WRITE_CACHE:
789                 DPRINTF(("  volatile write cache\r\n"));
790                 break;
791         case NVME_FEAT_NUMBER_OF_QUEUES:
792                 compl->cdw0 = 0;
793                 if (sc->num_squeues == 0)
794                         compl->cdw0 |= sc->max_queues & 0xFFFF;
795                 else
796                         compl->cdw0 |= sc->num_squeues & 0xFFFF;
797
798                 if (sc->num_cqueues == 0)
799                         compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16;
800                 else
801                         compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16;
802
803                 DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
804                         compl->cdw0 & 0xFFFF,
805                         (compl->cdw0 >> 16) & 0xFFFF));
806
807                 break;
808         case NVME_FEAT_INTERRUPT_COALESCING:
809                 DPRINTF(("  interrupt coalescing\r\n"));
810                 break;
811         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
812                 DPRINTF(("  interrupt vector configuration\r\n"));
813                 break;
814         case NVME_FEAT_WRITE_ATOMICITY:
815                 DPRINTF(("  write atomicity\r\n"));
816                 break;
817         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
818                 DPRINTF(("  async event configuration\r\n"));
819                 sc->async_ev_config = command->cdw11;
820                 break;
821         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
822                 DPRINTF(("  software progress marker\r\n"));
823                 break;
824         case 0x0C:
825                 DPRINTF(("  autonomous power state transition\r\n"));
826                 break;
827         default:
828                 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
829                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
830                 return (1);
831         }
832
833         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
834         return (1);
835 }
836
837 static int
838 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
839         struct nvme_completion* compl)
840 {
841         DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
842                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
843
844         /* TODO: search for the command ID and abort it */
845
846         compl->cdw0 = 1;
847         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
848         return (1);
849 }
850
851 static int
852 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
853         struct nvme_command* command, struct nvme_completion* compl)
854 {
855         DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
856
857         /*
858          * TODO: raise events when they happen based on the Set Features cmd.
859          * These events happen async, so only set completion successful if
860          * there is an event reflective of the request to get event.
861          */
862         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
863             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
864         return (0);
865 }
866
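/*
 * Admin doorbell handler: consume commands from admin SQ 0 between the
 * stored head and the new tail, dispatch on opcode, post each completion
 * (with phase toggle) to admin CQ 0 and raise MSI-X vector 0 if any handler
 * asked for an interrupt.
 */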
867 static void
868 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
869 {
870         struct nvme_completion compl;
871         struct nvme_command *cmd;
872         struct nvme_submission_queue *sq;
873         struct nvme_completion_queue *cq;
874         int do_intr = 0;
875         uint16_t sqhead;
876
877         DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
878
879         sq = &sc->submit_queues[0];
880
881         sqhead = atomic_load_acq_short(&sq->head);
882
883         if (atomic_testandset_int(&sq->busy, 1)) {
884                 DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
885                         __func__, sqhead, sq->tail));
886                 return;
887         }
888
889         DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
890         
891         while (sqhead != atomic_load_acq_short(&sq->tail)) {
892                 cmd = &(sq->qbase)[sqhead];
893                 compl.status = 0;
894
895                 switch (NVME_CMD_GET_OPC(cmd->opc_fuse)) {
896                 case NVME_OPC_DELETE_IO_SQ:
897                         DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
898                         do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
899                         break;
900                 case NVME_OPC_CREATE_IO_SQ:
901                         DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
902                         do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
903                         break;
904                 case NVME_OPC_DELETE_IO_CQ:
905                         DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
906                         do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
907                         break;
908                 case NVME_OPC_CREATE_IO_CQ:
909                         DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
910                         do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
911                         break;
912                 case NVME_OPC_GET_LOG_PAGE:
913                         DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
914                         do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
915                         break;
916                 case NVME_OPC_IDENTIFY:
917                         DPRINTF(("%s command IDENTIFY\r\n", __func__));
918                         do_intr |= nvme_opc_identify(sc, cmd, &compl);
919                         break;
920                 case NVME_OPC_ABORT:
921                         DPRINTF(("%s command ABORT\r\n", __func__));
922                         do_intr |= nvme_opc_abort(sc, cmd, &compl);
923                         break;
924                 case NVME_OPC_SET_FEATURES:
925                         DPRINTF(("%s command SET_FEATURES\r\n", __func__));
926                         do_intr |= nvme_opc_set_features(sc, cmd, &compl);
927                         break;
928                 case NVME_OPC_GET_FEATURES:
929                         DPRINTF(("%s command GET_FEATURES\r\n", __func__));
930                         do_intr |= nvme_opc_get_features(sc, cmd, &compl);
931                         break;
932                 case NVME_OPC_ASYNC_EVENT_REQUEST:
933                         DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
934                         /* XXX don't care, unhandled for now
935                         do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
936                         */
937                         break;
938                 default:
939                         WPRINTF(("0x%x command is not implemented\r\n",
940                             NVME_CMD_GET_OPC(cmd->opc_fuse)));
941                 }
942         
943                 /* for now skip async event generation */
944                 if (NVME_CMD_GET_OPC(cmd->opc_fuse) !=
945                     NVME_OPC_ASYNC_EVENT_REQUEST) {
946                         struct nvme_completion *cp;
947                         int phase;
948
949                         cq = &sc->compl_queues[0];
950
951                         cp = &(cq->qbase)[cq->tail];
952                         cp->sqid = 0;
953                         cp->sqhd = sqhead;
954                         cp->cid = cmd->cid;
955
956                         phase = NVME_STATUS_GET_P(cp->status);
957                         cp->status = compl.status;
958                         pci_nvme_toggle_phase(&cp->status, phase);
959
960                         cq->tail = (cq->tail + 1) % cq->size;
961                 }
962                 sqhead = (sqhead + 1) % sq->size;
963         }
964
965         DPRINTF(("setting sqhead %u\r\n", sqhead));
966         atomic_store_short(&sq->head, sqhead);
967         atomic_store_int(&sq->busy, 0);
968
969         if (do_intr)
970                 pci_generate_msix(sc->nsc_pi, 0);
971
972 }
973
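/*
 * Append one guest data page to the blockif request.  Physically contiguous
 * pages are merged into the previous iovec; if the iovec list would exceed
 * NVME_MAX_BLOCKIOVS, the accumulated portion is issued as a partial blockif
 * request and completion is awaited on req->cv before continuing.  For
 * RAM-backed storage (req == NULL) the data is copied directly.
 */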
974 static int
975 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
976         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
977 {
978         int iovidx;
979
980         if (req != NULL) {
981                 /* concatenate contig block-iovs to minimize number of iovs */
982                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
983                         iovidx = req->io_req.br_iovcnt - 1;
984
985                         req->io_req.br_iov[iovidx].iov_base =
986                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
987                                              req->prev_gpaddr, size);
988
989                         req->prev_size += size;
990                         req->io_req.br_resid += size;
991
992                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
993                 } else {
994                         pthread_mutex_lock(&req->mtx);
995
996                         iovidx = req->io_req.br_iovcnt;
997                         if (iovidx == NVME_MAX_BLOCKIOVS) {
998                                 int err = 0;
999
1000                                 DPRINTF(("large I/O, doing partial req\r\n"));
1001
1002                                 iovidx = 0;
1003                                 req->io_req.br_iovcnt = 0;
1004
1005                                 req->io_req.br_callback = pci_nvme_io_partial;
1006
1007                                 if (!do_write)
1008                                         err = blockif_read(sc->nvstore.ctx,
1009                                                            &req->io_req);
1010                                 else
1011                                         err = blockif_write(sc->nvstore.ctx,
1012                                                             &req->io_req);
1013
1014                                 /* wait until req completes before cont */
1015                                 if (err == 0)
1016                                         pthread_cond_wait(&req->cv, &req->mtx);
1017                         }
1018                         if (iovidx == 0) {
1019                                 req->io_req.br_offset = lba;
1020                                 req->io_req.br_resid = 0;
1021                                 req->io_req.br_param = req;
1022                         }
1023
1024                         req->io_req.br_iov[iovidx].iov_base =
1025                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1026                                              gpaddr, size);
1027
1028                         req->io_req.br_iov[iovidx].iov_len = size;
1029
1030                         req->prev_gpaddr = gpaddr;
1031                         req->prev_size = size;
1032                         req->io_req.br_resid += size;
1033
1034                         req->io_req.br_iovcnt++;
1035
1036                         pthread_mutex_unlock(&req->mtx);
1037                 }
1038         } else {
1039                 /* RAM buffer: read/write directly */
1040                 void *p = sc->nvstore.ctx;
1041                 void *gptr;
1042
1043                 if ((lba + size) > sc->nvstore.size) {
1044                         WPRINTF(("%s write would overflow RAM\r\n", __func__));
1045                         return (-1);
1046                 }
1047
1048                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1049                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1050                 if (do_write) 
1051                         memcpy(p, gptr, size);
1052                 else
1053                         memcpy(gptr, p, size);
1054         }
1055         return (0);
1056 }
1057
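/*
 * Post a completion entry: fill the slot at the CQ tail with the current SQ
 * head, toggle the phase bit, advance the tail and fire the queue's MSI-X
 * vector when interrupts are enabled (the interrupt is skipped while the
 * submission queue is still marked busy unless ignore_busy is set).
 */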
1058 static void
1059 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1060         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1061         uint32_t cdw0, uint16_t status, int ignore_busy)
1062 {
1063         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1064         struct nvme_completion *compl;
1065         int do_intr = 0;
1066         int phase;
1067
1068         DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1069                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1070                  NVME_STATUS_GET_SC(status)));
1071
1072         pthread_mutex_lock(&cq->mtx);
1073
1074         assert(cq->qbase != NULL);
1075
1076         compl = &cq->qbase[cq->tail];
1077
1078         compl->sqhd = atomic_load_acq_short(&sq->head);
1079         compl->sqid = sqid;
1080         compl->cid = cid;
1081
1082         // toggle phase
1083         phase = NVME_STATUS_GET_P(compl->status);
1084         compl->status = status;
1085         pci_nvme_toggle_phase(&compl->status, phase);
1086
1087         cq->tail = (cq->tail + 1) % cq->size;
1088
1089         if (cq->intr_en & NVME_CQ_INTEN)
1090                 do_intr = 1;
1091
1092         pthread_mutex_unlock(&cq->mtx);
1093
1094         if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1095                 if (do_intr)
1096                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1097 }
1098
1099 static void
1100 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1101 {
1102         req->sc = NULL;
1103         req->nvme_sq = NULL;
1104         req->sqid = 0;
1105
1106         pthread_mutex_lock(&sc->mtx);
1107
1108         req->next = sc->ioreqs_free;
1109         sc->ioreqs_free = req;
1110         sc->pending_ios--;
1111
1112         /* when no more IO pending, can set to ready if device reset/enabled */
1113         if (sc->pending_ios == 0 &&
1114             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1115                 sc->regs.csts |= NVME_CSTS_RDY;
1116
1117         pthread_mutex_unlock(&sc->mtx);
1118
1119         sem_post(&sc->iosemlock);
1120 }
1121
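/*
 * ioreqs live on a free list protected by sc->mtx; iosemlock bounds the
 * number of outstanding requests to "ioslots", so pci_nvme_get_ioreq blocks
 * until a slot is available.
 */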
1122 static struct pci_nvme_ioreq *
1123 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1124 {
1125         struct pci_nvme_ioreq *req = NULL;
1126
1127         sem_wait(&sc->iosemlock);
1128         pthread_mutex_lock(&sc->mtx);
1129
1130         req = sc->ioreqs_free;
1131         assert(req != NULL);
1132
1133         sc->ioreqs_free = req->next;
1134
1135         req->next = NULL;
1136         req->sc = sc;
1137
1138         sc->pending_ios++;
1139
1140         pthread_mutex_unlock(&sc->mtx);
1141
1142         req->io_req.br_iovcnt = 0;
1143         req->io_req.br_offset = 0;
1144         req->io_req.br_resid = 0;
1145         req->io_req.br_param = req;
1146         req->prev_gpaddr = 0;
1147         req->prev_size = 0;
1148
1149         return req;
1150 }
1151
1152 static void
1153 pci_nvme_io_done(struct blockif_req *br, int err)
1154 {
1155         struct pci_nvme_ioreq *req = br->br_param;
1156         struct nvme_submission_queue *sq = req->nvme_sq;
1157         uint16_t code, status;
1158
1159         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1160         
1161         /* TODO return correct error */
1162         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1168         pci_nvme_status_genc(&status, code);
1169
1170         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1171         pci_nvme_release_ioreq(req->sc, req);
1172 }
1173
1174 static void
1175 pci_nvme_io_partial(struct blockif_req *br, int err)
1176 {
1177         struct pci_nvme_ioreq *req = br->br_param;
1178
1179         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1180
1181         pthread_cond_signal(&req->cv);
1182 }
1183
1184
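/*
 * I/O doorbell handler.  PRP1 covers the data from its starting offset to
 * the end of that page; if more remains, PRP2 is either the second (final)
 * page of the transfer or the address of a PRP list, whose last entry may in
 * turn chain to a further list (NVME_PRP2_ITEMS entries per page).
 */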
1185 static void
1186 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1187 {
1188         struct nvme_submission_queue *sq;
1189         uint16_t status;
1190         uint16_t sqhead;
1191         int err;
1192
1193         /* handle all submissions up to sq->tail index */
1194         sq = &sc->submit_queues[idx];
1195
1196         if (atomic_testandset_int(&sq->busy, 1)) {
1197                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1198                 return;
1199         }
1200
1201         sqhead = atomic_load_acq_short(&sq->head);
1202
1203         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1204                  idx, sqhead, sq->tail, sq->qbase));
1205
1206         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1207                 struct nvme_command *cmd;
1208                 struct pci_nvme_ioreq *req = NULL;
1209                 uint64_t lba;
1210                 uint64_t nblocks, bytes, size, cpsz;
1211
1212                 /* TODO: support scatter gather list handling */
1213
1214                 cmd = &sq->qbase[sqhead];
1215                 sqhead = (sqhead + 1) % sq->size;
1216
1217                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1218
1219                 if (NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_FLUSH) {
1220                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1221                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1222                                                 status, 1);
1223
1224                         continue;
1225                 } else if (NVME_CMD_GET_OPC(cmd->opc_fuse) == 0x08) {
1226                         /* TODO: write zeroes */
1227                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1228                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1229                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1230                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1231                                                 status, 1);
1232
1233                         continue;
1234                 }
1235
1236                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1237
1238                 bytes = nblocks * sc->nvstore.sectsz;
1239
1240                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1241                         req = pci_nvme_get_ioreq(sc);
1242                         req->nvme_sq = sq;
1243                         req->sqid = idx;
1244                 }
1245
1246                 /*
1247                  * If data starts mid-page and flows into the next page, then
1248                  * increase page count
1249                  */
1250
1251                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1252                          "(%lu-bytes)\r\n",
1253                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1254                          NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE ?
1255                              "WRITE" : "READ",
1256                          lba, nblocks, bytes));
1257
1258                 cmd->prp1 &= ~(0x03UL);
1259                 cmd->prp2 &= ~(0x03UL);
1260
1261                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1262
1263                 size = bytes;
1264                 lba *= sc->nvstore.sectsz;
1265
1266                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1267
1268                 if (cpsz > bytes)
1269                         cpsz = bytes;
1270
1271                 if (req != NULL) {
1272                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1273                                                 cmd->cdw10;
1274                         req->opc = NVME_CMD_GET_OPC(cmd->opc_fuse);
1275                         req->cid = cmd->cid;
1276                         req->nsid = cmd->nsid;
1277                 }
1278
1279                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1280                     NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE, lba);
1281                 lba += cpsz;
1282                 size -= cpsz;
1283
1284                 if (size == 0)
1285                         goto iodone;
1286
1287                 if (size <= PAGE_SIZE) {
1288                         /* prp2 is second (and final) page in transfer */
1289
1290                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1291                             size,
1292                             NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE,
1293                             lba);
1294                 } else {
1295                         uint64_t *prp_list;
1296                         int i;
1297
1298                         /* prp2 is pointer to a physical region page list */
1299                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1300                                                     cmd->prp2, PAGE_SIZE);
1301
1302                         i = 0;
1303                         while (size != 0) {
1304                                 cpsz = MIN(size, PAGE_SIZE);
1305
1306                                 /*
1307                                  * Move to linked physical region page list
1308                                  * in last item.
1309                                  */ 
1310                                 if (i == (NVME_PRP2_ITEMS-1) &&
1311                                     size > PAGE_SIZE) {
1312                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1313                                         prp_list = paddr_guest2host(
1314                                                       sc->nsc_pi->pi_vmctx,
1315                                                       prp_list[i], PAGE_SIZE);
1316                                         i = 0;
1317                                 }
1318                                 if (prp_list[i] == 0) {
1319                                         WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1320                                         err = 1;
1321                                         break;
1322                                 }
1323
1324                                 err = pci_nvme_append_iov_req(sc, req,
1325                                     prp_list[i], cpsz,
1326                                     NVME_CMD_GET_OPC(cmd->opc_fuse) ==
1327                                         NVME_OPC_WRITE, lba);
1328                                 if (err)
1329                                         break;
1330
1331                                 lba += cpsz;
1332                                 size -= cpsz;
1333                                 i++;
1334                         }
1335                 }
1336
1337 iodone:
1338                 if (sc->nvstore.type == NVME_STOR_RAM) {
1339                         uint16_t code, status;
1340
1341                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1342                             NVME_SC_SUCCESS;
1343                         pci_nvme_status_genc(&status, code);
1344
1345                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1346                                                 status, 1);
1347
1348                         continue;
1349                 }
1350
1351
1352                 if (err)
1353                         goto do_error;
1354
1355                 req->io_req.br_callback = pci_nvme_io_done;
1356
1357                 err = 0;
1358                 switch (NVME_CMD_GET_OPC(cmd->opc_fuse)) {
1359                 case NVME_OPC_READ:
1360                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1361                         break;
1362                 case NVME_OPC_WRITE:
1363                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1364                         break;
1365                 default:
1366                         WPRINTF(("%s unhandled io command 0x%x\r\n",
1367                                  __func__, NVME_CMD_GET_OPC(cmd->opc_fuse)));
1368                         err = 1;
1369                 }
1370
1371 do_error:
1372                 if (err) {
1373                         uint16_t status;
1374
1375                         pci_nvme_status_genc(&status,
1376                             NVME_SC_DATA_TRANSFER_ERROR);
1377
1378                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1379                                                 status, 1);
1380                         pci_nvme_release_ioreq(sc, req);
1381                 }
1382         }
1383
1384         atomic_store_short(&sq->head, sqhead);
1385         atomic_store_int(&sq->busy, 0);
1386 }
1387
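/*
 * Each queue pair owns an 8-byte doorbell slot starting at
 * NVME_DOORBELL_OFFSET (doorbell stride 0): the SQ tail doorbell is the
 * first dword, the CQ head doorbell the second.
 */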
1388 static void
1389 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1390         uint64_t idx, int is_sq, uint64_t value)
1391 {
1392         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1393                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1394
1395         if (is_sq) {
1396                 atomic_store_short(&sc->submit_queues[idx].tail,
1397                                    (uint16_t)value);
1398
1399                 if (idx == 0) {
1400                         pci_nvme_handle_admin_cmd(sc, value);
1401                 } else {
1402                         /* submission queue; handle new entries in SQ */
1403                         if (idx > sc->num_squeues) {
1404                                 WPRINTF(("%s SQ index %lu overflow from "
1405                                          "guest (max %u)\r\n",
1406                                          __func__, idx, sc->num_squeues));
1407                                 return;
1408                         }
1409                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1410                 }
1411         } else {
1412                 if (idx > sc->num_cqueues) {
1413                         WPRINTF(("%s queue index %lu overflow from "
1414                                  "guest (max %u)\r\n",
1415                                  __func__, idx, sc->num_cqueues));
1416                         return;
1417                 }
1418
1419                 sc->compl_queues[idx].head = (uint16_t)value;
1420         }
1421 }
1422
1423 static void
1424 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1425 {
1426         const char *s = iswrite ? "WRITE" : "READ";
1427
1428         switch (offset) {
1429         case NVME_CR_CAP_LOW:
1430                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1431                 break;
1432         case NVME_CR_CAP_HI:
1433                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1434                 break;
1435         case NVME_CR_VS:
1436                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1437                 break;
1438         case NVME_CR_INTMS:
1439                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1440                 break;
1441         case NVME_CR_INTMC:
1442                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1443                 break;
1444         case NVME_CR_CC:
1445                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1446                 break;
1447         case NVME_CR_CSTS:
1448                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1449                 break;
1450         case NVME_CR_NSSR:
1451                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1452                 break;
1453         case NVME_CR_AQA:
1454                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1455                 break;
1456         case NVME_CR_ASQ_LOW:
1457                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1458                 break;
1459         case NVME_CR_ASQ_HI:
1460                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1461                 break;
1462         case NVME_CR_ACQ_LOW:
1463                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1464                 break;
1465         case NVME_CR_ACQ_HI:
1466                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1467                 break;
1468         default:
1469                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1470         }
1471
1472 }
1473
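/*
 * Handle a guest write to BAR 0.  Doorbell writes are decoded into a queue
 * index and an SQ/CQ selector; all other controller registers require
 * 4-byte accesses and update the shadow register file under sc->mtx.
 */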
1474 static void
1475 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1476         uint64_t offset, int size, uint64_t value)
1477 {
1478         uint32_t ccreg;
1479
1480         if (offset >= NVME_DOORBELL_OFFSET) {
1481                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1482                 uint64_t idx = belloffset / 8; /* 8 bytes per queue pair: SQ tail dword, then CQ head dword */
1483                 int is_sq = (belloffset % 8) < 4;
1484
1485                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1486                         WPRINTF(("guest attempted an overflow write offset "
1487                                  "0x%lx, val 0x%lx in %s\r\n",
1488                                  offset, value, __func__));
1489                         return;
1490                 }
1491
1492                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1493                 return;
1494         }
1495
1496         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1497                 offset, size, value));
1498
1499         if (size != 4) {
1500                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1501                          "val 0x%lx) to bar0 in %s\r\n",
1502                          size, offset, value, __func__));
1503                 /* TODO: shutdown device */
1504                 return;
1505         }
1506
1507         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1508
1509         pthread_mutex_lock(&sc->mtx);
1510
1511         switch (offset) {
1512         case NVME_CR_CAP_LOW:
1513         case NVME_CR_CAP_HI:
1514                 /* readonly */
1515                 break;
1516         case NVME_CR_VS:
1517                 /* readonly */
1518                 break;
1519         case NVME_CR_INTMS:
1520                 /* MSI-X, so ignore */
1521                 break;
1522         case NVME_CR_INTMC:
1523                 /* MSI-X, so ignore */
1524                 break;
1525         case NVME_CR_CC:
1526                 ccreg = (uint32_t)value;
1527
1528                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1529                          "iocqes %u\r\n",
1530                         __func__,
1531                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1532                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1533                          NVME_CC_GET_IOCQES(ccreg)));
1534
1535                 if (NVME_CC_GET_SHN(ccreg)) {
1536                         /* shutdown requested - report shutdown complete (no explicit backend flush here) */
1537                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1538                             NVME_CSTS_REG_SHST_SHIFT);
1539                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1540                             NVME_CSTS_REG_SHST_SHIFT;
1541                 }
1542                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1543                         if (NVME_CC_GET_EN(ccreg) == 0)
1544                                 /* transition 1->0 causes controller reset */
1545                                 pci_nvme_reset(sc);
1546                         else
1547                                 pci_nvme_init_controller(ctx, sc);
1548                 }
1549
1550                 /* Insert the iocqes, iosqes and en bits from the write */
1551                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1552                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1553                 if (NVME_CC_GET_EN(ccreg) == 0) {
1554                         /* Insert the ams, mps and css bit fields */
1555                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1556                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1557                         sc->regs.csts &= ~NVME_CSTS_RDY;
1558                 } else if (sc->pending_ios == 0) {
1559                         sc->regs.csts |= NVME_CSTS_RDY;
1560                 }
1561                 break;
1562         case NVME_CR_CSTS:
1563                 break;
1564         case NVME_CR_NSSR:
1565                 /* ignore writes; don't support subsystem reset */
1566                 break;
1567         case NVME_CR_AQA:
1568                 sc->regs.aqa = (uint32_t)value;
1569                 break;
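        /*
         * The admin queue base addresses must be 4 KiB aligned; the low
         * 12 bits of the LOW dwords are masked off below.
         */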
1570         case NVME_CR_ASQ_LOW:
1571                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1572                                (0xFFFFF000 & value);
1573                 break;
1574         case NVME_CR_ASQ_HI:
1575                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1576                                (value << 32);
1577                 break;
1578         case NVME_CR_ACQ_LOW:
1579                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1580                                (0xFFFFF000 & value);
1581                 break;
1582         case NVME_CR_ACQ_HI:
1583                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1584                                (value << 32);
1585                 break;
1586         default:
1587                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1588                          __func__, offset, value, size));
1589         }
1590         pthread_mutex_unlock(&sc->mtx);
1591 }
1592
1593 static void
1594 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1595                 int baridx, uint64_t offset, int size, uint64_t value)
1596 {
1597         struct pci_nvme_softc* sc = pi->pi_arg;
1598
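        /* MSI-X table/PBA accesses are serviced by the generic MSI-X emulation. */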
1599         if (baridx == pci_msix_table_bar(pi) ||
1600             baridx == pci_msix_pba_bar(pi)) {
1601                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1602                          "value 0x%lx\r\n", baridx, offset, size, value));
1603
1604                 pci_emul_msix_twrite(pi, offset, size, value);
1605                 return;
1606         }
1607
1608         switch (baridx) {
1609         case 0:
1610                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1611                 break;
1612
1613         default:
1614                 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1615                          __func__, baridx, value));
1616         }
1617 }
1618
1619 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1620         uint64_t offset, int size)
1621 {
1622         uint64_t value;
1623
1624         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1625
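        /*
         * Reads below the doorbell area are served from the shadow register
         * file; doorbell registers are not readable and return 0.
         */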
1626         if (offset < NVME_DOORBELL_OFFSET) {
1627                 void *p = &(sc->regs);
1628                 pthread_mutex_lock(&sc->mtx);
1629                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1630                 pthread_mutex_unlock(&sc->mtx);
1631         } else {
1632                 value = 0;
1633                 WPRINTF(("pci_nvme: read invalid offset %lu\r\n", offset));
1634         }
1635
1636         switch (size) {
1637         case 1:
1638                 value &= 0xFF;
1639                 break;
1640         case 2:
1641                 value &= 0xFFFF;
1642                 break;
1643         case 4:
1644                 value &= 0xFFFFFFFF;
1645                 break;
1646         }
1647
1648         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1649                  offset, size, (uint32_t)value));
1650
1651         return (value);
1652 }
1653
1654
1655
1656 static uint64_t
1657 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1658     uint64_t offset, int size)
1659 {
1660         struct pci_nvme_softc* sc = pi->pi_arg;
1661
1662         if (baridx == pci_msix_table_bar(pi) ||
1663             baridx == pci_msix_pba_bar(pi)) {
1664                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1665                         baridx, offset, size));
1666
1667                 return pci_emul_msix_tread(pi, offset, size);
1668         }
1669
1670         switch (baridx) {
1671         case 0:
1672                 return pci_nvme_read_bar_0(sc, offset, size);
1673
1674         default:
1675                 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1676         }
1677
1678         return (0);
1679 }
1680
1681
1682 static int
1683 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1684 {
1685         char bident[sizeof("XX:X:X")];
1686         char    *uopt, *xopts, *config;
1687         uint32_t sectsz;
1688         int optidx;
1689
1690         sc->max_queues = NVME_QUEUES;
1691         sc->max_qentries = NVME_MAX_QENTRIES;
1692         sc->ioslots = NVME_IOSLOTS;
1693         sc->num_squeues = sc->max_queues;
1694         sc->num_cqueues = sc->max_queues;
1695         sectsz = 0;
1696
        if (opts == NULL) {
                fprintf(stderr, "backing store not specified\n");
                return (-1);
        }

1697         uopt = strdup(opts);
1698         optidx = 0;
1699         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1700                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
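        /*
         * The default serial number encodes the PCI slot/function; a "ser"
         * option overrides it.  An option matching no known key is treated
         * as the backing-store path and opened via blockif, but only when it
         * is the first option.
         */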
1701         for (xopts = strtok(uopt, ",");
1702              xopts != NULL;
1703              xopts = strtok(NULL, ",")) {
1704
1705                 if ((config = strchr(xopts, '=')) != NULL)
1706                         *config++ = '\0';
1707
1708                 if (!strcmp("maxq", xopts)) {
1709                         sc->max_queues = atoi(config);
1710                 } else if (!strcmp("qsz", xopts)) {
1711                         sc->max_qentries = atoi(config);
1712                 } else if (!strcmp("ioslots", xopts)) {
1713                         sc->ioslots = atoi(config);
1714                 } else if (!strcmp("sectsz", xopts)) {
1715                         sectsz = atoi(config);
1716                 } else if (!strcmp("ser", xopts)) {
1717                         memset(sc->ctrldata.sn, 0, sizeof(sc->ctrldata.sn));
1718                         strncpy(sc->ctrldata.sn, config,
1719                                 sizeof(sc->ctrldata.sn));
1720                 } else if (!strcmp("ram", xopts)) {
1721                         uint64_t sz = strtoull(config, NULL, 10); /* size in MiB */
1722
1723                         sc->nvstore.type = NVME_STOR_RAM;
1724                         sc->nvstore.size = sz * 1024 * 1024;
1725                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1726                         sc->nvstore.sectsz = 4096;
1727                         sc->nvstore.sectsz_bits = 12;
1728                         if (sc->nvstore.ctx == NULL) {
1729                                 perror("Unable to allocate RAM");
1730                                 return (-1);
1731                         }
1732                 } else if (optidx == 0) {
1733                         snprintf(bident, sizeof(bident), "%d:%d",
1734                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1735                         sc->nvstore.ctx = blockif_open(xopts, bident);
1736                         if (sc->nvstore.ctx == NULL) {
1737                                 perror("Could not open backing file");
1738                                 return (-1);
1739                         }
1740                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1741                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1742                 } else {
1743                         fprintf(stderr, "Invalid option %s\n", xopts);
1744                         return (-1);
1745                 }
1746
1747                 optidx++;
1748         }
1749         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1750                 fprintf(stderr, "backing store not specified\n");
1751                 return (-1);
1752         }
1753         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1754                 sc->nvstore.sectsz = sectsz;
1755         else if (sc->nvstore.type != NVME_STOR_RAM)
1756                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
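        /*
         * Compute sectsz_bits as the log2 of the configured sector size
         * (the smallest power of two not below sectsz).
         */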
1757         for (sc->nvstore.sectsz_bits = 9;
1758              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1759              sc->nvstore.sectsz_bits++)
1760                 ;
1761
1762
1763         if (sc->max_queues == 0) {
1764                 fprintf(stderr, "Invalid maxq option\n");
1765                 return (-1);
1766         }
1767         if (sc->max_qentries <= 0) {
1768                 fprintf(stderr, "Invalid qsz option\n");
1769                 return (-1);
1770         }
1771         if (sc->ioslots <= 0) {
1772                 fprintf(stderr, "Invalid ioslots option\n");
1773                 return (-1);
1774         }
1775
1776         return (0);
1777 }
1778
1779 static int
1780 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1781 {
1782         struct pci_nvme_softc *sc;
1783         uint32_t pci_membar_sz;
1784         int     error;
1785
1786         error = 0;
1787
1788         sc = calloc(1, sizeof(struct pci_nvme_softc));
1789         pi->pi_arg = sc;
1790         sc->nsc_pi = pi;
1791
1792         error = pci_nvme_parse_opts(sc, opts);
1793         if (error < 0)
1794                 goto done;
1795         else
1796                 error = 0;
1797
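        /*
         * Preallocate the I/O request slots and chain them into a singly
         * linked free list rooted at ioreqs_free; iosemlock below is
         * initialized to the same ioslots count.
         */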
1798         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1799         for (int i = 0; i < sc->ioslots; i++) {
1800                 if (i < (sc->ioslots-1))
1801                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1802                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1803                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1804         }
1805         sc->ioreqs_free = sc->ioreqs;
1806         sc->intr_coales_aggr_thresh = 1;
1807
1808         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1809         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1810         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1811         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1812         pci_set_cfgdata8(pi, PCIR_PROGIF,
1813                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1814
1815         /* allocate size of nvme registers + doorbell space for all queues */
1816         pci_membar_sz = sizeof(struct nvme_registers) +
1817                         2*sizeof(uint32_t)*(sc->max_queues);
1818
1819         DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1820
1821         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1822         if (error) {
1823                 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1824                 goto done;
1825         }
1826
1827         error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR);
1828         if (error) {
1829                 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1830                 goto done;
1831         }
1832
1833         pthread_mutex_init(&sc->mtx, NULL);
1834         sem_init(&sc->iosemlock, 0, sc->ioslots);
1835
1836         pci_nvme_reset(sc);
1837         pci_nvme_init_ctrldata(sc);
1838         pci_nvme_init_nsdata(sc);
1839
1840         pci_lintr_request(pi);
1841
1842 done:
1843         return (error);
1844 }
1845
1846
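/*
 * Register the emulation with bhyve's PCI framework so it can be selected
 * by name ("nvme") via the -s option.
 */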
1847 struct pci_devemu pci_de_nvme = {
1848         .pe_emu =       "nvme",
1849         .pe_init =      pci_nvme_init,
1850         .pe_barwrite =  pci_nvme_write,
1851         .pe_barread =   pci_nvme_read
1852 };
1853 PCI_EMUL_SET(pci_de_nvme);