/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *
 */
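
/*
 * Example invocation (the slot number, image path, and serial number here
 * are illustrative, not defaults):
 *
 *   bhyve ... -s 4,nvme,/var/tmp/nvme.img,maxq=4,qsz=1024,ioslots=16,ser=BHYVE001 vm0
 *
 * A RAM-backed 512 MiB namespace would instead use:
 *
 *   bhyve ... -s 4,nvme,ram=512 vm0
 */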

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(params) if (nvme_debug) printf params
#define WPRINTF(params) printf params

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* helpers */

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
        pthread_mutex_t mtx;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             busy; /* queue is being processed */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        struct pci_nvme_ioreq *next;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries; /* max entries per queue */
        uint32_t        max_queues;
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /* status and guest memory mapped queues */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

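/*
 * The NVMe phase tag bit flips each time the controller wraps around a
 * completion queue, letting the guest detect newly posted entries by
 * comparing against the phase it expects. Callers read the previous
 * phase out of the slot being overwritten and pass it in as 'prev'.
 */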
static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

        if (prev)
                *status &= ~NVME_STATUS_P;
        else
                *status |= NVME_STATUS_P;
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Num of submission commands that we can handle at a time (2^rab) */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
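        /*
         * i.e. with MPSMIN = 0 in CAP (4 KiB pages), the value above caps
         * a single transfer at 2^9 * 4 KiB = 2 MiB.
         */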

        cd->ver = 0x00010300;

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
{
        struct nvme_namespace_data *nd;

        nd = &sc->nsdata;

        nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        /* Get LBA and backstore information from backing store */
        nd->nlbaf = 1;
        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
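        /*
         * e.g. a 512-byte backing sector size gives sectsz_bits = 9, so
         * the guest sees LBADS = 9, i.e. 2^9 = 512-byte LBAs.
         */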

        nd->flbas = 0;
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF(("%s\r\n", __func__));

        sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i <= sc->max_queues; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;

                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else
                sc->submit_queues = calloc(sc->max_queues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues == NULL) {
                sc->compl_queues = calloc(sc->max_queues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i <= sc->num_cqueues; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF(("%s\r\n", __func__));

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase));

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
                        __func__, qid, sc->num_squeues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if (qid > sc->num_squeues) {
                        WPRINTF(("%s queue index %u > num_squeues %u\r\n",
                                __func__, qid, sc->num_squeues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1;

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid));

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF(("%s completed creating IOSQ qid %u\r\n",
                         __func__, qid));
        } else {
                /*
                 * Guest sent non-cont submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o submission queue\r\n", __func__));

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
                        __func__, qid, sc->num_cqueues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if (qid > sc->num_cqueues) {
                        WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
                                __func__, qid, sc->num_cqueues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1;

                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) *
                                 (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contig completion queue unsupported.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o completion queue\r\n",
                         __func__));

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        /* NUMD is a 0-based count of dwords (4 bytes each) */
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
        uint8_t logpage = command->cdw10 & 0xFF;
        void *data;

        DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

        if (logpage >= 1 && logpage <= 3)
                data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  PAGE_SIZE);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case 0x01: /* Error information */
                memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
                break;
        case 0x02: /* SMART/Health information */
                /* TODO: present some smart info */
                memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
                break;
        case 0x03: /* Firmware slot information */
                memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
                break;
        default:
                WPRINTF(("%s get log page %x command not supported\r\n",
                        __func__, logpage));

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
                command->cdw10 & 0xFF, command->nsid));

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(sc->nsdata));
                memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
                break;
        case 0x01: /* return Identify Controller data structure */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(sc->ctrldata));
                memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
                         __func__, command->cdw10 & 0xFF));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                sc->num_squeues = command->cdw11 & 0xFFFF;
                sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF;
                DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
                        sc->num_squeues, sc->num_cqueues));

                if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues)
                        sc->num_squeues = sc->max_queues;
                if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues)
                        sc->num_cqueues = sc->max_queues;

                compl->cdw0 = (sc->num_squeues & 0xFFFF) |
                              ((sc->num_cqueues & 0xFFFF) << 16);

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF(("  interrupt vector configuration 0x%x\r\n",
                        command->cdw11));

                for (uint32_t i = 0; i <= sc->num_cqueues; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration 0x%x\r\n",
                        command->cdw11));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker 0x%x\r\n",
                        command->cdw11));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition 0x%x\r\n",
                        command->cdw11));
                break;
        default:
                WPRINTF(("%s invalid feature\r\n", __func__));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration\r\n"));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management\r\n"));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range\r\n"));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold\r\n"));
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF(("  invalid threshold type select\r\n"));
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery\r\n"));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache\r\n"));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = 0;
                if (sc->num_squeues == 0)
                        compl->cdw0 |= sc->max_queues & 0xFFFF;
                else
                        compl->cdw0 |= sc->num_squeues & 0xFFFF;

                if (sc->num_cqueues == 0)
                        compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16;
                else
                        compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16;

                DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF));

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing\r\n"));
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF(("  interrupt vector configuration\r\n"));
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity\r\n"));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration\r\n"));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker\r\n"));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition\r\n"));
                break;
        default:
                WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        int do_intr = 0;
        uint16_t sqhead;

        DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

        sq = &sc->submit_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
                        __func__, sqhead, sq->tail));
                return;
        }

        DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
                        do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF(("%s command IDENTIFY\r\n", __func__));
                        do_intr |= nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF(("%s command ABORT\r\n", __func__));
                        do_intr |= nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF(("%s command SET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF(("%s command GET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
                        /* XXX dont care, unhandled for now
                        do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        break;
                default:
                        WPRINTF(("0x%x command is not implemented\r\n",
                            cmd->opc));
                }

                /* for now skip async event generation */
                if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
                        struct nvme_completion *cp;
                        int phase;

                        cq = &sc->compl_queues[0];

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
                sqhead = (sqhead + 1) % sq->size;
        }

        DPRINTF(("setting sqhead %u\r\n", sqhead));
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (do_intr)
                pci_generate_msix(sc->nsc_pi, 0);

}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF(("large I/O, doing partial req\r\n"));

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF(("%s write would overflow RAM\r\n", __func__));
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int do_intr = 0;
        int phase;

        DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status)));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->sqid = sqid;
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        if (cq->intr_en & NVME_CQ_INTEN)
                do_intr = 1;

        pthread_mutex_unlock(&cq->mtx);

        if (ignore_busy || !atomic_load_acq_int(&sq->busy))
                if (do_intr)
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        req->next = sc->ioreqs_free;
        sc->ioreqs_free = req;
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

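/*
 * Take an ioreq off the free list. iosemlock (initialized elsewhere to the
 * configured ioslots count) makes this block once that many requests are in
 * flight; pci_nvme_release_ioreq() posts it back when a request retires.
 */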
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = sc->ioreqs_free;
        assert(req != NULL);

        sc->ioreqs_free = req->next;

        req->next = NULL;
        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return req;
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;
        struct nvme_submission_queue *sq = req->nvme_sq;
        uint16_t code, status;

        DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

        /* TODO return correct error */
        code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
        pci_nvme_status_genc(&status, code);

        pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
        pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;

        DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

        pthread_cond_signal(&req->cv);
}


static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
        struct nvme_submission_queue *sq;
        uint16_t status;
        uint16_t sqhead;
        int err;

        /* handle all submissions up to sq->tail index */
        sq = &sc->submit_queues[idx];

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
                return;
        }

        sqhead = atomic_load_acq_short(&sq->head);

        DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
                 idx, sqhead, sq->tail, sq->qbase));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                struct nvme_command *cmd;
                struct pci_nvme_ioreq *req = NULL;
                uint64_t lba;
                uint64_t nblocks, bytes, size, cpsz;

                /* TODO: support scatter gather list handling */

                cmd = &sq->qbase[sqhead];
                sqhead = (sqhead + 1) % sq->size;

                lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

                if (cmd->opc == NVME_OPC_FLUSH) {
                        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                } else if (cmd->opc == 0x08) {
                        /* TODO: write zeroes */
                        WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
                                __func__, lba, cmd->cdw12 & 0xFFFF));
                        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                }

                nblocks = (cmd->cdw12 & 0xFFFF) + 1;

                bytes = nblocks * sc->nvstore.sectsz;

                if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
                        req = pci_nvme_get_ioreq(sc);
                        req->nvme_sq = sq;
                        req->sqid = idx;
                }

                /*
                 * If data starts mid-page and flows into the next page, then
                 * increase page count
                 */
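                /*
                 * PRP semantics (NVMe 1.x): prp1 addresses the first page of
                 * the transfer and may carry an offset into that page; prp2
                 * is either the second page (transfers spanning at most two
                 * pages) or the address of a PRP list whose last entry can
                 * chain to a further list page. e.g. a page-aligned 16 KiB
                 * transfer uses prp1 for the first 4 KiB and a three-entry
                 * list at prp2 for the remainder.
                 */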

                DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
                         "(%lu-bytes)\r\n",
                         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
                         cmd->opc == NVME_OPC_WRITE ?
                             "WRITE" : "READ",
                         lba, nblocks, bytes));

                cmd->prp1 &= ~(0x03UL);
                cmd->prp2 &= ~(0x03UL);

                DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));

                size = bytes;
                lba *= sc->nvstore.sectsz;

                cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);

                if (cpsz > bytes)
                        cpsz = bytes;

                if (req != NULL) {
                        req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
                                                cmd->cdw10;
                        req->opc = cmd->opc;
                        req->cid = cmd->cid;
                        req->nsid = cmd->nsid;
                }

                err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
                    cmd->opc == NVME_OPC_WRITE, lba);
                lba += cpsz;
                size -= cpsz;

                if (size == 0)
                        goto iodone;

                if (size <= PAGE_SIZE) {
                        /* prp2 is second (and final) page in transfer */

                        err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
                            size,
                            cmd->opc == NVME_OPC_WRITE,
                            lba);
                } else {
                        uint64_t *prp_list;
                        int i;

                        /* prp2 is pointer to a physical region page list */
                        prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
                                                    cmd->prp2, PAGE_SIZE);

                        i = 0;
                        while (size != 0) {
                                cpsz = MIN(size, PAGE_SIZE);

                                /*
                                 * Move to linked physical region page list
                                 * in last item.
                                 */
                                if (i == (NVME_PRP2_ITEMS-1) &&
                                    size > PAGE_SIZE) {
                                        assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
                                        prp_list = paddr_guest2host(
                                                      sc->nsc_pi->pi_vmctx,
                                                      prp_list[i], PAGE_SIZE);
                                        i = 0;
                                }
                                if (prp_list[i] == 0) {
                                        WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
                                        err = 1;
                                        break;
                                }

                                err = pci_nvme_append_iov_req(sc, req,
                                    prp_list[i], cpsz,
                                    cmd->opc == NVME_OPC_WRITE, lba);
                                if (err)
                                        break;

                                lba += cpsz;
                                size -= cpsz;
                                i++;
                        }
                }

iodone:
                if (sc->nvstore.type == NVME_STOR_RAM) {
                        uint16_t code, status;

                        code = err ? NVME_SC_LBA_OUT_OF_RANGE :
                            NVME_SC_SUCCESS;
                        pci_nvme_status_genc(&status, code);

                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                }


                if (err)
                        goto do_error;

                req->io_req.br_callback = pci_nvme_io_done;

                err = 0;
                switch (cmd->opc) {
                case NVME_OPC_READ:
                        err = blockif_read(sc->nvstore.ctx, &req->io_req);
                        break;
                case NVME_OPC_WRITE:
                        err = blockif_write(sc->nvstore.ctx, &req->io_req);
                        break;
                default:
                        WPRINTF(("%s unhandled io command 0x%x\r\n",
                                 __func__, cmd->opc));
                        err = 1;
                }

do_error:
                if (err) {
                        uint16_t status;

                        pci_nvme_status_genc(&status,
                            NVME_SC_DATA_TRANSFER_ERROR);

                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);
                        pci_nvme_release_ioreq(sc, req);
                }
        }

        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);
}

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
        uint64_t idx, int is_sq, uint64_t value)
{
        DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
                idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));

        if (is_sq) {
                atomic_store_short(&sc->submit_queues[idx].tail,
                                   (uint16_t)value);

                if (idx == 0) {
                        pci_nvme_handle_admin_cmd(sc, value);
                } else {
                        /* submission queue; handle new entries in SQ */
                        if (idx > sc->num_squeues) {
                                WPRINTF(("%s SQ index %lu overflow from "
                                         "guest (max %u)\r\n",
                                         __func__, idx, sc->num_squeues));
                                return;
                        }
                        pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
                }
        } else {
                if (idx > sc->num_cqueues) {
                        WPRINTF(("%s queue index %lu overflow from "
                                 "guest (max %u)\r\n",
                                 __func__, idx, sc->num_cqueues));
                        return;
                }

                sc->compl_queues[idx].head = (uint16_t)value;
        }
}

1416 static void
1417 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1418 {
1419         const char *s = iswrite ? "WRITE" : "READ";
1420
1421         switch (offset) {
1422         case NVME_CR_CAP_LOW:
1423                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1424                 break;
1425         case NVME_CR_CAP_HI:
1426                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1427                 break;
1428         case NVME_CR_VS:
1429                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1430                 break;
1431         case NVME_CR_INTMS:
1432                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1433                 break;
1434         case NVME_CR_INTMC:
1435                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1436                 break;
1437         case NVME_CR_CC:
1438                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1439                 break;
1440         case NVME_CR_CSTS:
1441                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1442                 break;
1443         case NVME_CR_NSSR:
1444                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1445                 break;
1446         case NVME_CR_AQA:
1447                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1448                 break;
1449         case NVME_CR_ASQ_LOW:
1450                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1451                 break;
1452         case NVME_CR_ASQ_HI:
1453                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1454                 break;
1455         case NVME_CR_ACQ_LOW:
1456                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1457                 break;
1458         case NVME_CR_ACQ_HI:
1459                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1460                 break;
1461         default:
1462                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1463         }
1464
1465 }

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
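		/*
		 * Doorbells come in SQ-tail/CQ-head pairs of 4 bytes each,
		 * starting at NVME_DOORBELL_OFFSET (this emulation assumes
		 * CAP.DSTRD = 0): queue y's SQ tail doorbell sits at byte
		 * 8*y of the doorbell region and its CQ head doorbell at
		 * 8*y + 4, hence the divide and modulo by 8 below.
		 */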
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell size = 2 * sizeof(uint32_t) */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF(("guest attempted an overflow write offset "
				 "0x%lx, val 0x%lx in %s\r\n",
				 offset, value, __func__));
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
		offset, size, value));

	if (size != 4) {
		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
			 "val 0x%lx) to bar0 in %s\r\n",
			 size, offset, value, __func__));
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
			 "iocqes %u\r\n",
			 __func__,
			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
			 NVME_CC_GET_IOCQES(ccreg)));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
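	/*
	 * The admin queue base addresses must be 4 KiB aligned; bits 11:0
	 * of ASQ/ACQ are reserved, hence the 0xFFFFF000 mask on the
	 * low-dword writes below.
	 */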
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
			       (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
			       (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
			       (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
			       (value << 32);
		break;
	default:
		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
			 __func__, offset, value, size));
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
			 "value 0x%lx\r\n", baridx, offset, size, value));

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
			 __func__, baridx, value));
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
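		/* serve register reads from the in-memory shadow copy */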
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
	}

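	/* trim the value to the requested access width */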
	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
		 offset, size, (uint32_t)value));

	return (value);
}

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
			baridx, offset, size));

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
	}

	return (0);
}

static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char	*uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sectsz = 0;

	/* guard against a missing option string; strdup(NULL) would crash */
	if (opts == NULL) {
		fprintf(stderr, "backing store not specified\n");
		return (-1);
	}

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
		 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 * Ref: NVMe v1.3c.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
				   sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz;

			if (config == NULL) {
				fprintf(stderr, "Invalid ram option\n");
				free(uopt);
				return (-1);
			}
			/* parse the size in MiB from the "ram=" value */
			sz = strtoull(config, NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
				 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			fprintf(stderr, "Invalid option %s\n", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		fprintf(stderr, "backing store not specified\n");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
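	/* derive sectsz_bits = log2(sectsz), e.g. 512 -> 9, 4096 -> 12 */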
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		fprintf(stderr, "Invalid qsz option\n");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		fprintf(stderr, "Invalid ioslots option\n");
		return (-1);
	}

	return (0);
}
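
/*
 * Example configurations accepted by the parser above (slot numbers
 * and backing paths are hypothetical):
 *
 *   -s 4,nvme,/dev/zvol/tank/nvme0,maxq=4,qsz=1024,ioslots=16,sectsz=512,ser=BHYVE001
 *   -s 4,nvme,ram=1024
 */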

static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int	error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
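	/* link the ioreq slots into a singly-linked free list */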
	for (int i = 0; i < sc->ioslots; i++) {
		if (i < (sc->ioslots-1))
			sc->ioreqs[i].next = &sc->ioreqs[i+1];
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->ioreqs_free = sc->ioreqs;
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
			 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/* allocate size of nvme registers + doorbell space for all queues */
	pci_membar_sz = sizeof(struct nvme_registers) +
			2*sizeof(uint32_t)*(sc->max_queues);
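	/*
	 * struct nvme_registers already includes the admin (queue 0)
	 * doorbell pair, so only the I/O queue doorbells are added here.
	 */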

	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR);
	if (error) {
		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_reset(sc);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_nsdata(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
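/*
 * Register the emulation with bhyve's PCI framework; the pe_emu name
 * is what "-s <slot>,nvme,..." matches against.
 */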
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);