1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 /*
30  * bhyve PCIe-NVMe device emulation.
31  *
32  * options:
33  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
34  *
35  *  accepted devpath:
36  *    /dev/blockdev
37  *    /path/to/image
38  *    ram=size_in_MiB
39  *
40  *  maxq    = max number of queues
41  *  qsz     = max elements in each queue
42  *  ioslots = max number of concurrent io requests
43  *  sectsz  = sector size (defaults to blockif sector size)
44  *  ser     = serial number (20-chars max)
45  *
46  */
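/*
 * Example invocations (illustrative values only; slot number, device paths
 * and sizes are made up):
 *
 *   -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=4,qsz=256,ioslots=16,sectsz=512,ser=NVME0001
 *   -s 4,nvme,ram=1024
 *
 * The first form backs the namespace with a block device or file image via
 * blockif; the second form allocates a 1024 MiB RAM-backed namespace.
 */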
47
48 /* TODO:
49     - create async event for smart and log
50     - intr coalesce
51  */
52
53 #include <sys/cdefs.h>
54 __FBSDID("$FreeBSD$");
55
56 #include <sys/types.h>
57
58 #include <assert.h>
59 #include <pthread.h>
60 #include <semaphore.h>
61 #include <stdbool.h>
62 #include <stddef.h>
63 #include <stdint.h>
64 #include <stdio.h>
65 #include <stdlib.h>
66 #include <string.h>
67
68 #include <machine/atomic.h>
69 #include <machine/vmm.h>
70 #include <vmmapi.h>
71
72 #include <dev/nvme/nvme.h>
73
74 #include "bhyverun.h"
75 #include "block_if.h"
76 #include "pci_emul.h"
77
78
79 static int nvme_debug = 0;
80 #define DPRINTF(params) if (nvme_debug) printf params
81 #define WPRINTF(params) printf params
82
83 /* defaults; can be overridden */
84 #define NVME_MSIX_BAR           4
85
86 #define NVME_IOSLOTS            8
87
88 #define NVME_QUEUES             16
89 #define NVME_MAX_QENTRIES       2048
90
91 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
92 #define NVME_MAX_BLOCKIOVS      512
93
94 /* helpers */
95
96 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
97
98 enum nvme_controller_register_offsets {
99         NVME_CR_CAP_LOW = 0x00,
100         NVME_CR_CAP_HI  = 0x04,
101         NVME_CR_VS      = 0x08,
102         NVME_CR_INTMS   = 0x0c,
103         NVME_CR_INTMC   = 0x10,
104         NVME_CR_CC      = 0x14,
105         NVME_CR_CSTS    = 0x1c,
106         NVME_CR_NSSR    = 0x20,
107         NVME_CR_AQA     = 0x24,
108         NVME_CR_ASQ_LOW = 0x28,
109         NVME_CR_ASQ_HI  = 0x2c,
110         NVME_CR_ACQ_LOW = 0x30,
111         NVME_CR_ACQ_HI  = 0x34,
112 };
113
114 enum nvme_cmd_cdw11 {
115         NVME_CMD_CDW11_PC  = 0x0001,
116         NVME_CMD_CDW11_IEN = 0x0002,
117         NVME_CMD_CDW11_IV  = 0xFFFF0000,
118 };
119
120 #define NVME_CQ_INTEN   0x01
121 #define NVME_CQ_INTCOAL 0x02
122
123 struct nvme_completion_queue {
124         struct nvme_completion *qbase;
125         uint32_t        size;
126         uint16_t        tail; /* nvme progress */
127         uint16_t        head; /* guest progress */
128         uint16_t        intr_vec;
129         uint32_t        intr_en;
130         pthread_mutex_t mtx;
131 };
132
133 struct nvme_submission_queue {
134         struct nvme_command *qbase;
135         uint32_t        size;
136         uint16_t        head; /* nvme progress */
137         uint16_t        tail; /* guest progress */
138         uint16_t        cqid; /* completion queue id */
139         int             busy; /* queue is being processed */
140         int             qpriority;
141 };
142
143 enum nvme_storage_type {
144         NVME_STOR_BLOCKIF = 0,
145         NVME_STOR_RAM = 1,
146 };
147
148 struct pci_nvme_blockstore {
149         enum nvme_storage_type type;
150         void            *ctx;
151         uint64_t        size;
152         uint32_t        sectsz;
153         uint32_t        sectsz_bits;
154 };
155
156 struct pci_nvme_ioreq {
157         struct pci_nvme_softc *sc;
158         struct pci_nvme_ioreq *next;
159         struct nvme_submission_queue *nvme_sq;
160         uint16_t        sqid;
161
162         /* command information */
163         uint16_t        opc;
164         uint16_t        cid;
165         uint32_t        nsid;
166
167         uint64_t        prev_gpaddr;
168         size_t          prev_size;
169
170         /*
171          * lock if all iovs consumed (big IO);
172          * complete transaction before continuing
173          */
174         pthread_mutex_t mtx;
175         pthread_cond_t  cv;
176
177         struct blockif_req io_req;
178
179         /* pad to fit up to 512 page descriptors from guest IO request */
180         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
181 };
182
183 struct pci_nvme_softc {
184         struct pci_devinst *nsc_pi;
185
186         pthread_mutex_t mtx;
187
188         struct nvme_registers regs;
189
190         struct nvme_namespace_data  nsdata;
191         struct nvme_controller_data ctrldata;
192
193         struct pci_nvme_blockstore nvstore;
194
195         uint16_t        max_qentries; /* max entries per queue */
196         uint32_t        max_queues;
197         uint32_t        num_cqueues;
198         uint32_t        num_squeues;
199
200         struct pci_nvme_ioreq *ioreqs;
201         struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
202         uint32_t        pending_ios;
203         uint32_t        ioslots;
204         sem_t           iosemlock;
205
206         /* status and guest memory mapped queues */
207         struct nvme_completion_queue *compl_queues;
208         struct nvme_submission_queue *submit_queues;
209
210         /* controller features */
211         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
212         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
213         uint32_t        async_ev_config;         /* 0x0B: async event config */
214 };
215
216
217 static void pci_nvme_io_partial(struct blockif_req *br, int err);
218
219 /* Controller Configuration utils */
220 #define NVME_CC_GET_EN(cc) \
221         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
222 #define NVME_CC_GET_CSS(cc) \
223         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
224 #define NVME_CC_GET_SHN(cc) \
225         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
226 #define NVME_CC_GET_IOSQES(cc) \
227         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
228 #define NVME_CC_GET_IOCQES(cc) \
229         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
230
231 #define NVME_CC_WRITE_MASK \
232         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
233          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
234          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
235
236 #define NVME_CC_NEN_WRITE_MASK \
237         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
238          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
239          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
240
241 /* Controller Status utils */
242 #define NVME_CSTS_GET_RDY(sts) \
243         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
244
245 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
246
247 /* Completion Queue status word utils */
248 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
249 #define NVME_STATUS_MASK \
250         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
251          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
252
253 static __inline void
254 cpywithpad(char *dst, int dst_size, const char *src, char pad)
255 {
256         int len = strnlen(src, dst_size);
257         memcpy(dst, src, len);
258         memset(dst + len, pad, dst_size - len);
259 }
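/*
 * The Identify string fields (model number, firmware revision, serial) are
 * fixed-width and space padded rather than NUL terminated, which is why the
 * initialization code below uses, for example,
 *   cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
 * to fill the 40-byte model number with "bhyve-NVMe" followed by 30 spaces
 * and no terminating NUL.
 */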
260
261 static __inline void
262 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
263 {
264
265         *status &= ~NVME_STATUS_MASK;
266         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
267                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
268 }
269
270 static __inline void
271 pci_nvme_status_genc(uint16_t *status, uint16_t code)
272 {
273
274         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
275 }
276
277 static __inline void
278 pci_nvme_toggle_phase(uint16_t *status, int prev)
279 {
280
281         if (prev)
282                 *status &= ~NVME_STATUS_P;
283         else
284                 *status |= NVME_STATUS_P;
285 }
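/*
 * Sketch of the phase tag handshake this helper implements: the guest zeroes
 * a newly created completion queue, so the first pass through the queue posts
 * entries with P=1; after the tail wraps back to slot 0, entries are posted
 * with P=0, and so on.  The guest knows a slot holds a fresh completion when
 * its phase bit differs from what it saw there on the previous pass.
 */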
286
287 static void
288 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
289 {
290         struct nvme_controller_data *cd = &sc->ctrldata;
291
292         cd->vid = 0xFB5D;
293         cd->ssvid = 0x0000;
294
295         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
296         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
297
298         /* Num of submission commands that we can handle at a time (2^rab) */
299         cd->rab   = 4;
300
301         /* FreeBSD OUI */
302         cd->ieee[0] = 0x58;
303         cd->ieee[1] = 0x9c;
304         cd->ieee[2] = 0xfc;
305
306         cd->mic = 0;
307
308         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
309
310         cd->ver = 0x00010300;
311
312         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
313         cd->acl = 2;
314         cd->aerl = 4;
315
316         cd->lpa = 0;    /* TODO: support some simple things like SMART */
317         cd->elpe = 0;   /* max error log page entries */
318         cd->npss = 1;   /* number of power states supported */
319
320         /* Warning Composite Temperature Threshold */
321         cd->wctemp = 0x0157;
322
323         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
324             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
325         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
326             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
327         cd->nn = 1;     /* number of namespaces */
328
329         cd->fna = 0x03;
330
331         cd->power_state[0].mp = 10;
332 }
333
334 static void
335 pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
336 {
337         struct nvme_namespace_data *nd;
338
339         nd = &sc->nsdata;
340
341         nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
342         nd->ncap = nd->nsze;
343         nd->nuse = nd->nsze;
344
345         /* Get LBA format information from the backing store */
346         nd->nlbaf = 1;
347         /* LBA data-sz = 2^lbads */
348         nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
349
350         nd->flbas = 0;
351 }
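/*
 * Worked example (illustrative backing-store size): a 4 GiB image with
 * 512-byte sectors has sectsz_bits = 9, so lbaf[0] advertises LBADS = 9
 * (2^9-byte LBAs) and nsze = ncap = nuse = 4 GiB / 512 = 8388608 blocks.
 */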
352
353 static void
354 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
355 {
356         DPRINTF(("%s\r\n", __func__));
357
358         sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) |
359             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
360             (60 << NVME_CAP_LO_REG_TO_SHIFT);
361
362         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
363
364         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
365
366         sc->regs.cc = 0;
367         sc->regs.csts = 0;
368
369         sc->num_cqueues = sc->num_squeues = sc->max_queues;
370         if (sc->submit_queues != NULL) {
371                 for (int i = 0; i <= sc->max_queues; i++) {
372                         /*
373                          * The Admin Submission Queue is at index 0.
374                          * It must not be changed at reset otherwise the
375                          * emulation will be out of sync with the guest.
376                          */
377                         if (i != 0) {
378                                 sc->submit_queues[i].qbase = NULL;
379                                 sc->submit_queues[i].size = 0;
380                                 sc->submit_queues[i].cqid = 0;
381
382                                 sc->compl_queues[i].qbase = NULL;
383                                 sc->compl_queues[i].size = 0;
384                         }
385                         sc->submit_queues[i].tail = 0;
386                         sc->submit_queues[i].head = 0;
387                         sc->submit_queues[i].busy = 0;
388
389                         sc->compl_queues[i].tail = 0;
390                         sc->compl_queues[i].head = 0;
391                 }
392         } else
393                 sc->submit_queues = calloc(sc->max_queues + 1,
394                                         sizeof(struct nvme_submission_queue));
395
396         if (sc->compl_queues == NULL) {
397                 sc->compl_queues = calloc(sc->max_queues + 1,
398                                         sizeof(struct nvme_completion_queue));
399
400                 for (int i = 0; i <= sc->num_cqueues; i++)
401                         pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
402         }
403 }
404
405 static void
406 pci_nvme_reset(struct pci_nvme_softc *sc)
407 {
408         pthread_mutex_lock(&sc->mtx);
409         pci_nvme_reset_locked(sc);
410         pthread_mutex_unlock(&sc->mtx);
411 }
412
413 static void
414 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
415 {
416         uint16_t acqs, asqs;
417
418         DPRINTF(("%s\r\n", __func__));
419
420         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
421         sc->submit_queues[0].size = asqs;
422         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
423                     sizeof(struct nvme_command) * asqs);
424
425         DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
426                 __func__, sc->regs.asq, sc->submit_queues[0].qbase));
427
428         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
429             NVME_AQA_REG_ACQS_MASK) + 1;
430         sc->compl_queues[0].size = acqs;
431         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
432                  sizeof(struct nvme_completion) * acqs);
433         DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
434                 __func__, sc->regs.acq, sc->compl_queues[0].qbase));
435 }
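/*
 * Worked example (illustrative AQA value): a guest write of AQA = 0x001F001F
 * yields ASQS = 0x1F + 1 = 32 admin SQ entries and ACQS = 0x1F + 1 = 32 admin
 * CQ entries, so the mappings above cover 32 * sizeof(struct nvme_command) =
 * 2048 bytes and 32 * sizeof(struct nvme_completion) = 512 bytes respectively.
 */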
436
437 static int
438 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
439         struct nvme_completion* compl)
440 {
441         uint16_t qid = command->cdw10 & 0xffff;
442
443         DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
444         if (qid == 0 || qid > sc->num_squeues) {
445                 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
446                         __func__, qid, sc->num_squeues));
447                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
448                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
449                 return (1);
450         }
451
452         sc->submit_queues[qid].qbase = NULL;
453         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
454         return (1);
455 }
456
457 static int
458 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
459         struct nvme_completion* compl)
460 {
461         if (command->cdw11 & NVME_CMD_CDW11_PC) {
462                 uint16_t qid = command->cdw10 & 0xffff;
463                 struct nvme_submission_queue *nsq;
464
465                 if (qid > sc->num_squeues) {
466                         WPRINTF(("%s queue index %u > num_squeues %u\r\n",
467                                 __func__, qid, sc->num_squeues));
468                         pci_nvme_status_tc(&compl->status,
469                             NVME_SCT_COMMAND_SPECIFIC,
470                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
471                         return (1);
472                 }
473
474                 nsq = &sc->submit_queues[qid];
475                 nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
476
477                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
478                               sizeof(struct nvme_command) * (size_t)nsq->size);
479                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
480                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
481
482                 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
483                         qid, nsq->size, nsq->qbase, nsq->cqid));
484
485                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
486
487                 DPRINTF(("%s completed creating IOSQ qid %u\r\n",
488                          __func__, qid));
489         } else {
490                 /* 
491                  * Guest requested a non-contiguous (PRP list based)
492                  * submission queue; this emulation does not support that.
493                  */
494                 WPRINTF(("%s unsupported non-contig (list-based) "
495                          "create i/o submission queue\r\n", __func__));
496
497                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
498         }
499         return (1);
500 }
501
502 static int
503 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
504         struct nvme_completion* compl)
505 {
506         uint16_t qid = command->cdw10 & 0xffff;
507
508         DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
509         if (qid == 0 || qid > sc->num_cqueues) {
510                 WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
511                         __func__, qid, sc->num_cqueues));
512                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
513                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
514                 return (1);
515         }
516
517         sc->compl_queues[qid].qbase = NULL;
518         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
519         return (1);
520 }
521
522 static int
523 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
524         struct nvme_completion* compl)
525 {
526         if (command->cdw11 & NVME_CMD_CDW11_PC) {
527                 uint16_t qid = command->cdw10 & 0xffff;
528                 struct nvme_completion_queue *ncq;
529
530                 if (qid > sc->num_cqueues) {
531                         WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
532                                 __func__, qid, sc->num_cqueues));
533                         pci_nvme_status_tc(&compl->status,
534                             NVME_SCT_COMMAND_SPECIFIC,
535                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
536                         return (1);
537                 }
538
539                 ncq = &sc->compl_queues[qid];
540                 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
541                 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
542                 ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
543
544                 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
545                              command->prp1,
546                              sizeof(struct nvme_completion) * (size_t)ncq->size);
547
548                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
549         } else {
550                 /* 
551                  * Non-contig completion queue unsupported.
552                  */
553                 WPRINTF(("%s unsupported non-contig (list-based) "
554                          "create i/o completion queue\r\n",
555                          __func__));
556
557                 /* 0x12 = Invalid Use of Controller Memory Buffer */
558                 pci_nvme_status_genc(&compl->status, 0x12);
559         }
560
561         return (1);
562 }
563
564 static int
565 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
566         struct nvme_completion* compl)
567 {
568         uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4; /* NUMD is in dwords */
569         uint8_t logpage = command->cdw10 & 0xFF;
570         void *data;
571
572         DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
573
574         if (logpage >= 1 && logpage <= 3)
575                 data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
576                                   PAGE_SIZE);
577
578         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
579
580         switch (logpage) {
581         case 0x01: /* Error information */
582                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
583                 break;
584         case 0x02: /* SMART/Health information */
585                 /* TODO: present some smart info */
586                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
587                 break;
588         case 0x03: /* Firmware slot information */
589                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
590                 break;
591         default:
592                 WPRINTF(("%s get log page %x command not supported\r\n",
593                         __func__, logpage));
594
595                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
596                     NVME_SC_INVALID_LOG_PAGE);
597         }
598
599         return (1);
600 }
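/*
 * Example decode (hypothetical command word): cdw10 = 0x007F0002 selects log
 * page 0x02 (SMART / Health) with NUMD = 0x7F, i.e. (0x7F + 1) dwords =
 * 512 bytes, which the handler above zeroes in the guest buffer (capped at
 * PAGE_SIZE).
 */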
601
602 static int
603 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
604         struct nvme_completion* compl)
605 {
606         void *dest;
607
608         DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
609                 command->cdw10 & 0xFF, command->nsid));
610
611         switch (command->cdw10 & 0xFF) {
612         case 0x00: /* return Identify Namespace data structure */
613                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
614                                   sizeof(sc->nsdata));
615                 memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
616                 break;
617         case 0x01: /* return Identify Controller data structure */
618                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
619                                   sizeof(sc->ctrldata));
620                 memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
621                 break;
622         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
623                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
624                                   sizeof(uint32_t) * 1024);
625                 ((uint32_t *)dest)[0] = 1;
626                 ((uint32_t *)dest)[1] = 0;
627                 break;
628         case 0x11:
629                 pci_nvme_status_genc(&compl->status,
630                     NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
631                 return (1);
632         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
633         case 0x10:
634         case 0x12:
635         case 0x13:
636         case 0x14:
637         case 0x15:
638         default:
639                 DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
640                          __func__, command->cdw10 & 0xFF));
641                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
642                 return (1);
643         }
644
645         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
646         return (1);
647 }
648
649 static int
650 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
651         struct nvme_completion* compl)
652 {
653         int feature = command->cdw10 & 0xFF;
654         uint32_t iv;
655
656         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
657         compl->cdw0 = 0;
658
659         switch (feature) {
660         case NVME_FEAT_ARBITRATION:
661                 DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
662                 break;
663         case NVME_FEAT_POWER_MANAGEMENT:
664                 DPRINTF(("  power management 0x%x\r\n", command->cdw11));
665                 break;
666         case NVME_FEAT_LBA_RANGE_TYPE:
667                 DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
668                 break;
669         case NVME_FEAT_TEMPERATURE_THRESHOLD:
670                 DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
671                 break;
672         case NVME_FEAT_ERROR_RECOVERY:
673                 DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
674                 break;
675         case NVME_FEAT_VOLATILE_WRITE_CACHE:
676                 DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
677                 break;
678         case NVME_FEAT_NUMBER_OF_QUEUES:
679                 sc->num_squeues = command->cdw11 & 0xFFFF;
680                 sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF;
681                 DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
682                         sc->num_squeues, sc->num_cqueues));
683
684                 if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues)
685                         sc->num_squeues = sc->max_queues;
686                 if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues)
687                         sc->num_cqueues = sc->max_queues;
688
689                 compl->cdw0 = (sc->num_squeues & 0xFFFF) |
690                               ((sc->num_cqueues & 0xFFFF) << 16);
691
692                 break;
693         case NVME_FEAT_INTERRUPT_COALESCING:
694                 DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
695
696                 /* in uS */
697                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
698
699                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
700                 break;
701         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
702                 iv = command->cdw11 & 0xFFFF;
703
704                 DPRINTF(("  interrupt vector configuration 0x%x\r\n",
705                         command->cdw11));
706
707                 for (uint32_t i = 0; i <= sc->num_cqueues; i++) {
708                         if (sc->compl_queues[i].intr_vec == iv) {
709                                 if (command->cdw11 & (1 << 16))
710                                         sc->compl_queues[i].intr_en |=
711                                                               NVME_CQ_INTCOAL;  
712                                 else
713                                         sc->compl_queues[i].intr_en &=
714                                                              ~NVME_CQ_INTCOAL;  
715                         }
716                 }
717                 break;
718         case NVME_FEAT_WRITE_ATOMICITY:
719                 DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
720                 break;
721         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
722                 DPRINTF(("  async event configuration 0x%x\r\n",
723                         command->cdw11));
724                 sc->async_ev_config = command->cdw11;
725                 break;
726         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
727                 DPRINTF(("  software progress marker 0x%x\r\n",
728                         command->cdw11));
729                 break;
730         case 0x0C:
731                 DPRINTF(("  autonomous power state transition 0x%x\r\n",
732                         command->cdw11));
733                 break;
734         default:
735                 WPRINTF(("%s invalid feature\r\n", __func__));
736                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
737                 return (1);
738         }
739
740         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
741         return (1);
742 }
743
744 static int
745 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
746         struct nvme_completion* compl)
747 {
748         int feature = command->cdw10 & 0xFF;
749
750         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
751
752         compl->cdw0 = 0;
753
754         switch (feature) {
755         case NVME_FEAT_ARBITRATION:
756                 DPRINTF(("  arbitration\r\n"));
757                 break;
758         case NVME_FEAT_POWER_MANAGEMENT:
759                 DPRINTF(("  power management\r\n"));
760                 break;
761         case NVME_FEAT_LBA_RANGE_TYPE:
762                 DPRINTF(("  lba range\r\n"));
763                 break;
764         case NVME_FEAT_TEMPERATURE_THRESHOLD:
765                 DPRINTF(("  temperature threshold\r\n"));
766                 switch ((command->cdw11 >> 20) & 0x3) {
767                 case 0:
768                         /* Over temp threshold */
769                         compl->cdw0 = 0xFFFF;
770                         break;
771                 case 1:
772                         /* Under temp threshold */
773                         compl->cdw0 = 0;
774                         break;
775                 default:
776                         WPRINTF(("  invalid threshold type select\r\n"));
777                         pci_nvme_status_genc(&compl->status,
778                             NVME_SC_INVALID_FIELD);
779                         return (1);
780                 }
781                 break;
782         case NVME_FEAT_ERROR_RECOVERY:
783                 DPRINTF(("  error recovery\r\n"));
784                 break;
785         case NVME_FEAT_VOLATILE_WRITE_CACHE:
786                 DPRINTF(("  volatile write cache\r\n"));
787                 break;
788         case NVME_FEAT_NUMBER_OF_QUEUES:
789                 compl->cdw0 = 0;
790                 if (sc->num_squeues == 0)
791                         compl->cdw0 |= sc->max_queues & 0xFFFF;
792                 else
793                         compl->cdw0 |= sc->num_squeues & 0xFFFF;
794
795                 if (sc->num_cqueues == 0)
796                         compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16;
797                 else
798                         compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16;
799
800                 DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
801                         compl->cdw0 & 0xFFFF,
802                         (compl->cdw0 >> 16) & 0xFFFF));
803
804                 break;
805         case NVME_FEAT_INTERRUPT_COALESCING:
806                 DPRINTF(("  interrupt coalescing\r\n"));
807                 break;
808         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
809                 DPRINTF(("  interrupt vector configuration\r\n"));
810                 break;
811         case NVME_FEAT_WRITE_ATOMICITY:
812                 DPRINTF(("  write atomicity\r\n"));
813                 break;
814         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
815                 DPRINTF(("  async event configuration\r\n"));
816                 sc->async_ev_config = command->cdw11;
817                 break;
818         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
819                 DPRINTF(("  software progress marker\r\n"));
820                 break;
821         case 0x0C:
822                 DPRINTF(("  autonomous power state transition\r\n"));
823                 break;
824         default:
825                 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
826                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
827                 return (1);
828         }
829
830         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
831         return (1);
832 }
833
834 static int
835 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
836         struct nvme_completion* compl)
837 {
838         DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
839                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
840
841         /* TODO: search for the command ID and abort it */
842
843         compl->cdw0 = 1;
844         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
845         return (1);
846 }
847
848 static int
849 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
850         struct nvme_command* command, struct nvme_completion* compl)
851 {
852         DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
853
854         /*
855          * TODO: raise events when they happen based on the Set Features cmd.
856          * These events happen async, so only set completion successful if
857          * there is an event reflective of the request to get event.
858          */
859         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
860             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
861         return (0);
862 }
863
864 static void
865 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
866 {
867         struct nvme_completion compl;
868         struct nvme_command *cmd;
869         struct nvme_submission_queue *sq;
870         struct nvme_completion_queue *cq;
871         int do_intr = 0;
872         uint16_t sqhead;
873
874         DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
875
876         sq = &sc->submit_queues[0];
877
878         sqhead = atomic_load_acq_short(&sq->head);
879
880         if (atomic_testandset_int(&sq->busy, 1)) {
881                 DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
882                         __func__, sqhead, sq->tail));
883                 return;
884         }
885
886         DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
887         
888         while (sqhead != atomic_load_acq_short(&sq->tail)) {
889                 cmd = &(sq->qbase)[sqhead];
890                 compl.status = 0;
891
892                 switch (cmd->opc) {
893                 case NVME_OPC_DELETE_IO_SQ:
894                         DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
895                         do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
896                         break;
897                 case NVME_OPC_CREATE_IO_SQ:
898                         DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
899                         do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
900                         break;
901                 case NVME_OPC_DELETE_IO_CQ:
902                         DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
903                         do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
904                         break;
905                 case NVME_OPC_CREATE_IO_CQ:
906                         DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
907                         do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
908                         break;
909                 case NVME_OPC_GET_LOG_PAGE:
910                         DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
911                         do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
912                         break;
913                 case NVME_OPC_IDENTIFY:
914                         DPRINTF(("%s command IDENTIFY\r\n", __func__));
915                         do_intr |= nvme_opc_identify(sc, cmd, &compl);
916                         break;
917                 case NVME_OPC_ABORT:
918                         DPRINTF(("%s command ABORT\r\n", __func__));
919                         do_intr |= nvme_opc_abort(sc, cmd, &compl);
920                         break;
921                 case NVME_OPC_SET_FEATURES:
922                         DPRINTF(("%s command SET_FEATURES\r\n", __func__));
923                         do_intr |= nvme_opc_set_features(sc, cmd, &compl);
924                         break;
925                 case NVME_OPC_GET_FEATURES:
926                         DPRINTF(("%s command GET_FEATURES\r\n", __func__));
927                         do_intr |= nvme_opc_get_features(sc, cmd, &compl);
928                         break;
929                 case NVME_OPC_ASYNC_EVENT_REQUEST:
930                         DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
931                         /* XXX don't care, unhandled for now
932                         do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
933                         */
934                         break;
935                 default:
936                         WPRINTF(("0x%x command is not implemented\r\n",
937                             cmd->opc));
938                 }
939         
940                 /* for now skip async event generation */
941                 if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
942                         struct nvme_completion *cp;
943                         int phase;
944
945                         cq = &sc->compl_queues[0];
946
947                         cp = &(cq->qbase)[cq->tail];
948                         cp->sqid = 0;
949                         cp->sqhd = sqhead;
950                         cp->cid = cmd->cid;
951
952                         phase = NVME_STATUS_GET_P(cp->status);
953                         cp->status = compl.status;
954                         pci_nvme_toggle_phase(&cp->status, phase);
955
956                         cq->tail = (cq->tail + 1) % cq->size;
957                 }
958                 sqhead = (sqhead + 1) % sq->size;
959         }
960
961         DPRINTF(("setting sqhead %u\r\n", sqhead));
962         atomic_store_short(&sq->head, sqhead);
963         atomic_store_int(&sq->busy, 0);
964
965         if (do_intr)
966                 pci_generate_msix(sc->nsc_pi, 0);
967
968 }
969
970 static int
971 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
972         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
973 {
974         int iovidx;
975
976         if (req != NULL) {
977                 /* concatenate contig block-iovs to minimize number of iovs */
978                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
979                         iovidx = req->io_req.br_iovcnt - 1;
980
981                         req->io_req.br_iov[iovidx].iov_base =
982                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
983                                              req->prev_gpaddr, size);
984
985                         req->prev_size += size;
986                         req->io_req.br_resid += size;
987
988                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
989                 } else {
990                         pthread_mutex_lock(&req->mtx);
991
992                         iovidx = req->io_req.br_iovcnt;
993                         if (iovidx == NVME_MAX_BLOCKIOVS) {
994                                 int err = 0;
995
996                                 DPRINTF(("large I/O, doing partial req\r\n"));
997
998                                 iovidx = 0;
999                                 req->io_req.br_iovcnt = 0;
1000
1001                                 req->io_req.br_callback = pci_nvme_io_partial;
1002
1003                                 if (!do_write)
1004                                         err = blockif_read(sc->nvstore.ctx,
1005                                                            &req->io_req);
1006                                 else
1007                                         err = blockif_write(sc->nvstore.ctx,
1008                                                             &req->io_req);
1009
1010                                 /* wait until req completes before cont */
1011                                 if (err == 0)
1012                                         pthread_cond_wait(&req->cv, &req->mtx);
1013                         }
1014                         if (iovidx == 0) {
1015                                 req->io_req.br_offset = lba;
1016                                 req->io_req.br_resid = 0;
1017                                 req->io_req.br_param = req;
1018                         }
1019
1020                         req->io_req.br_iov[iovidx].iov_base =
1021                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1022                                              gpaddr, size);
1023
1024                         req->io_req.br_iov[iovidx].iov_len = size;
1025
1026                         req->prev_gpaddr = gpaddr;
1027                         req->prev_size = size;
1028                         req->io_req.br_resid += size;
1029
1030                         req->io_req.br_iovcnt++;
1031
1032                         pthread_mutex_unlock(&req->mtx);
1033                 }
1034         } else {
1035                 /* RAM buffer: read/write directly */
1036                 void *p = sc->nvstore.ctx;
1037                 void *gptr;
1038
1039                 if ((lba + size) > sc->nvstore.size) {
1040                         WPRINTF(("%s I/O extends past end of RAM backing store\r\n", __func__));
1041                         return (-1);
1042                 }
1043
1044                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1045                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1046                 if (do_write) 
1047                         memcpy(p, gptr, size);
1048                 else
1049                         memcpy(gptr, p, size);
1050         }
1051         return (0);
1052 }
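/*
 * Example of the iov concatenation above (hypothetical guest addresses): if
 * consecutive PRP entries 0x10000 and 0x11000 arrive as two 4 KiB chunks,
 * the second call sees prev_gpaddr + prev_size == gpaddr and simply grows
 * the previous iov to 8 KiB instead of consuming another of the
 * NVME_MAX_BLOCKIOVS iovec slots.
 */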
1053
1054 static void
1055 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1056         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1057         uint32_t cdw0, uint16_t status, int ignore_busy)
1058 {
1059         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1060         struct nvme_completion *compl;
1061         int do_intr = 0;
1062         int phase;
1063
1064         DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1065                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1066                  NVME_STATUS_GET_SC(status)));
1067
1068         pthread_mutex_lock(&cq->mtx);
1069
1070         assert(cq->qbase != NULL);
1071
1072         compl = &cq->qbase[cq->tail];
1073
1074         compl->sqhd = atomic_load_acq_short(&sq->head);
1075         compl->sqid = sqid;
1076         compl->cid = cid;
1077
1078         // toggle phase
1079         phase = NVME_STATUS_GET_P(compl->status);
1080         compl->status = status;
1081         pci_nvme_toggle_phase(&compl->status, phase);
1082
1083         cq->tail = (cq->tail + 1) % cq->size;
1084
1085         if (cq->intr_en & NVME_CQ_INTEN)
1086                 do_intr = 1;
1087
1088         pthread_mutex_unlock(&cq->mtx);
1089
1090         if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1091                 if (do_intr)
1092                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1093 }
1094
1095 static void
1096 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1097 {
1098         req->sc = NULL;
1099         req->nvme_sq = NULL;
1100         req->sqid = 0;
1101
1102         pthread_mutex_lock(&sc->mtx);
1103
1104         req->next = sc->ioreqs_free;
1105         sc->ioreqs_free = req;
1106         sc->pending_ios--;
1107
1108         /* when no more IO pending, can set to ready if device reset/enabled */
1109         if (sc->pending_ios == 0 &&
1110             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1111                 sc->regs.csts |= NVME_CSTS_RDY;
1112
1113         pthread_mutex_unlock(&sc->mtx);
1114
1115         sem_post(&sc->iosemlock);
1116 }
1117
1118 static struct pci_nvme_ioreq *
1119 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1120 {
1121         struct pci_nvme_ioreq *req = NULL;
1122
1123         sem_wait(&sc->iosemlock);
1124         pthread_mutex_lock(&sc->mtx);
1125
1126         req = sc->ioreqs_free;
1127         assert(req != NULL);
1128
1129         sc->ioreqs_free = req->next;
1130
1131         req->next = NULL;
1132         req->sc = sc;
1133
1134         sc->pending_ios++;
1135
1136         pthread_mutex_unlock(&sc->mtx);
1137
1138         req->io_req.br_iovcnt = 0;
1139         req->io_req.br_offset = 0;
1140         req->io_req.br_resid = 0;
1141         req->io_req.br_param = req;
1142         req->prev_gpaddr = 0;
1143         req->prev_size = 0;
1144
1145         return req;
1146 }
1147
1148 static void
1149 pci_nvme_io_done(struct blockif_req *br, int err)
1150 {
1151         struct pci_nvme_ioreq *req = br->br_param;
1152         struct nvme_submission_queue *sq = req->nvme_sq;
1153         uint16_t code, status;
1154
1155         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1156         
1157         /* TODO return correct error */
1158         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1159         pci_nvme_status_genc(&status, code);
1160
1161         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1162         pci_nvme_release_ioreq(req->sc, req);
1163 }
1164
1165 static void
1166 pci_nvme_io_partial(struct blockif_req *br, int err)
1167 {
1168         struct pci_nvme_ioreq *req = br->br_param;
1169
1170         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1171
1172         pthread_cond_signal(&req->cv);
1173 }
1174
1175
1176 static void
1177 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1178 {
1179         struct nvme_submission_queue *sq;
1180         uint16_t status;
1181         uint16_t sqhead;
1182         int err;
1183
1184         /* handle all submissions up to sq->tail index */
1185         sq = &sc->submit_queues[idx];
1186
1187         if (atomic_testandset_int(&sq->busy, 1)) {
1188                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1189                 return;
1190         }
1191
1192         sqhead = atomic_load_acq_short(&sq->head);
1193
1194         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1195                  idx, sqhead, sq->tail, sq->qbase));
1196
1197         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1198                 struct nvme_command *cmd;
1199                 struct pci_nvme_ioreq *req = NULL;
1200                 uint64_t lba;
1201                 uint64_t nblocks, bytes, size, cpsz;
1202
1203                 /* TODO: support scatter gather list handling */
1204
1205                 cmd = &sq->qbase[sqhead];
1206                 sqhead = (sqhead + 1) % sq->size;
1207
1208                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1209
1210                 if (cmd->opc == NVME_OPC_FLUSH) {
1211                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1212                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1213                                                 status, 1);
1214
1215                         continue;
1216                 } else if (cmd->opc == 0x08) {
1217                         /* TODO: write zeroes */
1218                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1219                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1220                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1221                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1222                                                 status, 1);
1223
1224                         continue;
1225                 }
1226
1227                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1228
1229                 bytes = nblocks * sc->nvstore.sectsz;
1230
1231                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1232                         req = pci_nvme_get_ioreq(sc);
1233                         req->nvme_sq = sq;
1234                         req->sqid = idx;
1235                 }
1236
1237                 /*
1238                  * If data starts mid-page and flows into the next page, then
1239                  * increase page count
1240                  */
1241
1242                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1243                          "(%lu-bytes)\r\n",
1244                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1245                          cmd->opc == NVME_OPC_WRITE ?
1246                              "WRITE" : "READ",
1247                          lba, nblocks, bytes));
1248
1249                 cmd->prp1 &= ~(0x03UL);
1250                 cmd->prp2 &= ~(0x03UL);
1251
1252                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1253
1254                 size = bytes;
1255                 lba *= sc->nvstore.sectsz;
1256
1257                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1258
1259                 if (cpsz > bytes)
1260                         cpsz = bytes;
1261
1262                 if (req != NULL) {
1263                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1264                                                 cmd->cdw10;
1265                         req->opc = cmd->opc;
1266                         req->cid = cmd->cid;
1267                         req->nsid = cmd->nsid;
1268                 }
1269
1270                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1271                     cmd->opc == NVME_OPC_WRITE, lba);
1272                 lba += cpsz;
1273                 size -= cpsz;
1274
1275                 if (size == 0)
1276                         goto iodone;
1277
1278                 if (size <= PAGE_SIZE) {
1279                         /* prp2 is second (and final) page in transfer */
1280
1281                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1282                             size,
1283                             cmd->opc == NVME_OPC_WRITE,
1284                             lba);
1285                 } else {
1286                         uint64_t *prp_list;
1287                         int i;
1288
1289                         /* prp2 is pointer to a physical region page list */
1290                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1291                                                     cmd->prp2, PAGE_SIZE);
1292
1293                         i = 0;
1294                         while (size != 0) {
1295                                 cpsz = MIN(size, PAGE_SIZE);
1296
1297                                 /*
1298                                  * Move to linked physical region page list
1299                                  * in last item.
1300                                  */ 
1301                                 if (i == (NVME_PRP2_ITEMS-1) &&
1302                                     size > PAGE_SIZE) {
1303                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1304                                         prp_list = paddr_guest2host(
1305                                                       sc->nsc_pi->pi_vmctx,
1306                                                       prp_list[i], PAGE_SIZE);
1307                                         i = 0;
1308                                 }
1309                                 if (prp_list[i] == 0) {
1310                                         WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1311                                         err = 1;
1312                                         break;
1313                                 }
1314
1315                                 err = pci_nvme_append_iov_req(sc, req,
1316                                     prp_list[i], cpsz,
1317                                     cmd->opc == NVME_OPC_WRITE, lba);
1318                                 if (err)
1319                                         break;
1320
1321                                 lba += cpsz;
1322                                 size -= cpsz;
1323                                 i++;
1324                         }
1325                 }
1326
1327 iodone:
1328                 if (sc->nvstore.type == NVME_STOR_RAM) {
1329                         uint16_t code, status;
1330
1331                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1332                             NVME_SC_SUCCESS;
1333                         pci_nvme_status_genc(&status, code);
1334
1335                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1336                                                 status, 1);
1337
1338                         continue;
1339                 }
1340
1341
1342                 if (err)
1343                         goto do_error;
1344
1345                 req->io_req.br_callback = pci_nvme_io_done;
1346
1347                 err = 0;
1348                 switch (cmd->opc) {
1349                 case NVME_OPC_READ:
1350                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1351                         break;
1352                 case NVME_OPC_WRITE:
1353                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1354                         break;
1355                 default:
1356                         WPRINTF(("%s unhandled io command 0x%x\r\n",
1357                                  __func__, cmd->opc));
1358                         err = 1;
1359                 }
1360
1361 do_error:
1362                 if (err) {
1363                         uint16_t status;
1364
1365                         pci_nvme_status_genc(&status,
1366                             NVME_SC_DATA_TRANSFER_ERROR);
1367
1368                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1369                                                 status, 1);
1370                         pci_nvme_release_ioreq(sc, req);
1371                 }
1372         }
1373
1374         atomic_store_short(&sq->head, sqhead);
1375         atomic_store_int(&sq->busy, 0);
1376 }
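/*
 * Worked PRP example for the handler above (hypothetical addresses, 4 KiB
 * pages): a 12 KiB read with prp1 = 0x20800 first transfers
 * PAGE_SIZE - 0x800 = 2 KiB from prp1; the remaining 10 KiB is larger than
 * one page, so prp2 is treated as a pointer to a PRP list whose entries are
 * consumed MIN(size, PAGE_SIZE) bytes at a time until the transfer is
 * satisfied.
 */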
1377
1378 static void
1379 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1380         uint64_t idx, int is_sq, uint64_t value)
1381 {
1382         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1383                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1384
1385         if (is_sq) {
1386                 atomic_store_short(&sc->submit_queues[idx].tail,
1387                                    (uint16_t)value);
1388
1389                 if (idx == 0) {
1390                         pci_nvme_handle_admin_cmd(sc, value);
1391                 } else {
1392                         /* submission queue; handle new entries in SQ */
1393                         if (idx > sc->num_squeues) {
1394                                 WPRINTF(("%s SQ index %lu overflow from "
1395                                          "guest (max %u)\r\n",
1396                                          __func__, idx, sc->num_squeues));
1397                                 return;
1398                         }
1399                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1400                 }
1401         } else {
1402                 if (idx > sc->num_cqueues) {
1403                         WPRINTF(("%s queue index %lu overflow from "
1404                                  "guest (max %u)\r\n",
1405                                  __func__, idx, sc->num_cqueues));
1406                         return;
1407                 }
1408
1409                 sc->compl_queues[idx].head = (uint16_t)value;
1410         }
1411 }
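/*
 * Worked doorbell decode (offsets assume the FreeBSD struct nvme_registers
 * layout, where the doorbell array starts at BAR0 offset 0x1000): a guest
 * write at 0x1000 is idx 0 / SQ (admin SQ tail), 0x1004 is idx 0 / CQ
 * (admin CQ head), 0x1008 is idx 1 / SQ and 0x100c is idx 1 / CQ, matching
 * the belloffset / 8 and (belloffset % 8) < 4 decode in
 * pci_nvme_write_bar_0() below.
 */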
1412
1413 static void
1414 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1415 {
1416         const char *s = iswrite ? "WRITE" : "READ";
1417
1418         switch (offset) {
1419         case NVME_CR_CAP_LOW:
1420                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1421                 break;
1422         case NVME_CR_CAP_HI:
1423                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1424                 break;
1425         case NVME_CR_VS:
1426                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1427                 break;
1428         case NVME_CR_INTMS:
1429                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1430                 break;
1431         case NVME_CR_INTMC:
1432                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1433                 break;
1434         case NVME_CR_CC:
1435                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1436                 break;
1437         case NVME_CR_CSTS:
1438                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1439                 break;
1440         case NVME_CR_NSSR:
1441                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1442                 break;
1443         case NVME_CR_AQA:
1444                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1445                 break;
1446         case NVME_CR_ASQ_LOW:
1447                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1448                 break;
1449         case NVME_CR_ASQ_HI:
1450                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1451                 break;
1452         case NVME_CR_ACQ_LOW:
1453                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1454                 break;
1455         case NVME_CR_ACQ_HI:
1456                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1457                 break;
1458         default:
1459                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1460         }
1461
1462 }
1463
1464 static void
1465 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1466         uint64_t offset, int size, uint64_t value)
1467 {
1468         uint32_t ccreg;
1469
1470         if (offset >= NVME_DOORBELL_OFFSET) {
1471                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1472                 uint64_t idx = belloffset / 8; /* doorbell pair: SQ tail + CQ head, 4 bytes each */
1473                 int is_sq = (belloffset % 8) < 4;
1474
1475                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1476                         WPRINTF(("guest attempted an overflow write offset "
1477                                  "0x%lx, val 0x%lx in %s\r\n",
1478                                  offset, value, __func__));
1479                         return;
1480                 }
1481
1482                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1483                 return;
1484         }
1485
1486         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1487                 offset, size, value));
1488
1489         if (size != 4) {
1490                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1491                          "val 0x%lx) to bar0 in %s\r\n",
1492                          size, offset, value, __func__));
1493                 /* TODO: shutdown device */
1494                 return;
1495         }
1496
1497         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1498
1499         pthread_mutex_lock(&sc->mtx);
1500
1501         switch (offset) {
1502         case NVME_CR_CAP_LOW:
1503         case NVME_CR_CAP_HI:
1504                 /* readonly */
1505                 break;
1506         case NVME_CR_VS:
1507                 /* readonly */
1508                 break;
1509         case NVME_CR_INTMS:
1510                 /* MSI-X, so ignore */
1511                 break;
1512         case NVME_CR_INTMC:
1513                 /* MSI-X, so ignore */
1514                 break;
1515         case NVME_CR_CC:
1516                 ccreg = (uint32_t)value;
1517
1518                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1519                          "iocqes %u\r\n",
1520                         __func__,
1521                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1522                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1523                          NVME_CC_GET_IOCQES(ccreg)));
1524
1525                 if (NVME_CC_GET_SHN(ccreg)) {
1526                         /* shutdown requested: report shutdown processing complete in CSTS.SHST */
1527                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1528                             NVME_CSTS_REG_SHST_SHIFT);
1529                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1530                             NVME_CSTS_REG_SHST_SHIFT;
1531                 }
1532                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1533                         if (NVME_CC_GET_EN(ccreg) == 0)
1534                                 /* transition 1->0 causes controller reset */
1535                                 pci_nvme_reset_locked(sc);
1536                         else
1537                                 pci_nvme_init_controller(ctx, sc);
1538                 }
1539
1540                 /* Insert the iocqes, iosqes and en bits from the write */
1541                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1542                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1543                 if (NVME_CC_GET_EN(ccreg) == 0) {
1544                         /* Insert the ams, mps and css bit fields */
1545                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1546                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1547                         sc->regs.csts &= ~NVME_CSTS_RDY;
1548                 } else if (sc->pending_ios == 0) {
1549                         sc->regs.csts |= NVME_CSTS_RDY;
1550                 }
1551                 break;
1552         case NVME_CR_CSTS:
1553                 break;
1554         case NVME_CR_NSSR:
1555                 /* ignore writes; don't support subsystem reset */
1556                 break;
1557         case NVME_CR_AQA:
1558                 sc->regs.aqa = (uint32_t)value;
1559                 break;
1560         case NVME_CR_ASQ_LOW:
1561                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1562                                (0xFFFFF000 & value);
1563                 break;
1564         case NVME_CR_ASQ_HI:
1565                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1566                                (value << 32);
1567                 break;
1568         case NVME_CR_ACQ_LOW:
1569                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1570                                (0xFFFFF000 & value);
1571                 break;
1572         case NVME_CR_ACQ_HI:
1573                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1574                                (value << 32);
1575                 break;
1576         default:
1577                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1578                          __func__, offset, value, size));
1579         }
1580         pthread_mutex_unlock(&sc->mtx);
1581 }
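
For reference, a minimal guest-side sketch (hypothetical, not part of this file) of the enable handshake the NVME_CR_CC handling above emulates: writing CC with EN set triggers pci_nvme_init_controller() and CSTS.RDY is raised once no I/O is pending; clearing EN resets the controller and drops RDY. The unbounded poll is for brevity only.

static void
guest_enable_controller(volatile struct nvme_registers *regs)
{
        /* CC.EN is bit 0; the emulation above reacts to the 0 -> 1 edge. */
        regs->cc |= 0x1;

        /* CSTS.RDY (bit 0) is set by the emulation once it is ready. */
        while ((regs->csts & 0x1) == 0)
                ;       /* a real driver bounds this wait by CAP.TO */
}
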
1582
1583 static void
1584 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1585                 int baridx, uint64_t offset, int size, uint64_t value)
1586 {
1587         struct pci_nvme_softc* sc = pi->pi_arg;
1588
1589         if (baridx == pci_msix_table_bar(pi) ||
1590             baridx == pci_msix_pba_bar(pi)) {
1591                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1592                          " value 0x%lx\r\n", baridx, offset, size, value));
1593
1594                 pci_emul_msix_twrite(pi, offset, size, value);
1595                 return;
1596         }
1597
1598         switch (baridx) {
1599         case 0:
1600                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1601                 break;
1602
1603         default:
1604                 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1605                          __func__, baridx, value));
1606         }
1607 }
1608
1609 static uint64_t
1610 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1611 {
1612         uint64_t value;
1613
1614         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1615
1616         if (offset < NVME_DOORBELL_OFFSET) {
1617                 void *p = &(sc->regs);
1618                 pthread_mutex_lock(&sc->mtx);
1619                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1620                 pthread_mutex_unlock(&sc->mtx);
1621         } else {
1622                 value = 0;
1623                 WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
1624         }
1625
1626         switch (size) {
1627         case 1:
1628                 value &= 0xFF;
1629                 break;
1630         case 2:
1631                 value &= 0xFFFF;
1632                 break;
1633         case 4:
1634                 value &= 0xFFFFFFFF;
1635                 break;
1636         }
1637
1638         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1639                  offset, size, (uint32_t)value));
1640
1641         return (value);
1642 }
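
Since the read path above copies `size` bytes straight out of sc->regs, a 64-bit register such as CAP can be fetched as two 4-byte accesses at NVME_CR_CAP_LOW / NVME_CR_CAP_HI. A hypothetical guest-side sketch of that (bar0 and the helper name are illustrative only):

static uint64_t
guest_read_cap(volatile uint32_t *bar0)
{
        uint64_t cap_lo = bar0[NVME_CR_CAP_LOW / 4];    /* offset 0x00 */
        uint64_t cap_hi = bar0[NVME_CR_CAP_HI / 4];     /* offset 0x04 */

        return ((cap_hi << 32) | cap_lo);
}
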
1643
1644
1645
1646 static uint64_t
1647 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1648     uint64_t offset, int size)
1649 {
1650         struct pci_nvme_softc* sc = pi->pi_arg;
1651
1652         if (baridx == pci_msix_table_bar(pi) ||
1653             baridx == pci_msix_pba_bar(pi)) {
1654                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1655                         baridx, offset, size));
1656
1657                 return pci_emul_msix_tread(pi, offset, size);
1658         }
1659
1660         switch (baridx) {
1661         case 0:
1662                 return pci_nvme_read_bar_0(sc, offset, size);
1663
1664         default:
1665                 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1666         }
1667
1668         return (0);
1669 }
1670
1671
1672 static int
1673 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1674 {
1675         char bident[sizeof("XX:X:X")];
1676         char    *uopt, *xopts, *config;
1677         uint32_t sectsz;
1678         int optidx;
1679
1680         sc->max_queues = NVME_QUEUES;
1681         sc->max_qentries = NVME_MAX_QENTRIES;
1682         sc->ioslots = NVME_IOSLOTS;
1683         sc->num_squeues = sc->max_queues;
1684         sc->num_cqueues = sc->max_queues;
1685         sectsz = 0;
1686
1687         if (opts == NULL) {
                     fprintf(stderr, "backing store not specified\n");
                     return (-1);
             }
             uopt = strdup(opts);
1688         optidx = 0;
1689         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1690                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1691         for (xopts = strtok(uopt, ",");
1692              xopts != NULL;
1693              xopts = strtok(NULL, ",")) {
1694
1695                 if ((config = strchr(xopts, '=')) != NULL)
1696                         *config++ = '\0';
1697
1698                 if (!strcmp("maxq", xopts)) {
1699                         sc->max_queues = atoi(config);
1700                 } else if (!strcmp("qsz", xopts)) {
1701                         sc->max_qentries = atoi(config);
1702                 } else if (!strcmp("ioslots", xopts)) {
1703                         sc->ioslots = atoi(config);
1704                 } else if (!strcmp("sectsz", xopts)) {
1705                         sectsz = atoi(config);
1706                 } else if (!strcmp("ser", xopts)) {
1707                         /*
1708                          * This field indicates the Product Serial Number in
1709                          * 7-bit ASCII, unused bytes should be space characters.
1710                          * Ref: NVMe v1.3c.
1711                          */
1712                         cpywithpad((char *)sc->ctrldata.sn,
1713                                    sizeof(sc->ctrldata.sn), config, ' ');
1714                 } else if (!strcmp("ram", xopts)) {
1715                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
1716
1717                         sc->nvstore.type = NVME_STOR_RAM;
1718                         sc->nvstore.size = sz * 1024 * 1024;
1719                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1720                         sc->nvstore.sectsz = 4096;
1721                         sc->nvstore.sectsz_bits = 12;
1722                         if (sc->nvstore.ctx == NULL) {
1723                                 perror("Unable to allocate RAM");
1724                                 free(uopt);
1725                                 return (-1);
1726                         }
1727                 } else if (optidx == 0) {
1728                         snprintf(bident, sizeof(bident), "%d:%d",
1729                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1730                         sc->nvstore.ctx = blockif_open(xopts, bident);
1731                         if (sc->nvstore.ctx == NULL) {
1732                                 perror("Could not open backing file");
1733                                 free(uopt);
1734                                 return (-1);
1735                         }
1736                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1737                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1738                 } else {
1739                         fprintf(stderr, "Invalid option %s\n", xopts);
1740                         free(uopt);
1741                         return (-1);
1742                 }
1743
1744                 optidx++;
1745         }
1746         free(uopt);
1747
1748         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1749                 fprintf(stderr, "backing store not specified\n");
1750                 return (-1);
1751         }
1752         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1753                 sc->nvstore.sectsz = sectsz;
1754         else if (sc->nvstore.type != NVME_STOR_RAM)
1755                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1756         for (sc->nvstore.sectsz_bits = 9;
1757              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1758              sc->nvstore.sectsz_bits++);
1759
1760         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1761                 sc->max_queues = NVME_QUEUES;
1762
1763         if (sc->max_qentries <= 0) {
1764                 fprintf(stderr, "Invalid qsz option\n");
1765                 return (-1);
1766         }
1767         if (sc->ioslots <= 0) {
1768                 fprintf(stderr, "Invalid ioslots option\n");
1769                 return (-1);
1770         }
1771
1772         return (0);
1773 }
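
Two hypothetical option strings this parser accepts (the path, serial number, and sizes are illustrative only). The first bare token becomes the blockif backing path; "ram=<MiB>" instead selects a RAM-backed namespace with a fixed 4096-byte sector size:

        /dev/zvol/tank/vm0-nvme0,sectsz=4096,ser=VM0NVME0001
        ram=1024,maxq=8,qsz=1024,ioslots=16
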
1774
1775 static int
1776 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1777 {
1778         struct pci_nvme_softc *sc;
1779         uint32_t pci_membar_sz;
1780         int     error;
1781
1782         error = 0;
1783
1784         sc = calloc(1, sizeof(struct pci_nvme_softc));
1785         pi->pi_arg = sc;
1786         sc->nsc_pi = pi;
1787
1788         error = pci_nvme_parse_opts(sc, opts);
1789         if (error < 0)
1790                 goto done;
1791         else
1792                 error = 0;
1793
1794         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1795         for (int i = 0; i < sc->ioslots; i++) {
1796                 if (i < (sc->ioslots-1))
1797                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1798                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1799                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1800         }
1801         sc->ioreqs_free = sc->ioreqs;
1802         sc->intr_coales_aggr_thresh = 1;
1803
1804         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1805         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1806         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1807         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1808         pci_set_cfgdata8(pi, PCIR_PROGIF,
1809                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1810
1811         /* allocate size of nvme registers + doorbell space for all queues */
1812         pci_membar_sz = sizeof(struct nvme_registers) +
1813                         2*sizeof(uint32_t)*(sc->max_queues);
1814
1815         DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1816
1817         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1818         if (error) {
1819                 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1820                 goto done;
1821         }
1822
1823         error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR);
1824         if (error) {
1825                 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1826                 goto done;
1827         }
1828
1829         pthread_mutex_init(&sc->mtx, NULL);
1830         sem_init(&sc->iosemlock, 0, sc->ioslots);
1831
1832         pci_nvme_reset(sc);
1833         pci_nvme_init_ctrldata(sc);
1834         pci_nvme_init_nsdata(sc);
1835
1836         pci_lintr_request(pi);
1837
1838 done:
1839         return (error);
1840 }
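
As a worked example of the BAR 0 sizing above, with the default of 16 queues the request is the fixed register block plus one 8-byte doorbell pair (SQ tail + CQ head) per I/O queue; struct nvme_registers already ends with the admin queue's doorbell pair at NVME_DOORBELL_OFFSET:

        pci_membar_sz = sizeof(struct nvme_registers) + 2 * sizeof(uint32_t) * 16
                      = sizeof(struct nvme_registers) + 128 bytes

Since PCI BARs must be power-of-two sized, pci_emul_alloc_bar() is expected to round this request up as needed.
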
1841
1842
1843 struct pci_devemu pci_de_nvme = {
1844         .pe_emu =       "nvme",
1845         .pe_init =      pci_nvme_init,
1846         .pe_barwrite =  pci_nvme_write,
1847         .pe_barread =   pci_nvme_read
1848 };
1849 PCI_EMUL_SET(pci_de_nvme);