1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 /*
30  * bhyve PCIe-NVMe device emulation.
31  *
32  * options:
33  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
34  *
35  *  accepted devpath:
36  *    /dev/blockdev
37  *    /path/to/image
38  *    ram=size_in_MiB
39  *
40  *  maxq    = max number of queues
41  *  qsz     = max elements in each queue
42  *  ioslots = max number of concurrent io requests
43  *  sectsz  = sector size (defaults to blockif sector size)
44  *  ser     = serial number (20-chars max)
45  *
46  */
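/*
 * Illustrative examples of the option string above (the slot number, backing
 * store paths and serial shown here are made up, not defaults):
 *
 *   -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=8,qsz=512,ioslots=16,sectsz=512,ser=NVMEDISK
 *   -s 4,nvme,ram=1024
 */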
47
48 /* TODO:
49     - create async event for smart and log
50     - intr coalesce
51  */
52
53 #include <sys/cdefs.h>
54 __FBSDID("$FreeBSD$");
55
56 #include <sys/types.h>
57
58 #include <assert.h>
59 #include <pthread.h>
60 #include <semaphore.h>
61 #include <stdbool.h>
62 #include <stddef.h>
63 #include <stdint.h>
64 #include <stdio.h>
65 #include <stdlib.h>
66 #include <string.h>
67
68 #include <machine/atomic.h>
69 #include <machine/vmm.h>
70 #include <vmmapi.h>
71
72 #include <dev/nvme/nvme.h>
73
74 #include "bhyverun.h"
75 #include "block_if.h"
76 #include "pci_emul.h"
77
78
79 static int nvme_debug = 0;
 80 #define DPRINTF(params) do { if (nvme_debug) printf params; } while (0)
81 #define WPRINTF(params) printf params
82
83 /* defaults; can be overridden */
84 #define NVME_MSIX_BAR           4
85
86 #define NVME_IOSLOTS            8
87
88 #define NVME_QUEUES             16
89 #define NVME_MAX_QENTRIES       2048
90
91 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
92 #define NVME_MAX_BLOCKIOVS      512
93
94 /* helpers */
95
96 /* Convert a zero-based value into a one-based value */
97 #define ONE_BASED(zero)         ((zero) + 1)
98 /* Convert a one-based value into a zero-based value */
99 #define ZERO_BASED(one)         ((one)  - 1)
100
101 /* Encode number of SQ's and CQ's for Set/Get Features */
102 #define NVME_FEATURE_NUM_QUEUES(sc) \
 103         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
 104          (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
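/* e.g. with num_squeues = 4 and num_cqueues = 4 this encodes as 0x00030003 */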
105
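/*
 * This emulation reports CAP.DSTRD = 0, so each queue pair has a 4-byte
 * SQ tail doorbell at NVME_DOORBELL_OFFSET + 8 * qid followed by a 4-byte
 * CQ head doorbell at NVME_DOORBELL_OFFSET + 8 * qid + 4.
 */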
106 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
107
108 enum nvme_controller_register_offsets {
109         NVME_CR_CAP_LOW = 0x00,
110         NVME_CR_CAP_HI  = 0x04,
111         NVME_CR_VS      = 0x08,
112         NVME_CR_INTMS   = 0x0c,
113         NVME_CR_INTMC   = 0x10,
114         NVME_CR_CC      = 0x14,
115         NVME_CR_CSTS    = 0x1c,
116         NVME_CR_NSSR    = 0x20,
117         NVME_CR_AQA     = 0x24,
118         NVME_CR_ASQ_LOW = 0x28,
119         NVME_CR_ASQ_HI  = 0x2c,
120         NVME_CR_ACQ_LOW = 0x30,
121         NVME_CR_ACQ_HI  = 0x34,
122 };
123
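/* CDW11 fields used by the Create I/O Submission/Completion Queue commands */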
124 enum nvme_cmd_cdw11 {
125         NVME_CMD_CDW11_PC  = 0x0001,
126         NVME_CMD_CDW11_IEN = 0x0002,
127         NVME_CMD_CDW11_IV  = 0xFFFF0000,
128 };
129
130 #define NVME_CQ_INTEN   0x01
131 #define NVME_CQ_INTCOAL 0x02
132
133 struct nvme_completion_queue {
134         struct nvme_completion *qbase;
135         uint32_t        size;
136         uint16_t        tail; /* nvme progress */
137         uint16_t        head; /* guest progress */
138         uint16_t        intr_vec;
139         uint32_t        intr_en;
140         pthread_mutex_t mtx;
141 };
142
143 struct nvme_submission_queue {
144         struct nvme_command *qbase;
145         uint32_t        size;
146         uint16_t        head; /* nvme progress */
147         uint16_t        tail; /* guest progress */
148         uint16_t        cqid; /* completion queue id */
149         int             busy; /* queue is being processed */
150         int             qpriority;
151 };
152
153 enum nvme_storage_type {
154         NVME_STOR_BLOCKIF = 0,
155         NVME_STOR_RAM = 1,
156 };
157
158 struct pci_nvme_blockstore {
159         enum nvme_storage_type type;
160         void            *ctx;
161         uint64_t        size;
162         uint32_t        sectsz;
163         uint32_t        sectsz_bits;
164 };
165
166 struct pci_nvme_ioreq {
167         struct pci_nvme_softc *sc;
168         struct pci_nvme_ioreq *next;
169         struct nvme_submission_queue *nvme_sq;
170         uint16_t        sqid;
171
172         /* command information */
173         uint16_t        opc;
174         uint16_t        cid;
175         uint32_t        nsid;
176
177         uint64_t        prev_gpaddr;
178         size_t          prev_size;
179
180         /*
181          * lock if all iovs consumed (big IO);
182          * complete transaction before continuing
183          */
184         pthread_mutex_t mtx;
185         pthread_cond_t  cv;
186
187         struct blockif_req io_req;
188
189         /* pad to fit up to 512 page descriptors from guest IO request */
190         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
191 };
192
193 struct pci_nvme_softc {
194         struct pci_devinst *nsc_pi;
195
196         pthread_mutex_t mtx;
197
198         struct nvme_registers regs;
199
200         struct nvme_namespace_data  nsdata;
201         struct nvme_controller_data ctrldata;
202
203         struct pci_nvme_blockstore nvstore;
204
205         uint16_t        max_qentries;   /* max entries per queue */
206         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
207         uint32_t        num_cqueues;
208         uint32_t        num_squeues;
209
210         struct pci_nvme_ioreq *ioreqs;
211         struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
212         uint32_t        pending_ios;
213         uint32_t        ioslots;
214         sem_t           iosemlock;
215
216         /*
217          * Memory mapped Submission and Completion queues
218          * Each array includes both Admin and IO queues
219          */
220         struct nvme_completion_queue *compl_queues;
221         struct nvme_submission_queue *submit_queues;
222
223         /* controller features */
224         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
225         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
226         uint32_t        async_ev_config;         /* 0x0B: async event config */
227 };
228
229
230 static void pci_nvme_io_partial(struct blockif_req *br, int err);
231
232 /* Controller Configuration utils */
233 #define NVME_CC_GET_EN(cc) \
234         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
235 #define NVME_CC_GET_CSS(cc) \
236         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
237 #define NVME_CC_GET_SHN(cc) \
238         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
239 #define NVME_CC_GET_IOSQES(cc) \
240         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
241 #define NVME_CC_GET_IOCQES(cc) \
242         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
243
244 #define NVME_CC_WRITE_MASK \
245         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
246          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
247          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
248
249 #define NVME_CC_NEN_WRITE_MASK \
250         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
251          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
252          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
253
254 /* Controller Status utils */
255 #define NVME_CSTS_GET_RDY(sts) \
256         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
257
258 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
259
260 /* Completion Queue status word utils */
261 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
262 #define NVME_STATUS_MASK \
263         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
264          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
265
266 static __inline void
267 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
268 {
269         size_t len;
270
271         len = strnlen(src, dst_size);
272         memset(dst, pad, dst_size);
273         memcpy(dst, src, len);
274 }
275
276 static __inline void
277 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
278 {
279
280         *status &= ~NVME_STATUS_MASK;
281         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
282                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
283 }
284
285 static __inline void
286 pci_nvme_status_genc(uint16_t *status, uint16_t code)
287 {
288
289         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
290 }
291
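/*
 * The Phase Tag bit in each completion entry flips every time the completion
 * queue wraps, which is how the guest distinguishes newly posted entries
 * from stale ones it has already consumed.
 */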
292 static __inline void
293 pci_nvme_toggle_phase(uint16_t *status, int prev)
294 {
295
296         if (prev)
297                 *status &= ~NVME_STATUS_P;
298         else
299                 *status |= NVME_STATUS_P;
300 }
301
302 static void
303 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
304 {
305         struct nvme_controller_data *cd = &sc->ctrldata;
306
307         cd->vid = 0xFB5D;
308         cd->ssvid = 0x0000;
309
310         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
311         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
312
313         /* Num of submission commands that we can handle at a time (2^rab) */
314         cd->rab   = 4;
315
316         /* FreeBSD OUI */
317         cd->ieee[0] = 0x58;
318         cd->ieee[1] = 0x9c;
319         cd->ieee[2] = 0xfc;
320
321         cd->mic = 0;
322
323         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
324
325         cd->ver = 0x00010300;
326
327         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
328         cd->acl = 2;
329         cd->aerl = 4;
330
331         cd->lpa = 0;    /* TODO: support some simple things like SMART */
332         cd->elpe = 0;   /* max error log page entries */
 333         cd->npss = 1;   /* number of power states supported */
334
335         /* Warning Composite Temperature Threshold */
336         cd->wctemp = 0x0157;
337
338         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
339             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
340         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
341             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
342         cd->nn = 1;     /* number of namespaces */
343
344         cd->fna = 0x03;
345
346         cd->power_state[0].mp = 10;
347 }
348
349 static void
350 pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
351 {
352         struct nvme_namespace_data *nd;
353
354         nd = &sc->nsdata;
355
356         nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
357         nd->ncap = nd->nsze;
358         nd->nuse = nd->nsze;
359
360         /* Get LBA and backstore information from backing store */
361         nd->nlbaf = 1;
362         /* LBA data-sz = 2^lbads */
363         nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
364
365         nd->flbas = 0;
366 }
367
368 static void
369 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
370 {
371         DPRINTF(("%s\r\n", __func__));
372
373         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
374             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
375             (60 << NVME_CAP_LO_REG_TO_SHIFT);
376
377         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
378
379         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
380
381         sc->regs.cc = 0;
382         sc->regs.csts = 0;
383
384         sc->num_cqueues = sc->num_squeues = sc->max_queues;
385         if (sc->submit_queues != NULL) {
386                 for (int i = 0; i < sc->num_squeues + 1; i++) {
387                         /*
388                          * The Admin Submission Queue is at index 0.
389                          * It must not be changed at reset otherwise the
390                          * emulation will be out of sync with the guest.
391                          */
392                         if (i != 0) {
393                                 sc->submit_queues[i].qbase = NULL;
394                                 sc->submit_queues[i].size = 0;
395                                 sc->submit_queues[i].cqid = 0;
396                         }
397                         sc->submit_queues[i].tail = 0;
398                         sc->submit_queues[i].head = 0;
399                         sc->submit_queues[i].busy = 0;
400                 }
401         } else
402                 sc->submit_queues = calloc(sc->num_squeues + 1,
403                                         sizeof(struct nvme_submission_queue));
404
405         if (sc->compl_queues != NULL) {
406                 for (int i = 0; i < sc->num_cqueues + 1; i++) {
407                         /* See Admin Submission Queue note above */
408                         if (i != 0) {
409                                 sc->compl_queues[i].qbase = NULL;
410                                 sc->compl_queues[i].size = 0;
411                         }
412
413                         sc->compl_queues[i].tail = 0;
414                         sc->compl_queues[i].head = 0;
415                 }
416         } else {
417                 sc->compl_queues = calloc(sc->num_cqueues + 1,
418                                         sizeof(struct nvme_completion_queue));
419
420                 for (int i = 0; i < sc->num_cqueues + 1; i++)
421                         pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
422         }
423 }
424
425 static void
426 pci_nvme_reset(struct pci_nvme_softc *sc)
427 {
428         pthread_mutex_lock(&sc->mtx);
429         pci_nvme_reset_locked(sc);
430         pthread_mutex_unlock(&sc->mtx);
431 }
432
433 static void
434 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
435 {
436         uint16_t acqs, asqs;
437
438         DPRINTF(("%s\r\n", __func__));
439
440         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
441         sc->submit_queues[0].size = asqs;
442         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
443                     sizeof(struct nvme_command) * asqs);
444
445         DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
446                 __func__, sc->regs.asq, sc->submit_queues[0].qbase));
447
448         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
449             NVME_AQA_REG_ACQS_MASK) + 1;
450         sc->compl_queues[0].size = acqs;
451         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
452                  sizeof(struct nvme_completion) * acqs);
453         DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
454                 __func__, sc->regs.acq, sc->compl_queues[0].qbase));
455 }
456
457 static int
458 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
459         struct nvme_completion* compl)
460 {
461         uint16_t qid = command->cdw10 & 0xffff;
462
463         DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
464         if (qid == 0 || qid > sc->num_squeues) {
465                 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
466                         __func__, qid, sc->num_squeues));
467                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
468                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
469                 return (1);
470         }
471
472         sc->submit_queues[qid].qbase = NULL;
473         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
474         return (1);
475 }
476
477 static int
478 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
479         struct nvme_completion* compl)
480 {
481         if (command->cdw11 & NVME_CMD_CDW11_PC) {
482                 uint16_t qid = command->cdw10 & 0xffff;
483                 struct nvme_submission_queue *nsq;
484
485                 if ((qid == 0) || (qid > sc->num_squeues)) {
486                         WPRINTF(("%s queue index %u > num_squeues %u\r\n",
487                                 __func__, qid, sc->num_squeues));
488                         pci_nvme_status_tc(&compl->status,
489                             NVME_SCT_COMMAND_SPECIFIC,
490                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
491                         return (1);
492                 }
493
494                 nsq = &sc->submit_queues[qid];
495                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
496
497                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
498                               sizeof(struct nvme_command) * (size_t)nsq->size);
499                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
500                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
501
502                 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
503                         qid, nsq->size, nsq->qbase, nsq->cqid));
504
505                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
506
507                 DPRINTF(("%s completed creating IOSQ qid %u\r\n",
508                          __func__, qid));
509         } else {
 510                 /*
 511                  * Guest requested a non-contiguous submission queue.
 512                  * This emulation only supports contiguous queues.
 513                  */
514                 WPRINTF(("%s unsupported non-contig (list-based) "
515                          "create i/o submission queue\r\n", __func__));
516
517                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
518         }
519         return (1);
520 }
521
522 static int
523 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
524         struct nvme_completion* compl)
525 {
526         uint16_t qid = command->cdw10 & 0xffff;
527
528         DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
529         if (qid == 0 || qid > sc->num_cqueues) {
530                 WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
531                         __func__, qid, sc->num_cqueues));
532                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
533                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
534                 return (1);
535         }
536
537         sc->compl_queues[qid].qbase = NULL;
538         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
539         return (1);
540 }
541
542 static int
543 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
544         struct nvme_completion* compl)
545 {
546         if (command->cdw11 & NVME_CMD_CDW11_PC) {
547                 uint16_t qid = command->cdw10 & 0xffff;
548                 struct nvme_completion_queue *ncq;
549
550                 if ((qid == 0) || (qid > sc->num_cqueues)) {
551                         WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
552                                 __func__, qid, sc->num_cqueues));
553                         pci_nvme_status_tc(&compl->status,
554                             NVME_SCT_COMMAND_SPECIFIC,
555                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
556                         return (1);
557                 }
558
559                 ncq = &sc->compl_queues[qid];
560                 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
561                 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
562                 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
563
564                 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
565                              command->prp1,
 566                              sizeof(struct nvme_completion) * (size_t)ncq->size);
567
568                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
569         } else {
570                 /* 
571                  * Non-contig completion queue unsupported.
572                  */
573                 WPRINTF(("%s unsupported non-contig (list-based) "
574                          "create i/o completion queue\r\n",
575                          __func__));
576
577                 /* 0x12 = Invalid Use of Controller Memory Buffer */
578                 pci_nvme_status_genc(&compl->status, 0x12);
579         }
580
581         return (1);
582 }
583
584 static int
585 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
586         struct nvme_completion* compl)
587 {
588         uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
589         uint8_t logpage = command->cdw10 & 0xFF;
590         void *data;
591
592         DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
593
594         if (logpage >= 1 && logpage <= 3)
595                 data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
596                                   PAGE_SIZE);
597
598         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
599
600         switch (logpage) {
601         case 0x01: /* Error information */
602                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
603                 break;
604         case 0x02: /* SMART/Health information */
605                 /* TODO: present some smart info */
606                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
607                 break;
608         case 0x03: /* Firmware slot information */
609                 memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
610                 break;
611         default:
612                 WPRINTF(("%s get log page %x command not supported\r\n",
613                         __func__, logpage));
614
615                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
616                     NVME_SC_INVALID_LOG_PAGE);
617         }
618
619         return (1);
620 }
621
622 static int
623 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
624         struct nvme_completion* compl)
625 {
626         void *dest;
627
628         DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
629                 command->cdw10 & 0xFF, command->nsid));
630
631         switch (command->cdw10 & 0xFF) {
632         case 0x00: /* return Identify Namespace data structure */
633                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
634                                   sizeof(sc->nsdata));
635                 memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
636                 break;
637         case 0x01: /* return Identify Controller data structure */
638                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
639                                   sizeof(sc->ctrldata));
640                 memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
641                 break;
642         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
643                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
644                                   sizeof(uint32_t) * 1024);
645                 ((uint32_t *)dest)[0] = 1;
646                 ((uint32_t *)dest)[1] = 0;
647                 break;
648         case 0x11:
649                 pci_nvme_status_genc(&compl->status,
650                     NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
651                 return (1);
652         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
653         case 0x10:
654         case 0x12:
655         case 0x13:
656         case 0x14:
657         case 0x15:
658         default:
659                 DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
660                          __func__, command->cdw10 & 0xFF));
661                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
662                 return (1);
663         }
664
665         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
666         return (1);
667 }
668
669 static int
670 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
671         struct nvme_completion* compl)
672 {
673         uint16_t nqr;   /* Number of Queues Requested */
674
675         nqr = command->cdw11 & 0xFFFF;
676         if (nqr == 0xffff) {
677                 WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
678                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
679                 return (-1);
680         }
681
682         sc->num_squeues = ONE_BASED(nqr);
683         if (sc->num_squeues > sc->max_queues) {
684                 DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
685                                         sc->max_queues));
686                 sc->num_squeues = sc->max_queues;
687         }
688
689         nqr = (command->cdw11 >> 16) & 0xFFFF;
690         if (nqr == 0xffff) {
691                 WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
692                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
693                 return (-1);
694         }
695
696         sc->num_cqueues = ONE_BASED(nqr);
697         if (sc->num_cqueues > sc->max_queues) {
698                 DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
699                                         sc->max_queues));
700                 sc->num_cqueues = sc->max_queues;
701         }
702
703         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
704
705         return (0);
706 }
707
708 static int
709 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
710         struct nvme_completion* compl)
711 {
712         int feature = command->cdw10 & 0xFF;
713         uint32_t iv;
714
715         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
716         compl->cdw0 = 0;
717
718         switch (feature) {
719         case NVME_FEAT_ARBITRATION:
720                 DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
721                 break;
722         case NVME_FEAT_POWER_MANAGEMENT:
723                 DPRINTF(("  power management 0x%x\r\n", command->cdw11));
724                 break;
725         case NVME_FEAT_LBA_RANGE_TYPE:
726                 DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
727                 break;
728         case NVME_FEAT_TEMPERATURE_THRESHOLD:
729                 DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
730                 break;
731         case NVME_FEAT_ERROR_RECOVERY:
732                 DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
733                 break;
734         case NVME_FEAT_VOLATILE_WRITE_CACHE:
735                 DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
736                 break;
737         case NVME_FEAT_NUMBER_OF_QUEUES:
738                 nvme_set_feature_queues(sc, command, compl);
739                 break;
740         case NVME_FEAT_INTERRUPT_COALESCING:
741                 DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
742
743                 /* in uS */
744                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
745
746                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
747                 break;
748         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
749                 iv = command->cdw11 & 0xFFFF;
750
751                 DPRINTF(("  interrupt vector configuration 0x%x\r\n",
752                         command->cdw11));
753
754                 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
755                         if (sc->compl_queues[i].intr_vec == iv) {
756                                 if (command->cdw11 & (1 << 16))
757                                         sc->compl_queues[i].intr_en |=
758                                                               NVME_CQ_INTCOAL;  
759                                 else
760                                         sc->compl_queues[i].intr_en &=
761                                                              ~NVME_CQ_INTCOAL;  
762                         }
763                 }
764                 break;
765         case NVME_FEAT_WRITE_ATOMICITY:
766                 DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
767                 break;
768         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
769                 DPRINTF(("  async event configuration 0x%x\r\n",
770                         command->cdw11));
771                 sc->async_ev_config = command->cdw11;
772                 break;
773         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
774                 DPRINTF(("  software progress marker 0x%x\r\n",
775                         command->cdw11));
776                 break;
777         case 0x0C:
778                 DPRINTF(("  autonomous power state transition 0x%x\r\n",
779                         command->cdw11));
780                 break;
781         default:
782                 WPRINTF(("%s invalid feature\r\n", __func__));
783                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
784                 return (1);
785         }
786
787         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
788         return (1);
789 }
790
791 static int
792 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
793         struct nvme_completion* compl)
794 {
795         int feature = command->cdw10 & 0xFF;
796
797         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
798
799         compl->cdw0 = 0;
800
801         switch (feature) {
802         case NVME_FEAT_ARBITRATION:
803                 DPRINTF(("  arbitration\r\n"));
804                 break;
805         case NVME_FEAT_POWER_MANAGEMENT:
806                 DPRINTF(("  power management\r\n"));
807                 break;
808         case NVME_FEAT_LBA_RANGE_TYPE:
809                 DPRINTF(("  lba range\r\n"));
810                 break;
811         case NVME_FEAT_TEMPERATURE_THRESHOLD:
812                 DPRINTF(("  temperature threshold\r\n"));
813                 switch ((command->cdw11 >> 20) & 0x3) {
814                 case 0:
815                         /* Over temp threshold */
816                         compl->cdw0 = 0xFFFF;
817                         break;
818                 case 1:
819                         /* Under temp threshold */
820                         compl->cdw0 = 0;
821                         break;
822                 default:
823                         WPRINTF(("  invalid threshold type select\r\n"));
824                         pci_nvme_status_genc(&compl->status,
825                             NVME_SC_INVALID_FIELD);
826                         return (1);
827                 }
828                 break;
829         case NVME_FEAT_ERROR_RECOVERY:
830                 DPRINTF(("  error recovery\r\n"));
831                 break;
832         case NVME_FEAT_VOLATILE_WRITE_CACHE:
833                 DPRINTF(("  volatile write cache\r\n"));
834                 break;
835         case NVME_FEAT_NUMBER_OF_QUEUES:
836                 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
837
838                 DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
839                         compl->cdw0 & 0xFFFF,
840                         (compl->cdw0 >> 16) & 0xFFFF));
841
842                 break;
843         case NVME_FEAT_INTERRUPT_COALESCING:
844                 DPRINTF(("  interrupt coalescing\r\n"));
845                 break;
846         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
847                 DPRINTF(("  interrupt vector configuration\r\n"));
848                 break;
849         case NVME_FEAT_WRITE_ATOMICITY:
850                 DPRINTF(("  write atomicity\r\n"));
851                 break;
852         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
853                 DPRINTF(("  async event configuration\r\n"));
854                 sc->async_ev_config = command->cdw11;
855                 break;
856         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
857                 DPRINTF(("  software progress marker\r\n"));
858                 break;
859         case 0x0C:
860                 DPRINTF(("  autonomous power state transition\r\n"));
861                 break;
862         default:
863                 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
864                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
865                 return (1);
866         }
867
868         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
869         return (1);
870 }
871
872 static int
873 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
874         struct nvme_completion* compl)
875 {
876         DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
877                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
878
879         /* TODO: search for the command ID and abort it */
880
881         compl->cdw0 = 1;
882         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
883         return (1);
884 }
885
886 static int
887 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
888         struct nvme_command* command, struct nvme_completion* compl)
889 {
890         DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
891
892         /*
893          * TODO: raise events when they happen based on the Set Features cmd.
894          * These events happen async, so only set completion successful if
895          * there is an event reflective of the request to get event.
896          */
897         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
898             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
899         return (0);
900 }
901
902 static void
903 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
904 {
905         struct nvme_completion compl;
906         struct nvme_command *cmd;
907         struct nvme_submission_queue *sq;
908         struct nvme_completion_queue *cq;
909         int do_intr = 0;
910         uint16_t sqhead;
911
912         DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
913
914         sq = &sc->submit_queues[0];
915
916         sqhead = atomic_load_acq_short(&sq->head);
917
918         if (atomic_testandset_int(&sq->busy, 1)) {
919                 DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
920                         __func__, sqhead, sq->tail));
921                 return;
922         }
923
924         DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
925         
926         while (sqhead != atomic_load_acq_short(&sq->tail)) {
927                 cmd = &(sq->qbase)[sqhead];
928                 compl.status = 0;
929
930                 switch (cmd->opc) {
931                 case NVME_OPC_DELETE_IO_SQ:
932                         DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
933                         do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
934                         break;
935                 case NVME_OPC_CREATE_IO_SQ:
936                         DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
937                         do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
938                         break;
939                 case NVME_OPC_DELETE_IO_CQ:
940                         DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
941                         do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
942                         break;
943                 case NVME_OPC_CREATE_IO_CQ:
944                         DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
945                         do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
946                         break;
947                 case NVME_OPC_GET_LOG_PAGE:
948                         DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
949                         do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
950                         break;
951                 case NVME_OPC_IDENTIFY:
952                         DPRINTF(("%s command IDENTIFY\r\n", __func__));
953                         do_intr |= nvme_opc_identify(sc, cmd, &compl);
954                         break;
955                 case NVME_OPC_ABORT:
956                         DPRINTF(("%s command ABORT\r\n", __func__));
957                         do_intr |= nvme_opc_abort(sc, cmd, &compl);
958                         break;
959                 case NVME_OPC_SET_FEATURES:
960                         DPRINTF(("%s command SET_FEATURES\r\n", __func__));
961                         do_intr |= nvme_opc_set_features(sc, cmd, &compl);
962                         break;
963                 case NVME_OPC_GET_FEATURES:
964                         DPRINTF(("%s command GET_FEATURES\r\n", __func__));
965                         do_intr |= nvme_opc_get_features(sc, cmd, &compl);
966                         break;
967                 case NVME_OPC_ASYNC_EVENT_REQUEST:
968                         DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
 969                         /* XXX don't care, unhandled for now
970                         do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
971                         */
972                         break;
973                 default:
974                         WPRINTF(("0x%x command is not implemented\r\n",
975                             cmd->opc));
976                 }
977         
978                 /* for now skip async event generation */
979                 if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
980                         struct nvme_completion *cp;
981                         int phase;
982
983                         cq = &sc->compl_queues[0];
984
985                         cp = &(cq->qbase)[cq->tail];
986                         cp->cdw0 = compl.cdw0;
987                         cp->sqid = 0;
988                         cp->sqhd = sqhead;
989                         cp->cid = cmd->cid;
990
991                         phase = NVME_STATUS_GET_P(cp->status);
992                         cp->status = compl.status;
993                         pci_nvme_toggle_phase(&cp->status, phase);
994
995                         cq->tail = (cq->tail + 1) % cq->size;
996                 }
997                 sqhead = (sqhead + 1) % sq->size;
998         }
999
1000         DPRINTF(("setting sqhead %u\r\n", sqhead));
1001         atomic_store_short(&sq->head, sqhead);
1002         atomic_store_int(&sq->busy, 0);
1003
1004         if (do_intr)
1005                 pci_generate_msix(sc->nsc_pi, 0);
1006
1007 }
1008
1009 static int
1010 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1011         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1012 {
1013         int iovidx;
1014
1015         if (req != NULL) {
1016                 /* concatenate contig block-iovs to minimize number of iovs */
1017                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1018                         iovidx = req->io_req.br_iovcnt - 1;
1019
1020                         req->io_req.br_iov[iovidx].iov_base =
1021                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1022                                              req->prev_gpaddr, size);
1023
1024                         req->prev_size += size;
1025                         req->io_req.br_resid += size;
1026
1027                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1028                 } else {
1029                         pthread_mutex_lock(&req->mtx);
1030
1031                         iovidx = req->io_req.br_iovcnt;
1032                         if (iovidx == NVME_MAX_BLOCKIOVS) {
1033                                 int err = 0;
1034
1035                                 DPRINTF(("large I/O, doing partial req\r\n"));
1036
1037                                 iovidx = 0;
1038                                 req->io_req.br_iovcnt = 0;
1039
1040                                 req->io_req.br_callback = pci_nvme_io_partial;
1041
1042                                 if (!do_write)
1043                                         err = blockif_read(sc->nvstore.ctx,
1044                                                            &req->io_req);
1045                                 else
1046                                         err = blockif_write(sc->nvstore.ctx,
1047                                                             &req->io_req);
1048
1049                                 /* wait until req completes before cont */
1050                                 if (err == 0)
1051                                         pthread_cond_wait(&req->cv, &req->mtx);
1052                         }
1053                         if (iovidx == 0) {
1054                                 req->io_req.br_offset = lba;
1055                                 req->io_req.br_resid = 0;
1056                                 req->io_req.br_param = req;
1057                         }
1058
1059                         req->io_req.br_iov[iovidx].iov_base =
1060                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1061                                              gpaddr, size);
1062
1063                         req->io_req.br_iov[iovidx].iov_len = size;
1064
1065                         req->prev_gpaddr = gpaddr;
1066                         req->prev_size = size;
1067                         req->io_req.br_resid += size;
1068
1069                         req->io_req.br_iovcnt++;
1070
1071                         pthread_mutex_unlock(&req->mtx);
1072                 }
1073         } else {
1074                 /* RAM buffer: read/write directly */
1075                 void *p = sc->nvstore.ctx;
1076                 void *gptr;
1077
1078                 if ((lba + size) > sc->nvstore.size) {
 1079                         WPRINTF(("%s request extends past end of RAM backing\r\n", __func__));
1080                         return (-1);
1081                 }
1082
1083                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1084                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1085                 if (do_write) 
1086                         memcpy(p, gptr, size);
1087                 else
1088                         memcpy(gptr, p, size);
1089         }
1090         return (0);
1091 }
1092
1093 static void
1094 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1095         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1096         uint32_t cdw0, uint16_t status, int ignore_busy)
1097 {
1098         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1099         struct nvme_completion *compl;
1100         int do_intr = 0;
1101         int phase;
1102
1103         DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1104                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1105                  NVME_STATUS_GET_SC(status)));
1106
1107         pthread_mutex_lock(&cq->mtx);
1108
1109         assert(cq->qbase != NULL);
1110
1111         compl = &cq->qbase[cq->tail];
1112
1113         compl->sqhd = atomic_load_acq_short(&sq->head);
1114         compl->sqid = sqid;
1115         compl->cid = cid;
1116
 1117         /* toggle phase */
1118         phase = NVME_STATUS_GET_P(compl->status);
1119         compl->status = status;
1120         pci_nvme_toggle_phase(&compl->status, phase);
1121
1122         cq->tail = (cq->tail + 1) % cq->size;
1123
1124         if (cq->intr_en & NVME_CQ_INTEN)
1125                 do_intr = 1;
1126
1127         pthread_mutex_unlock(&cq->mtx);
1128
1129         if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1130                 if (do_intr)
1131                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1132 }
1133
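/*
 * ioreq slot management: pci_nvme_get_ioreq() takes a free request off the
 * ioreqs_free list, blocking on iosemlock until one of the "ioslots" entries
 * is available; pci_nvme_release_ioreq() returns it and posts the semaphore.
 */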
1134 static void
1135 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1136 {
1137         req->sc = NULL;
1138         req->nvme_sq = NULL;
1139         req->sqid = 0;
1140
1141         pthread_mutex_lock(&sc->mtx);
1142
1143         req->next = sc->ioreqs_free;
1144         sc->ioreqs_free = req;
1145         sc->pending_ios--;
1146
1147         /* when no more IO pending, can set to ready if device reset/enabled */
1148         if (sc->pending_ios == 0 &&
1149             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1150                 sc->regs.csts |= NVME_CSTS_RDY;
1151
1152         pthread_mutex_unlock(&sc->mtx);
1153
1154         sem_post(&sc->iosemlock);
1155 }
1156
1157 static struct pci_nvme_ioreq *
1158 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1159 {
 1160         struct pci_nvme_ioreq *req = NULL;
1161
1162         sem_wait(&sc->iosemlock);
1163         pthread_mutex_lock(&sc->mtx);
1164
1165         req = sc->ioreqs_free;
1166         assert(req != NULL);
1167
1168         sc->ioreqs_free = req->next;
1169
1170         req->next = NULL;
1171         req->sc = sc;
1172
1173         sc->pending_ios++;
1174
1175         pthread_mutex_unlock(&sc->mtx);
1176
1177         req->io_req.br_iovcnt = 0;
1178         req->io_req.br_offset = 0;
1179         req->io_req.br_resid = 0;
1180         req->io_req.br_param = req;
1181         req->prev_gpaddr = 0;
1182         req->prev_size = 0;
1183
1184         return req;
1185 }
1186
1187 static void
1188 pci_nvme_io_done(struct blockif_req *br, int err)
1189 {
1190         struct pci_nvme_ioreq *req = br->br_param;
1191         struct nvme_submission_queue *sq = req->nvme_sq;
1192         uint16_t code, status;
1193
1194         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1195         
1196         /* TODO return correct error */
1197         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1198         pci_nvme_status_genc(&status, code);
1199
1200         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1201         pci_nvme_release_ioreq(req->sc, req);
1202 }
1203
1204 static void
1205 pci_nvme_io_partial(struct blockif_req *br, int err)
1206 {
1207         struct pci_nvme_ioreq *req = br->br_param;
1208
1209         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1210
1211         pthread_cond_signal(&req->cv);
1212 }
1213
1214
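/*
 * Summary of the PRP handling below: PRP1 points at the first data page and
 * may start mid-page, so the first segment covers up to the end of that page.
 * If the remainder fits in one more page, PRP2 is the address of that second
 * page; otherwise PRP2 points at a PRP list of further page addresses, whose
 * last entry may chain to another PRP list for very large transfers.
 */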
1215 static void
1216 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1217 {
1218         struct nvme_submission_queue *sq;
1219         uint16_t status;
1220         uint16_t sqhead;
1221         int err;
1222
1223         /* handle all submissions up to sq->tail index */
1224         sq = &sc->submit_queues[idx];
1225
1226         if (atomic_testandset_int(&sq->busy, 1)) {
1227                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1228                 return;
1229         }
1230
1231         sqhead = atomic_load_acq_short(&sq->head);
1232
1233         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1234                  idx, sqhead, sq->tail, sq->qbase));
1235
1236         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1237                 struct nvme_command *cmd;
1238                 struct pci_nvme_ioreq *req = NULL;
1239                 uint64_t lba;
1240                 uint64_t nblocks, bytes, size, cpsz;
1241
1242                 /* TODO: support scatter gather list handling */
1243
1244                 cmd = &sq->qbase[sqhead];
1245                 sqhead = (sqhead + 1) % sq->size;
1246
1247                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1248
1249                 if (cmd->opc == NVME_OPC_FLUSH) {
1250                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1251                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1252                                                 status, 1);
1253
1254                         continue;
1255                 } else if (cmd->opc == 0x08) {
1256                         /* TODO: write zeroes */
1257                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1258                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1259                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1260                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1261                                                 status, 1);
1262
1263                         continue;
1264                 }
1265
1266                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1267
1268                 bytes = nblocks * sc->nvstore.sectsz;
1269
1270                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1271                         req = pci_nvme_get_ioreq(sc);
1272                         req->nvme_sq = sq;
1273                         req->sqid = idx;
1274                 }
1275
1276                 /*
1277                  * If data starts mid-page and flows into the next page, then
1278                  * increase page count
1279                  */
1280
1281                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1282                          "(%lu-bytes)\r\n",
1283                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1284                          cmd->opc == NVME_OPC_WRITE ?
1285                              "WRITE" : "READ",
1286                          lba, nblocks, bytes));
1287
1288                 cmd->prp1 &= ~(0x03UL);
1289                 cmd->prp2 &= ~(0x03UL);
1290
1291                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1292
1293                 size = bytes;
1294                 lba *= sc->nvstore.sectsz;
1295
1296                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1297
1298                 if (cpsz > bytes)
1299                         cpsz = bytes;
1300
1301                 if (req != NULL) {
1302                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1303                                                 cmd->cdw10;
1304                         req->opc = cmd->opc;
1305                         req->cid = cmd->cid;
1306                         req->nsid = cmd->nsid;
1307                 }
1308
1309                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1310                     cmd->opc == NVME_OPC_WRITE, lba);
1311                 lba += cpsz;
1312                 size -= cpsz;
1313
1314                 if (size == 0)
1315                         goto iodone;
1316
1317                 if (size <= PAGE_SIZE) {
1318                         /* prp2 is second (and final) page in transfer */
1319
1320                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1321                             size,
1322                             cmd->opc == NVME_OPC_WRITE,
1323                             lba);
1324                 } else {
1325                         uint64_t *prp_list;
1326                         int i;
1327
1328                         /* prp2 is pointer to a physical region page list */
1329                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1330                                                     cmd->prp2, PAGE_SIZE);
1331
1332                         i = 0;
1333                         while (size != 0) {
1334                                 cpsz = MIN(size, PAGE_SIZE);
1335
1336                                 /*
1337                                  * Move to linked physical region page list
1338                                  * in last item.
1339                                  */ 
1340                                 if (i == (NVME_PRP2_ITEMS-1) &&
1341                                     size > PAGE_SIZE) {
1342                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1343                                         prp_list = paddr_guest2host(
1344                                                       sc->nsc_pi->pi_vmctx,
1345                                                       prp_list[i], PAGE_SIZE);
1346                                         i = 0;
1347                                 }
1348                                 if (prp_list[i] == 0) {
1349                                         WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1350                                         err = 1;
1351                                         break;
1352                                 }
1353
1354                                 err = pci_nvme_append_iov_req(sc, req,
1355                                     prp_list[i], cpsz,
1356                                     cmd->opc == NVME_OPC_WRITE, lba);
1357                                 if (err)
1358                                         break;
1359
1360                                 lba += cpsz;
1361                                 size -= cpsz;
1362                                 i++;
1363                         }
1364                 }
1365
1366 iodone:
1367                 if (sc->nvstore.type == NVME_STOR_RAM) {
1368                         uint16_t code, status;
1369
1370                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1371                             NVME_SC_SUCCESS;
1372                         pci_nvme_status_genc(&status, code);
1373
1374                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1375                                                 status, 1);
1376
1377                         continue;
1378                 }
1379
1380
1381                 if (err)
1382                         goto do_error;
1383
1384                 req->io_req.br_callback = pci_nvme_io_done;
1385
1386                 err = 0;
1387                 switch (cmd->opc) {
1388                 case NVME_OPC_READ:
1389                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1390                         break;
1391                 case NVME_OPC_WRITE:
1392                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1393                         break;
1394                 default:
1395                         WPRINTF(("%s unhandled io command 0x%x\r\n",
1396                                  __func__, cmd->opc));
1397                         err = 1;
1398                 }
1399
1400 do_error:
1401                 if (err) {
1402                         uint16_t status;
1403
1404                         pci_nvme_status_genc(&status,
1405                             NVME_SC_DATA_TRANSFER_ERROR);
1406
1407                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1408                                                 status, 1);
1409                         pci_nvme_release_ioreq(sc, req);
1410                 }
1411         }
1412
1413         atomic_store_short(&sq->head, sqhead);
1414         atomic_store_int(&sq->busy, 0);
1415 }
1416
1417 static void
1418 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1419         uint64_t idx, int is_sq, uint64_t value)
1420 {
1421         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1422                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1423
1424         if (is_sq) {
1425                 atomic_store_short(&sc->submit_queues[idx].tail,
1426                                    (uint16_t)value);
1427
1428                 if (idx == 0) {
1429                         pci_nvme_handle_admin_cmd(sc, value);
1430                 } else {
1431                         /* submission queue; handle new entries in SQ */
1432                         if (idx > sc->num_squeues) {
1433                                 WPRINTF(("%s SQ index %lu overflow from "
1434                                          "guest (max %u)\r\n",
1435                                          __func__, idx, sc->num_squeues));
1436                                 return;
1437                         }
1438                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1439                 }
1440         } else {
1441                 if (idx > sc->num_cqueues) {
1442                         WPRINTF(("%s CQ index %lu overflow from "
1443                                  "guest (max %u)\r\n",
1444                                  __func__, idx, sc->num_cqueues));
1445                         return;
1446                 }
1447
1448                 sc->compl_queues[idx].head = (uint16_t)value;
1449         }
1450 }
1451
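/*
 * Debug helper: log which BAR 0 controller register an access targets.
 * Output is only produced when nvme_debug is enabled.
 */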
1452 static void
1453 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1454 {
1455         const char *s = iswrite ? "WRITE" : "READ";
1456
1457         switch (offset) {
1458         case NVME_CR_CAP_LOW:
1459                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1460                 break;
1461         case NVME_CR_CAP_HI:
1462                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1463                 break;
1464         case NVME_CR_VS:
1465                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1466                 break;
1467         case NVME_CR_INTMS:
1468                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1469                 break;
1470         case NVME_CR_INTMC:
1471                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1472                 break;
1473         case NVME_CR_CC:
1474                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1475                 break;
1476         case NVME_CR_CSTS:
1477                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1478                 break;
1479         case NVME_CR_NSSR:
1480                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1481                 break;
1482         case NVME_CR_AQA:
1483                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1484                 break;
1485         case NVME_CR_ASQ_LOW:
1486                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1487                 break;
1488         case NVME_CR_ASQ_HI:
1489                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1490                 break;
1491         case NVME_CR_ACQ_LOW:
1492                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1493                 break;
1494         case NVME_CR_ACQ_HI:
1495                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1496                 break;
1497         default:
1498                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1499         }
1500
1501 }
1502
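/*
 * Handle a guest write to BAR 0.  Offsets at or above NVME_DOORBELL_OFFSET
 * are doorbell rings; lower offsets are controller registers, of which only
 * the writable ones (CC, AQA, ASQ, ACQ) are acted upon.  Writes to CC may
 * enable, reset, or shut down the controller.
 */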
1503 static void
1504 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1505         uint64_t offset, int size, uint64_t value)
1506 {
1507         uint32_t ccreg;
1508
1509         if (offset >= NVME_DOORBELL_OFFSET) {
1510                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1511                 uint64_t idx = belloffset / 8; /* doorbell pair (SQ tail + CQ head) is 8 bytes */
1512                 int is_sq = (belloffset % 8) < 4;
1513
1514                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1515                         WPRINTF(("guest attempted an overflow write offset "
1516                                  "0x%lx, val 0x%lx in %s\r\n",
1517                                  offset, value, __func__));
1518                         return;
1519                 }
1520
1521                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1522                 return;
1523         }
1524
1525         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1526                 offset, size, value));
1527
1528         if (size != 4) {
1529                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1530                          "val 0x%lx) to bar0 in %s\r\n",
1531                          size, offset, value, __func__));
1532                 /* TODO: shutdown device */
1533                 return;
1534         }
1535
1536         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1537
1538         pthread_mutex_lock(&sc->mtx);
1539
1540         switch (offset) {
1541         case NVME_CR_CAP_LOW:
1542         case NVME_CR_CAP_HI:
1543                 /* readonly */
1544                 break;
1545         case NVME_CR_VS:
1546                 /* readonly */
1547                 break;
1548         case NVME_CR_INTMS:
1549                 /* MSI-X, so ignore */
1550                 break;
1551         case NVME_CR_INTMC:
1552                 /* MSI-X, so ignore */
1553                 break;
1554         case NVME_CR_CC:
1555                 ccreg = (uint32_t)value;
1556
1557                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1558                          "iocqes %u\r\n",
1559                         __func__,
1560                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1561                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1562                          NVME_CC_GET_IOCQES(ccreg)));
1563
1564                 if (NVME_CC_GET_SHN(ccreg)) {
1565                         /* perform shutdown - flush out data to backend */
1566                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1567                             NVME_CSTS_REG_SHST_SHIFT);
1568                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1569                             NVME_CSTS_REG_SHST_SHIFT;
1570                 }
1571                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1572                         if (NVME_CC_GET_EN(ccreg) == 0)
1573                                 /* transition 1->0 causes controller reset */
1574                                 pci_nvme_reset_locked(sc);
1575                         else
1576                                 pci_nvme_init_controller(ctx, sc);
1577                 }
1578
1579                 /* Insert the iocqes, iosqes and en bits from the write */
1580                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1581                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1582                 if (NVME_CC_GET_EN(ccreg) == 0) {
1583                         /* Insert the ams, mps and css bit fields */
1584                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1585                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1586                         sc->regs.csts &= ~NVME_CSTS_RDY;
1587                 } else if (sc->pending_ios == 0) {
1588                         sc->regs.csts |= NVME_CSTS_RDY;
1589                 }
1590                 break;
1591         case NVME_CR_CSTS:
1592                 break;
1593         case NVME_CR_NSSR:
1594                 /* ignore writes; don't support subsystem reset */
1595                 break;
1596         case NVME_CR_AQA:
1597                 sc->regs.aqa = (uint32_t)value;
1598                 break;
1599         case NVME_CR_ASQ_LOW:
1600                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1601                                (0xFFFFF000 & value);
1602                 break;
1603         case NVME_CR_ASQ_HI:
1604                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1605                                (value << 32);
1606                 break;
1607         case NVME_CR_ACQ_LOW:
1608                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1609                                (0xFFFFF000 & value);
1610                 break;
1611         case NVME_CR_ACQ_HI:
1612                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1613                                (value << 32);
1614                 break;
1615         default:
1616                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1617                          __func__, offset, value, size));
1618         }
1619         pthread_mutex_unlock(&sc->mtx);
1620 }
1621
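/*
 * BAR write entry point.  MSI-X table/PBA accesses are forwarded to the
 * generic MSI-X emulation; BAR 0 accesses go to pci_nvme_write_bar_0().
 */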
1622 static void
1623 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1624                 int baridx, uint64_t offset, int size, uint64_t value)
1625 {
1626         struct pci_nvme_softc* sc = pi->pi_arg;
1627
1628         if (baridx == pci_msix_table_bar(pi) ||
1629             baridx == pci_msix_pba_bar(pi)) {
1630                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1631                          "value 0x%lx\r\n", baridx, offset, size, value));
1632
1633                 pci_emul_msix_twrite(pi, offset, size, value);
1634                 return;
1635         }
1636
1637         switch (baridx) {
1638         case 0:
1639                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1640                 break;
1641
1642         default:
1643                 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1644                          __func__, baridx, value));
1645         }
1646 }
1647
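/*
 * Handle a guest read from BAR 0 by copying out of the register shadow,
 * masked to the access size.  Reads from the doorbell region return zero.
 */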
1648 static uint64_t
1649 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1650 {
1651         uint64_t value;
1652
1653         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1654
1655         if (offset < NVME_DOORBELL_OFFSET) {
1656                 void *p = &(sc->regs);
1657                 pthread_mutex_lock(&sc->mtx);
1658                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1659                 pthread_mutex_unlock(&sc->mtx);
1660         } else {
1661                 value = 0;
1662                 WPRINTF(("pci_nvme: read invalid offset 0x%lx\r\n", offset));
1663         }
1664
1665         switch (size) {
1666         case 1:
1667                 value &= 0xFF;
1668                 break;
1669         case 2:
1670                 value &= 0xFFFF;
1671                 break;
1672         case 4:
1673                 value &= 0xFFFFFFFF;
1674                 break;
1675         }
1676
1677         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1678                  offset, size, (uint32_t)value));
1679
1680         return (value);
1681 }
1682
1683
1684
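/*
 * BAR read entry point.  MSI-X table/PBA accesses are forwarded to the
 * generic MSI-X emulation; BAR 0 accesses go to pci_nvme_read_bar_0().
 */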
1685 static uint64_t
1686 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1687     uint64_t offset, int size)
1688 {
1689         struct pci_nvme_softc* sc = pi->pi_arg;
1690
1691         if (baridx == pci_msix_table_bar(pi) ||
1692             baridx == pci_msix_pba_bar(pi)) {
1693                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1694                         baridx, offset, size));
1695
1696                 return pci_emul_msix_tread(pi, offset, size);
1697         }
1698
1699         switch (baridx) {
1700         case 0:
1701                 return pci_nvme_read_bar_0(sc, offset, size);
1702
1703         default:
1704                 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1705         }
1706
1707         return (0);
1708 }
1709
1710
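/*
 * Parse the comma-separated option string (see the usage comment at the top
 * of this file): the backing store (device path, image file, or ram=<MiB>)
 * plus the optional maxq, qsz, ioslots, sectsz, and ser values.
 * Returns 0 on success, -1 on error.
 */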
1711 static int
1712 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1713 {
1714         char bident[sizeof("XX:X:X")];
1715         char    *uopt, *xopts, *config;
1716         uint32_t sectsz;
1717         int optidx;
1718
1719         sc->max_queues = NVME_QUEUES;
1720         sc->max_qentries = NVME_MAX_QENTRIES;
1721         sc->ioslots = NVME_IOSLOTS;
1722         sc->num_squeues = sc->max_queues;
1723         sc->num_cqueues = sc->max_queues;
1724         sectsz = 0;
1725
1726         uopt = strdup(opts);
1727         optidx = 0;
1728         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1729                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1730         for (xopts = strtok(uopt, ",");
1731              xopts != NULL;
1732              xopts = strtok(NULL, ",")) {
1733
1734                 if ((config = strchr(xopts, '=')) != NULL)
1735                         *config++ = '\0';
1736
1737                 if (!strcmp("maxq", xopts)) {
1738                         sc->max_queues = atoi(config);
1739                 } else if (!strcmp("qsz", xopts)) {
1740                         sc->max_qentries = atoi(config);
1741                 } else if (!strcmp("ioslots", xopts)) {
1742                         sc->ioslots = atoi(config);
1743                 } else if (!strcmp("sectsz", xopts)) {
1744                         sectsz = atoi(config);
1745                 } else if (!strcmp("ser", xopts)) {
1746                         /*
1747                          * This field indicates the Product Serial Number in
1748                          * 7-bit ASCII; unused bytes should be space characters.
1749                          * Ref: NVMe v1.3c.
1750                          */
1751                         cpywithpad((char *)sc->ctrldata.sn,
1752                                    sizeof(sc->ctrldata.sn), config, ' ');
1753                 } else if (!strcmp("ram", xopts)) {
1754                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
1755
1756                         sc->nvstore.type = NVME_STOR_RAM;
1757                         sc->nvstore.size = sz * 1024 * 1024;
1758                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1759                         sc->nvstore.sectsz = 4096;
1760                         sc->nvstore.sectsz_bits = 12;
1761                         if (sc->nvstore.ctx == NULL) {
1762                                 perror("Unable to allocate RAM");
1763                                 free(uopt);
1764                                 return (-1);
1765                         }
1766                 } else if (optidx == 0) {
1767                         snprintf(bident, sizeof(bident), "%d:%d",
1768                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1769                         sc->nvstore.ctx = blockif_open(xopts, bident);
1770                         if (sc->nvstore.ctx == NULL) {
1771                                 perror("Could not open backing file");
1772                                 free(uopt);
1773                                 return (-1);
1774                         }
1775                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1776                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1777                 } else {
1778                         fprintf(stderr, "Invalid option %s\n", xopts);
1779                         free(uopt);
1780                         return (-1);
1781                 }
1782
1783                 optidx++;
1784         }
1785         free(uopt);
1786
1787         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1788                 fprintf(stderr, "backing store not specified\n");
1789                 return (-1);
1790         }
1791         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1792                 sc->nvstore.sectsz = sectsz;
1793         else if (sc->nvstore.type != NVME_STOR_RAM)
1794                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1795         for (sc->nvstore.sectsz_bits = 9;
1796              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1797              sc->nvstore.sectsz_bits++);
1798
1799         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1800                 sc->max_queues = NVME_QUEUES;
1801
1802         if (sc->max_qentries <= 0) {
1803                 fprintf(stderr, "Invalid qsz option\n");
1804                 return (-1);
1805         }
1806         if (sc->ioslots <= 0) {
1807                 fprintf(stderr, "Invalid ioslots option\n");
1808                 return (-1);
1809         }
1810
1811         return (0);
1812 }
1813
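/*
 * Device initialization: parse options, allocate the I/O request slots,
 * program the PCI config space, size and allocate BAR 0 (register set plus
 * doorbells) and the MSI-X capability, then reset the controller and build
 * the identify controller/namespace data.
 */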
1814 static int
1815 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1816 {
1817         struct pci_nvme_softc *sc;
1818         uint32_t pci_membar_sz;
1819         int     error;
1820
1821         error = 0;
1822
1823         sc = calloc(1, sizeof(struct pci_nvme_softc));
1824         pi->pi_arg = sc;
1825         sc->nsc_pi = pi;
1826
1827         error = pci_nvme_parse_opts(sc, opts);
1828         if (error < 0)
1829                 goto done;
1830         else
1831                 error = 0;
1832
1833         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1834         for (int i = 0; i < sc->ioslots; i++) {
1835                 if (i < (sc->ioslots-1))
1836                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1837                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1838                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1839         }
1840         sc->ioreqs_free = sc->ioreqs;
1841         sc->intr_coales_aggr_thresh = 1;
1842
1843         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1844         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1845         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1846         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1847         pci_set_cfgdata8(pi, PCIR_PROGIF,
1848                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1849
1850         /* BAR 0 size: NVMe register set plus doorbell space for all queues */
1851         pci_membar_sz = sizeof(struct nvme_registers) +
1852                         2*sizeof(uint32_t)*(sc->max_queues + 1);
1853
1854         DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1855
1856         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1857         if (error) {
1858                 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1859                 goto done;
1860         }
1861
1862         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
1863         if (error) {
1864                 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1865                 goto done;
1866         }
1867
1868         pthread_mutex_init(&sc->mtx, NULL);
1869         sem_init(&sc->iosemlock, 0, sc->ioslots);
1870
1871         pci_nvme_reset(sc);
1872         pci_nvme_init_ctrldata(sc);
1873         pci_nvme_init_nsdata(sc);
1874
1875         pci_lintr_request(pi);
1876
1877 done:
1878         return (error);
1879 }
1880
1881
1882 struct pci_devemu pci_de_nvme = {
1883         .pe_emu =       "nvme",
1884         .pe_init =      pci_nvme_init,
1885         .pe_barwrite =  pci_nvme_write,
1886         .pe_barread =   pci_nvme_read
1887 };
1888 PCI_EMUL_SET(pci_de_nvme);